From 286f4b216cebd5445f1fa5749d9dd2dcc7e77a1c Mon Sep 17 00:00:00 2001 From: David Landup Date: Tue, 24 Sep 2024 00:08:38 +0900 Subject: [PATCH 01/68] starter commit - ported time embeddings to keras ops --- keras_hub/src/models/flux/flux_maths.py | 45 +++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 keras_hub/src/models/flux/flux_maths.py diff --git a/keras_hub/src/models/flux/flux_maths.py b/keras_hub/src/models/flux/flux_maths.py new file mode 100644 index 0000000000..e356027ccb --- /dev/null +++ b/keras_hub/src/models/flux/flux_maths.py @@ -0,0 +1,45 @@ +# Copyright 2024 The KerasHub Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from keras import ops + + +def timestep_embedding( + t, dim: int, max_period=10000, time_factor: float = 1000.0 +): + """ + Create sinusoidal timestep embeddings. + :param t: a 1-D Tensor of N indices, one per batch element. + These may be fractional. + :param dim: the dimension of the output. + :param max_period: controls the minimum frequency of the embeddings. + :return: an (N, D) Tensor of positional embeddings. + """ + + t = time_factor * t + half = dim // 2 + freqs = ops.exp( + -math.log(max_period) * ops.arange(0, half, dtype=float) / half + ) + + args = t[:, None] * freqs[None] + embedding = ops.concatenate([ops.cos(args), ops.sin(args)], axis=-1) + + if dim % 2: + embedding = ops.concatenate( + [embedding, ops.zeros_like(embedding[:, :1])], axis=-1 + ) + + return embedding From 244f0135e6ea0be781b2d01723e070f8245a2b8e Mon Sep 17 00:00:00 2001 From: David Landup Date: Tue, 24 Sep 2024 00:18:12 +0900 Subject: [PATCH 02/68] add mlpembedder --- keras_hub/src/models/flux/flux_layers.py | 53 ++++++++++++++++++++++++ keras_hub/src/models/flux/flux_maths.py | 18 +++++--- 2 files changed, 65 insertions(+), 6 deletions(-) create mode 100644 keras_hub/src/models/flux/flux_layers.py diff --git a/keras_hub/src/models/flux/flux_layers.py b/keras_hub/src/models/flux/flux_layers.py new file mode 100644 index 0000000000..478ac719f7 --- /dev/null +++ b/keras_hub/src/models/flux/flux_layers.py @@ -0,0 +1,53 @@ +# Copyright 2024 The KerasHub Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import keras +from keras import layers +from keras.layers import Layer + + +class MLPEmbedder(keras.Model): + """ + A simple multi-layer perceptron (MLP) embedder model. + + This model applies a linear transformation followed by the SiLU activation + function and another linear transformation to the input tensor. 
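For reference, the sinusoidal embedding ported in the first commit above can be reproduced directly in NumPy. An illustrative sketch only (not part of the patch), for a single fractional timestep and a toy dim of 4; the ported function vectorizes this over a batch:

import math

import numpy as np

t, dim, max_period, time_factor = 0.5, 4, 10000, 1000.0
half = dim // 2
# Frequencies decay geometrically from 1.0 down toward 1 / max_period.
freqs = np.exp(-math.log(max_period) * np.arange(half) / half)
args = time_factor * t * freqs
embedding = np.concatenate([np.cos(args), np.sin(args)])  # shape: (dim,)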
+ """ + + def __init__(self, hidden_dim: int): + """ + Initializes the MLPEmbedder. + + Args: + hidden_dim (int): The dimensionality of the hidden layer. + """ + super().__init__() + self.in_layer = layers.Dense(hidden_dim, use_bias=True) + self.silu = layers.Activation("silu") + self.out_layer = layers.Dense(hidden_dim, use_bias=True) + + def call(self, x: keras.Tensor) -> keras.Tensor: + """ + Applies the MLP embedding to the input tensor. + + Args: + x (keras.Tensor): Input tensor of shape (batch_size, in_dim). + + Returns: + keras.Tensor: Output tensor of shape (batch_size, hidden_dim) after applying + the MLP transformations. + """ + x = self.in_layer(x) + x = self.silu(x) + return self.out_layer(x) diff --git a/keras_hub/src/models/flux/flux_maths.py b/keras_hub/src/models/flux/flux_maths.py index e356027ccb..248aa25313 100644 --- a/keras_hub/src/models/flux/flux_maths.py +++ b/keras_hub/src/models/flux/flux_maths.py @@ -20,12 +20,18 @@ def timestep_embedding( t, dim: int, max_period=10000, time_factor: float = 1000.0 ): """ - Create sinusoidal timestep embeddings. - :param t: a 1-D Tensor of N indices, one per batch element. - These may be fractional. - :param dim: the dimension of the output. - :param max_period: controls the minimum frequency of the embeddings. - :return: an (N, D) Tensor of positional embeddings. + Creates sinusoidal timestep embeddings. + + Args: + t (keras.Tensor): A 1-D tensor of shape (N,), representing N indices, one per batch element. + These values may be fractional. + dim (int): The dimension of the output. + max_period (int, optional): Controls the minimum frequency of the embeddings. Defaults to 10000. + time_factor (float, optional): A scaling factor applied to `t`. Defaults to 1000.0. + + Returns: + keras.Tensor: A tensor of shape (N, D) representing the positional embeddings, + where N is the number of batch elements and D is the specified dimension `dim`. """ t = time_factor * t From 480ad24c504b7815cf4301cdea567064c81d19e2 Mon Sep 17 00:00:00 2001 From: David Landup Date: Tue, 24 Sep 2024 00:29:39 +0900 Subject: [PATCH 03/68] add RMS Norm re-implementation --- keras_hub/src/models/flux/flux_layers.py | 39 +++++++++++++++++++++++- keras_hub/src/models/flux/flux_maths.py | 1 + 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/keras_hub/src/models/flux/flux_layers.py b/keras_hub/src/models/flux/flux_layers.py index 478ac719f7..24a8a19b0f 100644 --- a/keras_hub/src/models/flux/flux_layers.py +++ b/keras_hub/src/models/flux/flux_layers.py @@ -14,7 +14,7 @@ import keras from keras import layers -from keras.layers import Layer +from keras import ops class MLPEmbedder(keras.Model): @@ -51,3 +51,40 @@ def call(self, x: keras.Tensor) -> keras.Tensor: x = self.in_layer(x) x = self.silu(x) return self.out_layer(x) + + +# TODO: Maybe this can be exported as part of the public API? Seems to have enough reusability. +class RMSNorm(keras.layers.Layer): + """ + Root Mean Square (RMS) Normalization layer. + + This layer normalizes the input tensor based on its RMS value and applies + a learned scaling factor. + """ + + def __init__(self, dim: int): + """ + Initializes the RMSNorm layer. + + Args: + dim (int): The dimensionality of the input tensor. + """ + super().__init__() + self.scale = self.add_weight( + name="scale", shape=(dim,), initializer="ones" + ) + + def call(self, x: keras.Tensor) -> keras.Tensor: + """ + Applies RMS normalization to the input tensor. + + Args: + x (keras.Tensor): Input tensor of shape (batch_size, dim). 
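A minimal usage sketch for the MLPEmbedder added above (illustrative only, not part of the patch; shapes are arbitrary, and the Dense layers build lazily on first call):

import numpy as np

from keras_hub.src.models.flux.flux_layers import MLPEmbedder

embedder = MLPEmbedder(hidden_dim=128)
vec = embedder(np.zeros((2, 64), dtype="float32"))  # Dense -> SiLU -> Dense
print(vec.shape)  # (2, 128)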
+ + Returns: + keras.Tensor: The RMS-normalized tensor of the same shape (batch_size, dim), + scaled by the learned `scale` parameter. + """ + x = ops.cast(x, float) + rrms = ops.rsqrt(ops.mean(ops.square(x), axis=-1, keepdims=True) + 1e-6) + return (x * rrms) * self.scale diff --git a/keras_hub/src/models/flux/flux_maths.py b/keras_hub/src/models/flux/flux_maths.py index 248aa25313..d8490aab49 100644 --- a/keras_hub/src/models/flux/flux_maths.py +++ b/keras_hub/src/models/flux/flux_maths.py @@ -13,6 +13,7 @@ # limitations under the License. import math + from keras import ops From 2782242278eab6e1c483bc41fcb121007aeb16b5 Mon Sep 17 00:00:00 2001 From: David Landup Date: Tue, 24 Sep 2024 00:32:30 +0900 Subject: [PATCH 04/68] add qknorm reimplementation --- keras_hub/src/models/flux/flux_layers.py | 37 ++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/keras_hub/src/models/flux/flux_layers.py b/keras_hub/src/models/flux/flux_layers.py index 24a8a19b0f..e7a385321b 100644 --- a/keras_hub/src/models/flux/flux_layers.py +++ b/keras_hub/src/models/flux/flux_layers.py @@ -88,3 +88,40 @@ def call(self, x: keras.Tensor) -> keras.Tensor: x = ops.cast(x, float) rrms = ops.rsqrt(ops.mean(ops.square(x), axis=-1, keepdims=True) + 1e-6) return (x * rrms) * self.scale + + +class QKNorm(keras.layers.Layer): + """ + A layer that applies RMS normalization to query and key tensors. + + This layer normalizes the input query and key tensors using separate RMSNorm + layers for each. + """ + + def __init__(self, dim: int): + """ + Initializes the QKNorm layer. + + Args: + dim (int): The dimensionality of the input query and key tensors. + """ + super().__init__() + self.query_norm = RMSNorm(dim) + self.key_norm = RMSNorm(dim) + + def call( + self, q: keras.Tensor, k: keras.Tensor + ) -> tuple[keras.Tensor, keras.Tensor]: + """ + Applies RMS normalization to the query and key tensors. + + Args: + q (keras.Tensor): The query tensor of shape (batch_size, dim). + k (keras.Tensor): The key tensor of shape (batch_size, dim). + + Returns: + tuple[keras.Tensor, keras.Tensor]: A tuple containing the normalized query and key tensors. + """ + q = self.query_norm(q) + k = self.key_norm(k) + return q, k From 48c82e666ecb0d5461ac538621005639616a0a6d Mon Sep 17 00:00:00 2001 From: David Landup Date: Thu, 26 Sep 2024 11:54:23 +0900 Subject: [PATCH 05/68] add rope, scaled dot product attention and self attention --- keras_hub/src/models/flux/flux_maths.py | 139 ++++++++++++++++++++++++ 1 file changed, 139 insertions(+) diff --git a/keras_hub/src/models/flux/flux_maths.py b/keras_hub/src/models/flux/flux_maths.py index d8490aab49..bfe153f462 100644 --- a/keras_hub/src/models/flux/flux_maths.py +++ b/keras_hub/src/models/flux/flux_maths.py @@ -14,6 +14,8 @@ import math +import keras +from einops import rearrange from keras import ops @@ -50,3 +52,140 @@ def timestep_embedding( ) return embedding + + +def rope(pos, dim: int, theta: int): + """ + Applies Rotary Positional Embedding (RoPE) to the input tensor. + + Args: + pos (keras.Tensor): The positional tensor with shape (..., n, d). + dim (int): The embedding dimension, should be even. + theta (int): The base frequency. + + Returns: + keras.Tensor: The tensor with applied RoPE transformation. 
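Stepping back to the RMSNorm reimplementation above: a NumPy sketch of the normalization math alone (illustrative only, with the learned `scale` weight taken as ones):

import numpy as np

x = np.array([[3.0, 4.0]], dtype="float32")
rrms = 1.0 / np.sqrt(np.mean(np.square(x), axis=-1, keepdims=True) + 1e-6)
print(x * rrms)  # ~[[0.8485, 1.1314]]: each row divided by its RMS (~3.5355)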
+ """ + assert dim % 2 == 0 + scale = ops.arange(0, dim, 2, dtype="float64") / dim + omega = 1.0 / (theta**scale) + out = ops.einsum("...n,d->...nd", pos, omega) + out = ops.stack( + [ops.cos(out), -ops.sin(out), ops.sin(out), ops.cos(out)], axis=-1 + ) + out = rearrange(out, "... n d (i j) -> ... n d i j", i=2, j=2) + return ops.cast(out, dtype="float32") + + +def apply_rope(xq, xk, freqs_cis): + """ + Applies the RoPE transformation to the query and key tensors using Keras operations. + + Args: + xq (keras.Tensor): The query tensor of shape (..., L, D). + xk (keras.Tensor): The key tensor of shape (..., L, D). + freqs_cis (keras.Tensor): The frequency complex numbers tensor with shape (..., 2). + + Returns: + tuple[keras.Tensor, keras.Tensor]: The transformed query and key tensors. + """ + xq_ = ops.cast(xq, "float32") + xq_ = ops.reshape(xq_, (*xq_.shape[:-1], -1, 1, 2)) + + xk_ = ops.cast(xk, "float32") + xk_ = ops.reshape(xk_, (*xk_.shape[:-1], -1, 1, 2)) + + xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1] + xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1] + + return (ops.reshape(xq_out, xq.shape), ops.reshape(xk_out, xk.shape)) + + +def attention(q, k, v, pe, dropout_p=0.0, is_causal=False): + """ + Computes the attention mechanism with the RoPE transformation applied to the query and key tensors. + + Args: + q (keras.Tensor): Query tensor of shape (..., L, D). + k (keras.Tensor): Key tensor of shape (..., S, D). + v (keras.Tensor): Value tensor of shape (..., S, D). + pe (keras.Tensor): Positional encoding tensor. + dropout_p (float, optional): Dropout probability. Defaults to 0.0. + is_causal (bool, optional): If True, applies causal masking. Defaults to False. + + Returns: + keras.Tensor: The resulting tensor from the attention mechanism. + """ + # Apply the RoPE transformation + q, k = apply_rope(q, k, pe) + + # Calculate attention using the scaled dot product function + x = scaled_dot_product_attention( + q, k, v, dropout_p=dropout_p, is_causal=is_causal + ) + + # Reshape the output + x = ops.reshape(x, (ops.shape(x)[0], ops.shape(x)[1], -1)) + + return x + + +# TODO: This is probably already implemented in several places, but is needed to ensure numeric equivalence to the original +# implementation. It uses torch.functional.scaled_dot_product_attention() - do we have an equivalent already in Keras? +def scaled_dot_product_attention( + query, + key, + value, + attn_mask=None, + dropout_p=0.0, + is_causal=False, + scale=None, +): + """ + Computes the scaled dot-product attention. + + Args: + query (keras.Tensor): Query tensor of shape (..., L, D). + key (keras.Tensor): Key tensor of shape (..., S, D). + value (keras.Tensor): Value tensor of shape (..., S, D). + attn_mask (keras.Tensor, optional): Attention mask tensor. Defaults to None. + dropout_p (float, optional): Dropout probability. Defaults to 0.0. + is_causal (bool, optional): If True, applies causal masking. Defaults to False. + scale (float, optional): Scale factor for attention. Defaults to None. + + Returns: + keras.Tensor: The output tensor from the attention mechanism. 
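A shape walk-through for `rope` as defined above, an illustrative sketch with toy sizes (`dim` must be even; assumes a backend where the float64/float32 mix inside `rope` promotes cleanly, e.g. JAX with its default downcast):

import numpy as np
from keras import ops

from keras_hub.src.models.flux.flux_maths import rope

pos = ops.convert_to_tensor(np.arange(4, dtype="float32")[None, :])  # (1, 4)
pe = rope(pos, dim=8, theta=10_000)
print(pe.shape)  # (1, 4, 4, 2, 2): one 2x2 rotation matrix per position and frequency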
+ """ + L, S = ops.shape(query)[-2], ops.shape(key)[-2] + scale_factor = ( + 1 / ops.sqrt(ops.cast(ops.shape(query)[-1], "float32")) + if scale is None + else scale + ) + attn_bias = ops.zeros((L, S), dtype=query.dtype) + + if is_causal: + assert attn_mask is None + temp_mask = ops.ones((L, S), dtype=ops.bool) + temp_mask = ops.tril(temp_mask, diagonal=0) + attn_bias = ops.where(temp_mask, attn_bias, float("-inf")) + + if attn_mask is not None: + if ops.shape(attn_mask)[-1] == 1: # If the mask is 3D + attn_bias += attn_mask + else: + attn_bias = ops.where(attn_mask, attn_bias, float("-inf")) + + # Compute attention weights + attn_weight = ( + ops.matmul(query, ops.transpose(key, axes=[0, 1, 3, 2])) * scale_factor + ) + attn_weight += attn_bias + attn_weight = keras.activations.softmax(attn_weight, axis=-1) + + if dropout_p > 0.0: + attn_weight = keras.layers.Dropout(dropout_p)( + attn_weight, training=True + ) + + return ops.matmul(attn_weight, value) From 513e37000fb47db2ddec4853068baeae9143587f Mon Sep 17 00:00:00 2001 From: David Landup Date: Sun, 29 Sep 2024 21:02:30 +0900 Subject: [PATCH 06/68] modulation layer --- keras_hub/src/models/flux/flux_layers.py | 52 +++++++++++++++++++----- 1 file changed, 41 insertions(+), 11 deletions(-) diff --git a/keras_hub/src/models/flux/flux_layers.py b/keras_hub/src/models/flux/flux_layers.py index e7a385321b..c93b26f93c 100644 --- a/keras_hub/src/models/flux/flux_layers.py +++ b/keras_hub/src/models/flux/flux_layers.py @@ -12,7 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +from dataclasses import dataclass + import keras +from keras import KerasTensor from keras import layers from keras import ops @@ -37,15 +40,15 @@ def __init__(self, hidden_dim: int): self.silu = layers.Activation("silu") self.out_layer = layers.Dense(hidden_dim, use_bias=True) - def call(self, x: keras.Tensor) -> keras.Tensor: + def call(self, x: KerasTensor) -> KerasTensor: """ Applies the MLP embedding to the input tensor. Args: - x (keras.Tensor): Input tensor of shape (batch_size, in_dim). + x (KerasTensor): Input tensor of shape (batch_size, in_dim). Returns: - keras.Tensor: Output tensor of shape (batch_size, hidden_dim) after applying + KerasTensor: Output tensor of shape (batch_size, hidden_dim) after applying the MLP transformations. """ x = self.in_layer(x) @@ -74,15 +77,15 @@ def __init__(self, dim: int): name="scale", shape=(dim,), initializer="ones" ) - def call(self, x: keras.Tensor) -> keras.Tensor: + def call(self, x: KerasTensor) -> KerasTensor: """ Applies RMS normalization to the input tensor. Args: - x (keras.Tensor): Input tensor of shape (batch_size, dim). + x (KerasTensor): Input tensor of shape (batch_size, dim). Returns: - keras.Tensor: The RMS-normalized tensor of the same shape (batch_size, dim), + KerasTensor: The RMS-normalized tensor of the same shape (batch_size, dim), scaled by the learned `scale` parameter. """ x = ops.cast(x, float) @@ -110,18 +113,45 @@ def __init__(self, dim: int): self.key_norm = RMSNorm(dim) def call( - self, q: keras.Tensor, k: keras.Tensor - ) -> tuple[keras.Tensor, keras.Tensor]: + self, q: KerasTensor, k: KerasTensor + ) -> tuple[KerasTensor, KerasTensor]: """ Applies RMS normalization to the query and key tensors. Args: - q (keras.Tensor): The query tensor of shape (batch_size, dim). - k (keras.Tensor): The key tensor of shape (batch_size, dim). + q (KerasTensor): The query tensor of shape (batch_size, dim). 
+ k (KerasTensor): The key tensor of shape (batch_size, dim). Returns: - tuple[keras.Tensor, keras.Tensor]: A tuple containing the normalized query and key tensors. + tuple[KerasTensor, KerasTensor]: A tuple containing the normalized query and key tensors. """ q = self.query_norm(q) k = self.key_norm(k) return q, k + + +@dataclass +class ModulationOut: + shift: KerasTensor + scale: KerasTensor + gate: KerasTensor + + +class Modulation(keras.Model): + def __init__(self, dim, double): + super().__init__() + self.is_double = double + self.multiplier = 6 if double else 3 + self.lin = keras.layers.Dense(self.multiplier * dim, use_bias=True) + + def call(self, x): + x = keras.layers.Activation("silu")(x) + out = self.lin(x) + out = keras.ops.split( + out[:, None, :], indices_or_sections=self.multiplier, axis=-1 + ) + + return ( + ModulationOut(*out[:3]), + ModulationOut(*out[3:]) if self.is_double else None, + ) From 8ccbb2664f84a9db917a10c475fd5ade309478c4 Mon Sep 17 00:00:00 2001 From: David Landup Date: Sun, 29 Sep 2024 21:20:43 +0900 Subject: [PATCH 07/68] fix typing --- keras_hub/src/models/flux/flux_layers.py | 119 +++++++++++++++++++++++ keras_hub/src/models/flux/flux_maths.py | 37 +++---- 2 files changed, 138 insertions(+), 18 deletions(-) diff --git a/keras_hub/src/models/flux/flux_layers.py b/keras_hub/src/models/flux/flux_layers.py index c93b26f93c..6fadd33fdf 100644 --- a/keras_hub/src/models/flux/flux_layers.py +++ b/keras_hub/src/models/flux/flux_layers.py @@ -15,10 +15,13 @@ from dataclasses import dataclass import keras +from einops import rearrange from keras import KerasTensor from keras import layers from keras import ops +from keras_hub.src.models.flux.flux_maths import attention + class MLPEmbedder(keras.Model): """ @@ -130,6 +133,27 @@ def call( return q, k +class SelfAttention(keras.Model): + def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + + self.qkv = layers.Dense(dim * 3, use_bias=qkv_bias) + self.norm = QKNorm(head_dim) + self.proj = layers.Dense(dim) + + def call(self, x, pe): + qkv = self.qkv(x) + q, k, v = rearrange( + qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads + ) + q, k = self.norm(q, k, v) + x = attention(q, k, v, pe=pe) + x = self.proj(x) + return x + + @dataclass class ModulationOut: shift: KerasTensor @@ -155,3 +179,98 @@ def call(self, x): ModulationOut(*out[:3]), ModulationOut(*out[3:]) if self.is_double else None, ) + + +class DoubleStreamBlock(keras.Model): + def __init__( + self, + hidden_size: int, + num_heads: int, + mlp_ratio: float, + qkv_bias: bool = False, + ): + super().__init__() + + mlp_hidden_dim = int(hidden_size * mlp_ratio) + self.num_heads = num_heads + self.hidden_size = hidden_size + self.img_mod = Modulation(hidden_size, double=True) + self.img_norm1 = keras.layers.LayerNormalization( + elementwise_affine=False, eps=1e-6 + ) + self.img_attn = SelfAttention( + dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias + ) + + self.img_norm2 = keras.layers.LayerNormalization( + elementwise_affine=False, eps=1e-6 + ) + self.img_mlp = keras.Sequential( + [ + keras.layers.Dense(mlp_hidden_dim, use_bias=True), + keras.layers.Activation("tanh"), + keras.layers.Dense(hidden_size, use_bias=True), + ] + ) + + self.txt_mod = ModulationKeras(hidden_size, double=True) + self.txt_norm1 = keras.layers.LayerNormalization( + elementwise_affine=False, eps=1e-6 + ) + self.txt_attn = SelfAttentionKeras( + dim=hidden_size, num_heads=num_heads, 
qkv_bias=qkv_bias + ) + + self.txt_norm2 = keras.layers.LayerNormalization( + elementwise_affine=False, eps=1e-6 + ) + self.txt_mlp = keras.Sequential( + [ + keras.layers.Dense(mlp_hidden_dim, use_bias=True), + keras.layers.Activation("tanh"), + keras.layers.Dense(hidden_size, use_bias=True), + ] + ) + + def call(self, img, txt, vec, pe): + img_mod1, img_mod2 = self.img_mod(vec) + txt_mod1, txt_mod2 = self.txt_mod(vec) + + # prepare image for attention + img_modulated = self.img_norm1(img) + img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift + img_qkv = self.img_attn.qkv(img_modulated) + img_q, img_k, img_v = rearrange( + img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads + ) + img_q, img_k = self.img_attn.norm(img_q, img_k, img_v) + + # prepare txt for attention + txt_modulated = self.txt_norm1(txt) + txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift + txt_qkv = self.txt_attn.qkv(txt_modulated) + txt_q, txt_k, txt_v = rearrange( + txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads + ) + txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v) + + # run actual attention + q = keras.ops.concatenate((txt_q, img_q), axis=2) + k = keras.ops.concatenate((txt_k, img_k), axis=2) + v = keras.ops.concatenate((txt_v, img_v), axis=2) + + attn = attention(q, k, v, pe=pe) + txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :] + + # calculate the img bloks + img = img + img_mod1.gate * self.img_attn.proj(img_attn) + img = img + img_mod2.gate * self.img_mlp( + (1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift + ) + + # calculate the txt bloks + txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn) + txt = txt + txt_mod2.gate * self.txt_mlp( + (1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift + ) + return img, txt diff --git a/keras_hub/src/models/flux/flux_maths.py b/keras_hub/src/models/flux/flux_maths.py index bfe153f462..6eebcac260 100644 --- a/keras_hub/src/models/flux/flux_maths.py +++ b/keras_hub/src/models/flux/flux_maths.py @@ -16,6 +16,7 @@ import keras from einops import rearrange +from keras import KerasTensor from keras import ops @@ -26,14 +27,14 @@ def timestep_embedding( Creates sinusoidal timestep embeddings. Args: - t (keras.Tensor): A 1-D tensor of shape (N,), representing N indices, one per batch element. + t (KerasTensor): A 1-D tensor of shape (N,), representing N indices, one per batch element. These values may be fractional. dim (int): The dimension of the output. max_period (int, optional): Controls the minimum frequency of the embeddings. Defaults to 10000. time_factor (float, optional): A scaling factor applied to `t`. Defaults to 1000.0. Returns: - keras.Tensor: A tensor of shape (N, D) representing the positional embeddings, + KerasTensor: A tensor of shape (N, D) representing the positional embeddings, where N is the number of batch elements and D is the specified dimension `dim`. """ @@ -59,12 +60,12 @@ def rope(pos, dim: int, theta: int): Applies Rotary Positional Embedding (RoPE) to the input tensor. Args: - pos (keras.Tensor): The positional tensor with shape (..., n, d). + pos (KerasTensor): The positional tensor with shape (..., n, d). dim (int): The embedding dimension, should be even. theta (int): The base frequency. Returns: - keras.Tensor: The tensor with applied RoPE transformation. + KerasTensor: The tensor with applied RoPE transformation. 
""" assert dim % 2 == 0 scale = ops.arange(0, dim, 2, dtype="float64") / dim @@ -82,12 +83,12 @@ def apply_rope(xq, xk, freqs_cis): Applies the RoPE transformation to the query and key tensors using Keras operations. Args: - xq (keras.Tensor): The query tensor of shape (..., L, D). - xk (keras.Tensor): The key tensor of shape (..., L, D). - freqs_cis (keras.Tensor): The frequency complex numbers tensor with shape (..., 2). + xq (KerasTensor): The query tensor of shape (..., L, D). + xk (KerasTensor): The key tensor of shape (..., L, D). + freqs_cis (KerasTensor): The frequency complex numbers tensor with shape (..., 2). Returns: - tuple[keras.Tensor, keras.Tensor]: The transformed query and key tensors. + tuple[KerasTensor, KerasTensor]: The transformed query and key tensors. """ xq_ = ops.cast(xq, "float32") xq_ = ops.reshape(xq_, (*xq_.shape[:-1], -1, 1, 2)) @@ -106,15 +107,15 @@ def attention(q, k, v, pe, dropout_p=0.0, is_causal=False): Computes the attention mechanism with the RoPE transformation applied to the query and key tensors. Args: - q (keras.Tensor): Query tensor of shape (..., L, D). - k (keras.Tensor): Key tensor of shape (..., S, D). - v (keras.Tensor): Value tensor of shape (..., S, D). - pe (keras.Tensor): Positional encoding tensor. + q (KerasTensor): Query tensor of shape (..., L, D). + k (KerasTensor): Key tensor of shape (..., S, D). + v (KerasTensor): Value tensor of shape (..., S, D). + pe (KerasTensor): Positional encoding tensor. dropout_p (float, optional): Dropout probability. Defaults to 0.0. is_causal (bool, optional): If True, applies causal masking. Defaults to False. Returns: - keras.Tensor: The resulting tensor from the attention mechanism. + KerasTensor: The resulting tensor from the attention mechanism. """ # Apply the RoPE transformation q, k = apply_rope(q, k, pe) @@ -145,16 +146,16 @@ def scaled_dot_product_attention( Computes the scaled dot-product attention. Args: - query (keras.Tensor): Query tensor of shape (..., L, D). - key (keras.Tensor): Key tensor of shape (..., S, D). - value (keras.Tensor): Value tensor of shape (..., S, D). - attn_mask (keras.Tensor, optional): Attention mask tensor. Defaults to None. + query (KerasTensor): Query tensor of shape (..., L, D). + key (KerasTensor): Key tensor of shape (..., S, D). + value (KerasTensor): Value tensor of shape (..., S, D). + attn_mask (KerasTensor, optional): Attention mask tensor. Defaults to None. dropout_p (float, optional): Dropout probability. Defaults to 0.0. is_causal (bool, optional): If True, applies causal masking. Defaults to False. scale (float, optional): Scale factor for attention. Defaults to None. Returns: - keras.Tensor: The output tensor from the attention mechanism. + KerasTensor: The output tensor from the attention mechanism. 
""" L, S = ops.shape(query)[-2], ops.shape(key)[-2] scale_factor = ( From c88c949c10919e9b13cfa9019fca1e2be18730b7 Mon Sep 17 00:00:00 2001 From: David Landup Date: Sun, 29 Sep 2024 22:09:29 +0900 Subject: [PATCH 08/68] add double stream block --- keras_hub/src/models/flux/flux_layers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/keras_hub/src/models/flux/flux_layers.py b/keras_hub/src/models/flux/flux_layers.py index 6fadd33fdf..d988c2b241 100644 --- a/keras_hub/src/models/flux/flux_layers.py +++ b/keras_hub/src/models/flux/flux_layers.py @@ -148,7 +148,7 @@ def call(self, x, pe): q, k, v = rearrange( qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads ) - q, k = self.norm(q, k, v) + q, k = self.norm(q, k) x = attention(q, k, v, pe=pe) x = self.proj(x) return x @@ -213,11 +213,11 @@ def __init__( ] ) - self.txt_mod = ModulationKeras(hidden_size, double=True) + self.txt_mod = Modulation(hidden_size, double=True) self.txt_norm1 = keras.layers.LayerNormalization( elementwise_affine=False, eps=1e-6 ) - self.txt_attn = SelfAttentionKeras( + self.txt_attn = SelfAttention( dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias ) From 2bc150e11944f0a5f0930e7e55721d943a485429 Mon Sep 17 00:00:00 2001 From: David Landup Date: Mon, 30 Sep 2024 10:21:56 +0900 Subject: [PATCH 09/68] adjustments to doublestreamblock --- keras_hub/src/models/flux/flux_layers.py | 80 ++++++++---------------- 1 file changed, 27 insertions(+), 53 deletions(-) diff --git a/keras_hub/src/models/flux/flux_layers.py b/keras_hub/src/models/flux/flux_layers.py index d988c2b241..6e98dd2955 100644 --- a/keras_hub/src/models/flux/flux_layers.py +++ b/keras_hub/src/models/flux/flux_layers.py @@ -181,56 +181,36 @@ def call(self, x): ) + class DoubleStreamBlock(keras.Model): - def __init__( - self, - hidden_size: int, - num_heads: int, - mlp_ratio: float, - qkv_bias: bool = False, - ): + def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False): super().__init__() mlp_hidden_dim = int(hidden_size * mlp_ratio) self.num_heads = num_heads self.hidden_size = hidden_size + self.img_mod = Modulation(hidden_size, double=True) - self.img_norm1 = keras.layers.LayerNormalization( - elementwise_affine=False, eps=1e-6 - ) - self.img_attn = SelfAttention( - dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias - ) + self.img_norm1 = keras.layers.LayerNormalization(epsilon=1e-6) + self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias) - self.img_norm2 = keras.layers.LayerNormalization( - elementwise_affine=False, eps=1e-6 - ) - self.img_mlp = keras.Sequential( - [ - keras.layers.Dense(mlp_hidden_dim, use_bias=True), - keras.layers.Activation("tanh"), - keras.layers.Dense(hidden_size, use_bias=True), - ] - ) + self.img_norm2 = keras.layers.LayerNormalization(epsilon=1e-6) + self.img_mlp = keras.Sequential([ + keras.layers.Dense(mlp_hidden_dim, use_bias=True), + keras.layers.Activation("tanh"), + keras.layers.Dense(hidden_size, use_bias=True) + ]) self.txt_mod = Modulation(hidden_size, double=True) - self.txt_norm1 = keras.layers.LayerNormalization( - elementwise_affine=False, eps=1e-6 - ) - self.txt_attn = SelfAttention( - dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias - ) + self.txt_norm1 = keras.layers.LayerNormalization(epsilon=1e-6) + self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias) - self.txt_norm2 = keras.layers.LayerNormalization( - elementwise_affine=False, eps=1e-6 - ) - self.txt_mlp = 
keras.Sequential( - [ - keras.layers.Dense(mlp_hidden_dim, use_bias=True), - keras.layers.Activation("tanh"), - keras.layers.Dense(hidden_size, use_bias=True), - ] - ) + self.txt_norm2 = keras.layers.LayerNormalization(epsilon=1e-6) + self.txt_mlp = keras.Sequential([ + keras.layers.Dense(mlp_hidden_dim, use_bias=True), + keras.layers.Activation("tanh"), + keras.layers.Dense(hidden_size, use_bias=True) + ]) def call(self, img, txt, vec, pe): img_mod1, img_mod2 = self.img_mod(vec) @@ -240,19 +220,15 @@ def call(self, img, txt, vec, pe): img_modulated = self.img_norm1(img) img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift img_qkv = self.img_attn.qkv(img_modulated) - img_q, img_k, img_v = rearrange( - img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads - ) - img_q, img_k = self.img_attn.norm(img_q, img_k, img_v) + img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) + img_q, img_k = self.img_attn.norm(img_q, img_k) # prepare txt for attention txt_modulated = self.txt_norm1(txt) txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift txt_qkv = self.txt_attn.qkv(txt_modulated) - txt_q, txt_k, txt_v = rearrange( - txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads - ) - txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v) + txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) + txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k) # run actual attention q = keras.ops.concatenate((txt_q, img_q), axis=2) @@ -264,13 +240,11 @@ def call(self, img, txt, vec, pe): # calculate the img bloks img = img + img_mod1.gate * self.img_attn.proj(img_attn) - img = img + img_mod2.gate * self.img_mlp( - (1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift - ) + img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift) # calculate the txt bloks txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn) - txt = txt + txt_mod2.gate * self.txt_mlp( - (1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift - ) + txt = txt + txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift) return img, txt + + From 969d508bb0508482adad4fbe2c1c1a8b47c13dfd Mon Sep 17 00:00:00 2001 From: David Landup Date: Wed, 2 Oct 2024 22:51:20 +0900 Subject: [PATCH 10/68] add signle stream layer@ --- keras_hub/src/models/flux/flux_layers.py | 115 +++++++++++++++++++---- keras_hub/src/models/flux/flux_maths.py | 8 +- 2 files changed, 102 insertions(+), 21 deletions(-) diff --git a/keras_hub/src/models/flux/flux_layers.py b/keras_hub/src/models/flux/flux_layers.py index 6e98dd2955..da7459ea95 100644 --- a/keras_hub/src/models/flux/flux_layers.py +++ b/keras_hub/src/models/flux/flux_layers.py @@ -181,36 +181,49 @@ def call(self, x): ) - class DoubleStreamBlock(keras.Model): - def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False): + def __init__( + self, + hidden_size: int, + num_heads: int, + mlp_ratio: float, + qkv_bias: bool = False, + ): super().__init__() mlp_hidden_dim = int(hidden_size * mlp_ratio) self.num_heads = num_heads self.hidden_size = hidden_size - + self.img_mod = Modulation(hidden_size, double=True) self.img_norm1 = keras.layers.LayerNormalization(epsilon=1e-6) - self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias) + self.img_attn = SelfAttention( + dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias + ) self.img_norm2 = 
keras.layers.LayerNormalization(epsilon=1e-6) - self.img_mlp = keras.Sequential([ - keras.layers.Dense(mlp_hidden_dim, use_bias=True), - keras.layers.Activation("tanh"), - keras.layers.Dense(hidden_size, use_bias=True) - ]) + self.img_mlp = keras.Sequential( + [ + keras.layers.Dense(mlp_hidden_dim, use_bias=True), + keras.layers.Activation("tanh"), + keras.layers.Dense(hidden_size, use_bias=True), + ] + ) self.txt_mod = Modulation(hidden_size, double=True) self.txt_norm1 = keras.layers.LayerNormalization(epsilon=1e-6) - self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias) + self.txt_attn = SelfAttention( + dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias + ) self.txt_norm2 = keras.layers.LayerNormalization(epsilon=1e-6) - self.txt_mlp = keras.Sequential([ - keras.layers.Dense(mlp_hidden_dim, use_bias=True), - keras.layers.Activation("tanh"), - keras.layers.Dense(hidden_size, use_bias=True) - ]) + self.txt_mlp = keras.Sequential( + [ + keras.layers.Dense(mlp_hidden_dim, use_bias=True), + keras.layers.Activation("tanh"), + keras.layers.Dense(hidden_size, use_bias=True), + ] + ) def call(self, img, txt, vec, pe): img_mod1, img_mod2 = self.img_mod(vec) @@ -220,14 +233,18 @@ def call(self, img, txt, vec, pe): img_modulated = self.img_norm1(img) img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift img_qkv = self.img_attn.qkv(img_modulated) - img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) + img_q, img_k, img_v = rearrange( + img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads + ) img_q, img_k = self.img_attn.norm(img_q, img_k) # prepare txt for attention txt_modulated = self.txt_norm1(txt) txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift txt_qkv = self.txt_attn.qkv(txt_modulated) - txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) + txt_q, txt_k, txt_v = rearrange( + txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads + ) txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k) # run actual attention @@ -240,11 +257,69 @@ def call(self, img, txt, vec, pe): # calculate the img bloks img = img + img_mod1.gate * self.img_attn.proj(img_attn) - img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift) + img = img + img_mod2.gate * self.img_mlp( + (1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift + ) # calculate the txt bloks txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn) - txt = txt + txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift) + txt = txt + txt_mod2.gate * self.txt_mlp( + (1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift + ) return img, txt - + +class SingleStreamBlock(keras.Model): + """ + A DiT block with parallel linear layers as described in + https://arxiv.org/abs/2302.05442 and adapted modulation interface. 
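The (shift, scale, gate) modulation pattern these blocks share, shown in isolation: an illustrative sketch (not part of the patch) with arbitrary constants standing in for the `Modulation` outputs:

from keras import ops

x_norm = ops.ones((1, 4, 8))   # LayerNorm'ed activations: (batch, tokens, hidden)
shift = ops.zeros((1, 1, 8))
scale = ops.full((1, 1, 8), 0.5)
gate = ops.full((1, 1, 8), 0.1)
modulated = (1 + scale) * x_norm + shift  # adaptive shift/scale: all 1.5 here
update = gate * modulated                 # gated contribution added back to the residual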
+ """ + + def __init__( + self, + hidden_size: int, + num_heads: int, + mlp_ratio: float = 4.0, + qk_scale: float | None = None, + ): + super().__init__() + self.hidden_dim = hidden_size + self.num_heads = num_heads + head_dim = hidden_size // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.mlp_hidden_dim = int(hidden_size * mlp_ratio) + # qkv and mlp_in + self.linear1 = keras.layers.Dense(hidden_size * 3 + self.mlp_hidden_dim) + # proj and mlp_out + self.linear2 = keras.layers.Dense(hidden_size) + + self.norm = QKNorm(head_dim) + + self.hidden_size = hidden_size + self.pre_norm = keras.layers.LayerNormalization(epsilon=1e-6) + self.modulation = Modulation(hidden_size, double=False) + + def call(self, x, vec, pe): + mod, _ = self.modulation(vec) + x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift + qkv, mlp = keras.ops.split( + self.linear1(x_mod), [3 * self.hidden_size], axis=-1 + ) + + q, k, v = rearrange( + qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads + ) + q, k = self.norm(q, k) + print(q.shape, k.shape, v.shape, pe.shape) + + # compute attention + attn = attention(q, k, v, pe=pe) + # compute activation in mlp stream, cat again and run second linear layer + print(mlp.shape, attn.shape) + output = self.linear2( + keras.ops.concatenate( + (attn, keras.activations.gelu(mlp, approximate=True)), 2 + ) + ) + return x + mod.gate * output diff --git a/keras_hub/src/models/flux/flux_maths.py b/keras_hub/src/models/flux/flux_maths.py index 6eebcac260..4f7589a9c3 100644 --- a/keras_hub/src/models/flux/flux_maths.py +++ b/keras_hub/src/models/flux/flux_maths.py @@ -126,7 +126,13 @@ def attention(q, k, v, pe, dropout_p=0.0, is_causal=False): ) # Reshape the output - x = ops.reshape(x, (ops.shape(x)[0], ops.shape(x)[1], -1)) + B, H, L, D = ( + ops.shape(x)[0], + ops.shape(x)[1], + ops.shape(x)[2], + ops.shape(x)[3], + ) + x = ops.reshape(x, (B, L, H * D)) return x From 77c9297707b23df2169682fb6d9c16f7187f654e Mon Sep 17 00:00:00 2001 From: David Landup Date: Fri, 4 Oct 2024 10:20:30 +0900 Subject: [PATCH 11/68] update layers and add flux core model --- keras_hub/src/models/flux/__init__.py | 0 keras_hub/src/models/flux/flux_layers.py | 42 ++++++++ keras_hub/src/models/flux/flux_maths.py | 1 - keras_hub/src/models/flux/flux_model.py | 126 +++++++++++++++++++++++ 4 files changed, 168 insertions(+), 1 deletion(-) create mode 100644 keras_hub/src/models/flux/__init__.py create mode 100644 keras_hub/src/models/flux/flux_model.py diff --git a/keras_hub/src/models/flux/__init__.py b/keras_hub/src/models/flux/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/keras_hub/src/models/flux/flux_layers.py b/keras_hub/src/models/flux/flux_layers.py index da7459ea95..13b3fb9c64 100644 --- a/keras_hub/src/models/flux/flux_layers.py +++ b/keras_hub/src/models/flux/flux_layers.py @@ -21,6 +21,27 @@ from keras import ops from keras_hub.src.models.flux.flux_maths import attention +from keras_hub.src.models.flux.flux_maths import rope + + +class EmbedND(keras.Model): + def __init__(self, dim: int, theta: int, axes_dim: list[int]): + super().__init__() + self.dim = dim + self.theta = theta + self.axes_dim = axes_dim + + def call(self, ids): + n_axes = ids.shape[-1] + emb = keras.ops.concatenate( + [ + rope(ids[..., i], self.axes_dim[i], self.theta) + for i in range(n_axes) + ], + dim=-3, + ) + + return emb.unsqueeze(1) class MLPEmbedder(keras.Model): @@ -323,3 +344,24 @@ def call(self, x, vec, pe): ) ) return x + mod.gate * output + + +class LastLayer(keras.Model): + def 
__init__(self, hidden_size: int, patch_size: int, out_channels: int): + super().__init__() + self.norm_final = keras.layers.LayerNormalization(epsilon=1e-6) + self.linear = keras.layers.Dense( + patch_size * patch_size * out_channels, use_bias=True + ) + self.adaLN_modulation = keras.Sequential( + [ + keras.layers.Activation("silu"), + keras.layers.Dense(2 * hidden_size, use_bias=True), + ] + ) + + def call(self, x, vec): + shift, scale = keras.ops.split(self.adaLN_modulation(vec), 2, axis=1) + x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :] + x = self.linear(x) + return x diff --git a/keras_hub/src/models/flux/flux_maths.py b/keras_hub/src/models/flux/flux_maths.py index 4f7589a9c3..b27d63ce78 100644 --- a/keras_hub/src/models/flux/flux_maths.py +++ b/keras_hub/src/models/flux/flux_maths.py @@ -16,7 +16,6 @@ import keras from einops import rearrange -from keras import KerasTensor from keras import ops diff --git a/keras_hub/src/models/flux/flux_model.py b/keras_hub/src/models/flux/flux_model.py new file mode 100644 index 0000000000..eda1e20909 --- /dev/null +++ b/keras_hub/src/models/flux/flux_model.py @@ -0,0 +1,126 @@ +from dataclasses import dataclass + +import keras + +from keras_hub.src.models.flux.flux_layers import DoubleStreamBlock +from keras_hub.src.models.flux.flux_layers import EmbedND +from keras_hub.src.models.flux.flux_layers import LastLayer +from keras_hub.src.models.flux.flux_layers import MLPEmbedder +from keras_hub.src.models.flux.flux_layers import SingleStreamBlock +from keras_hub.src.models.flux.flux_maths import timestep_embedding + + +@dataclass +class FluxParams: + in_channels: int + vec_in_dim: int + context_in_dim: int + hidden_size: int + mlp_ratio: float + num_heads: int + depth: int + depth_single_blocks: int + axes_dim: list[int] + theta: int + qkv_bias: bool + guidance_embed: bool + + +class Flux(keras.Model): + """ + Transformer model for flow matching on sequences. 
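A construction sketch for the configuration above, using hypothetical scaled-down dimensions (the released Flux checkpoints are much larger); the two constraints checked in `__init__` are that `hidden_size` is divisible by `num_heads` and that `sum(axes_dim)` equals `hidden_size // num_heads`:

from keras_hub.src.models.flux.flux_model import FluxParams

params = FluxParams(
    in_channels=64,
    vec_in_dim=768,
    context_in_dim=4096,
    hidden_size=1024,
    mlp_ratio=4.0,
    num_heads=16,
    depth=2,
    depth_single_blocks=4,
    axes_dim=[16, 24, 24],  # sums to 1024 // 16 == 64
    theta=10_000,
    qkv_bias=True,
    guidance_embed=False,
)
# The dataclass is then consumed as Flux(params).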
+ """ + + def __init__(self, params: FluxParams): + super().__init__() + + self.params = params + self.in_channels = params.in_channels + self.out_channels = self.in_channels + if params.hidden_size % params.num_heads != 0: + raise ValueError( + f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}" + ) + pe_dim = params.hidden_size // params.num_heads + if sum(params.axes_dim) != pe_dim: + raise ValueError( + f"Got {params.axes_dim} but expected positional dim {pe_dim}" + ) + self.hidden_size = params.hidden_size + self.num_heads = params.num_heads + self.pe_embedder = EmbedND( + dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim + ) + self.img_in = keras.layers.Dense(self.hidden_size, use_bias=True) + self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) + self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size) + self.guidance_in = ( + MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) + if params.guidance_embed + else keras.layers.Identity() + ) + self.txt_in = keras.layers.Dense(self.hidden_size) + + self.double_blocks = [ + DoubleStreamBlock( + self.hidden_size, + self.num_heads, + mlp_ratio=params.mlp_ratio, + qkv_bias=params.qkv_bias, + ) + for _ in range(params.depth) + ] + + self.single_blocks = [ + SingleStreamBlock( + self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio + ) + for _ in range(params.depth_single_blocks) + ] + + self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels) + + def call( + self, + img, + img_ids, + txt, + txt_ids, + timesteps, + y, + guidance=None, + ): + if img.ndim != 3 or txt.ndim != 3: + raise ValueError( + "Input img and txt tensors must have 3 dimensions." + ) + + # running on sequences img + img = self.img_in(img) + vec = self.time_in(timestep_embedding(timesteps, 256)) + if self.params.guidance_embed: + if guidance is None: + raise ValueError( + "Didn't get guidance strength for guidance distilled model." + ) + vec = vec + self.guidance_in(timestep_embedding(guidance, 256)) + vec = vec + self.vector_in(y) + txt = self.txt_in(txt) + + ids = keras.ops.concatenate((txt_ids, img_ids), axis=1) + pe = self.pe_embedder(ids) + + for block in self.double_blocks: + img, txt = block(img=img, txt=txt, vec=vec, pe=pe) + + img = keras.ops.concatenate((txt, img), axis=1) + for block in self.single_blocks: + img = block(img, vec=vec, pe=pe) + img = img[:, txt.shape[1] :, ...] + + print("img, vec", img.shape, vec.shape) + + img = self.final_layer( + img, vec + ) # (N, T, patch_size ** 2 * out_channels) + return img From 35769ab993f969c45a3bd8415f636a2a029531c6 Mon Sep 17 00:00:00 2001 From: David Landup Date: Sat, 5 Oct 2024 16:15:55 +0900 Subject: [PATCH 12/68] functions to layers --- keras_hub/src/models/flux/flux_maths.py | 153 ++++++++++++++---------- keras_hub/src/models/flux/flux_model.py | 71 +++++------ 2 files changed, 122 insertions(+), 102 deletions(-) diff --git a/keras_hub/src/models/flux/flux_maths.py b/keras_hub/src/models/flux/flux_maths.py index b27d63ce78..7f313334ef 100644 --- a/keras_hub/src/models/flux/flux_maths.py +++ b/keras_hub/src/models/flux/flux_maths.py @@ -1,5 +1,3 @@ -# Copyright 2024 The KerasHub Authors -# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -12,76 +10,91 @@ # See the License for the specific language governing permissions and # limitations under the License. 
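A shape walk-through of the text/image token packing in `Flux.call` above (illustrative only, toy sizes, text tokens first):

from keras import ops

txt = ops.zeros((1, 12, 64))   # (batch, text_tokens, hidden_size)
img = ops.zeros((1, 24, 64))   # (batch, image_tokens, hidden_size)
seq = ops.concatenate((txt, img), axis=1)  # (1, 36, 64), processed jointly
img_only = seq[:, txt.shape[1] :, ...]     # (1, 24, 64): image tokens sliced back out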
-import math - import keras from einops import rearrange from keras import ops -def timestep_embedding( - t, dim: int, max_period=10000, time_factor: float = 1000.0 -): +class TimestepEmbedding(keras.layers.Layer): """ Creates sinusoidal timestep embeddings. Args: - t (KerasTensor): A 1-D tensor of shape (N,), representing N indices, one per batch element. - These values may be fractional. dim (int): The dimension of the output. max_period (int, optional): Controls the minimum frequency of the embeddings. Defaults to 10000. time_factor (float, optional): A scaling factor applied to `t`. Defaults to 1000.0. + Call Args: + t (KerasTensor): A 1-D tensor of shape (N,), representing N indices, one per batch element. + These values may be fractional. + Returns: KerasTensor: A tensor of shape (N, D) representing the positional embeddings, - where N is the number of batch elements and D is the specified dimension `dim`. + where N is the number of batch elements and D is the specified dimension `dim`. """ - t = time_factor * t - half = dim // 2 - freqs = ops.exp( - -math.log(max_period) * ops.arange(0, half, dtype=float) / half - ) - - args = t[:, None] * freqs[None] - embedding = ops.concatenate([ops.cos(args), ops.sin(args)], axis=-1) - - if dim % 2: - embedding = ops.concatenate( - [embedding, ops.zeros_like(embedding[:, :1])], axis=-1 + def __init__(self, dim, max_period=10000, time_factor=1000.0): + super(TimestepEmbedding, self).__init__() + self.dim = dim + self.max_period = max_period + self.time_factor = time_factor + + def call(self, t): + t = self.time_factor * t + half_dim = self.dim // 2 + freqs = ops.exp( + -ops.log(self.max_period) + * ops.arange(half_dim, dtype="float32") + / half_dim ) + args = t[:, None] * freqs[None] + embedding = ops.concatenate([ops.cos(args), ops.sin(args)], axis=-1) + + if self.dim % 2 != 0: + embedding = ops.concatenate( + [embedding, ops.zeros_like(embedding[:, :1])], axis=-1 + ) - return embedding + return embedding -def rope(pos, dim: int, theta: int): +class RotaryPositionalEmbedding(keras.layers.Layer): """ Applies Rotary Positional Embedding (RoPE) to the input tensor. Args: - pos (KerasTensor): The positional tensor with shape (..., n, d). dim (int): The embedding dimension, should be even. theta (int): The base frequency. + Call Args: + pos (KerasTensor): The positional tensor with shape (..., n, d). + Returns: KerasTensor: The tensor with applied RoPE transformation. """ - assert dim % 2 == 0 - scale = ops.arange(0, dim, 2, dtype="float64") / dim - omega = 1.0 / (theta**scale) - out = ops.einsum("...n,d->...nd", pos, omega) - out = ops.stack( - [ops.cos(out), -ops.sin(out), ops.sin(out), ops.cos(out)], axis=-1 - ) - out = rearrange(out, "... n d (i j) -> ... n d i j", i=2, j=2) - return ops.cast(out, dtype="float32") + + def __init__(self, dim, theta): + super(RotaryPositionalEmbedding, self).__init__() + assert dim % 2 == 0 + self.dim = dim + self.theta = theta + + def call(self, pos): + scale = ops.arange(0, self.dim, 2, dtype="float32") / self.dim + omega = 1.0 / (self.theta**scale) + out = ops.einsum("...n,d->...nd", pos, omega) + out = ops.stack( + [ops.cos(out), -ops.sin(out), ops.sin(out), ops.cos(out)], axis=-1 + ) + out = rearrange(out, "... n d (i j) -> ... n d i j", i=2, j=2) + return ops.cast(out, dtype="float32") -def apply_rope(xq, xk, freqs_cis): +class ApplyRoPE(keras.layers.Layer): """ - Applies the RoPE transformation to the query and key tensors using Keras operations. 
+ Applies the RoPE transformation to the query and key tensors. - Args: + Call Args: xq (KerasTensor): The query tensor of shape (..., L, D). xk (KerasTensor): The key tensor of shape (..., L, D). freqs_cis (KerasTensor): The frequency complex numbers tensor with shape (..., 2). @@ -89,51 +102,67 @@ def apply_rope(xq, xk, freqs_cis): Returns: tuple[KerasTensor, KerasTensor]: The transformed query and key tensors. """ - xq_ = ops.cast(xq, "float32") - xq_ = ops.reshape(xq_, (*xq_.shape[:-1], -1, 1, 2)) - xk_ = ops.cast(xk, "float32") - xk_ = ops.reshape(xk_, (*xk_.shape[:-1], -1, 1, 2)) + def call(self, xq, xk, freqs_cis): + xq_ = ops.reshape( + ops.cast(xq, "float32"), (*ops.shape(xq)[:-1], -1, 1, 2) + ) + xk_ = ops.reshape( + ops.cast(xk, "float32"), (*ops.shape(xk)[:-1], -1, 1, 2) + ) - xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1] - xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1] + xq_out = ( + freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1] + ) + xk_out = ( + freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1] + ) - return (ops.reshape(xq_out, xq.shape), ops.reshape(xk_out, xk.shape)) + return ops.reshape(xq_out, ops.shape(xq)), ops.reshape( + xk_out, ops.shape(xk) + ) -def attention(q, k, v, pe, dropout_p=0.0, is_causal=False): +class FluxRoPEAttention(keras.layers.Layer): """ Computes the attention mechanism with the RoPE transformation applied to the query and key tensors. Args: + dropout_p (float, optional): Dropout probability. Defaults to 0.0. + is_causal (bool, optional): If True, applies causal masking. Defaults to False. + + Call Args: q (KerasTensor): Query tensor of shape (..., L, D). k (KerasTensor): Key tensor of shape (..., S, D). v (KerasTensor): Value tensor of shape (..., S, D). pe (KerasTensor): Positional encoding tensor. - dropout_p (float, optional): Dropout probability. Defaults to 0.0. - is_causal (bool, optional): If True, applies causal masking. Defaults to False. Returns: KerasTensor: The resulting tensor from the attention mechanism. 
""" - # Apply the RoPE transformation - q, k = apply_rope(q, k, pe) - # Calculate attention using the scaled dot product function - x = scaled_dot_product_attention( - q, k, v, dropout_p=dropout_p, is_causal=is_causal - ) + def __init__(self, dropout_p=0.0, is_causal=False): + super(FluxRoPEAttention, self).__init__() + self.dropout_p = dropout_p + self.is_causal = is_causal - # Reshape the output - B, H, L, D = ( - ops.shape(x)[0], - ops.shape(x)[1], - ops.shape(x)[2], - ops.shape(x)[3], - ) - x = ops.reshape(x, (B, L, H * D)) + def call(self, q, k, v, pe): + # Apply the RoPE transformation + q, k = ApplyRoPE()(q, k, pe) + + # Scaled dot-product attention + x = scaled_dot_product_attention( + q, k, v, dropout_p=self.dropout_p, is_causal=self.is_causal + ) - return x + # Reshape the output + B, H, L, D = ( + ops.shape(x)[0], + ops.shape(x)[1], + ops.shape(x)[2], + ops.shape(x)[3], + ) + return ops.reshape(x, (B, L, H * D)) # TODO: This is probably already implemented in several places, but is needed to ensure numeric equivalence to the original diff --git a/keras_hub/src/models/flux/flux_model.py b/keras_hub/src/models/flux/flux_model.py index eda1e20909..dd12f53ec1 100644 --- a/keras_hub/src/models/flux/flux_model.py +++ b/keras_hub/src/models/flux/flux_model.py @@ -1,5 +1,3 @@ -from dataclasses import dataclass - import keras from keras_hub.src.models.flux.flux_layers import DoubleStreamBlock @@ -10,53 +8,48 @@ from keras_hub.src.models.flux.flux_maths import timestep_embedding -@dataclass -class FluxParams: - in_channels: int - vec_in_dim: int - context_in_dim: int - hidden_size: int - mlp_ratio: float - num_heads: int - depth: int - depth_single_blocks: int - axes_dim: list[int] - theta: int - qkv_bias: bool - guidance_embed: bool - - class Flux(keras.Model): """ Transformer model for flow matching on sequences. 
""" - def __init__(self, params: FluxParams): + def __init__( + self, + in_channels: int, + vec_in_dim: int, + context_in_dim: int, + hidden_size: int, + mlp_ratio: float, + num_heads: int, + depth: int, + depth_single_blocks: int, + axes_dim: list[int], + theta: int, + qkv_bias: bool, + guidance_embed: bool, + ): super().__init__() - self.params = params - self.in_channels = params.in_channels + self.in_channels = in_channels self.out_channels = self.in_channels - if params.hidden_size % params.num_heads != 0: + if hidden_size % num_heads != 0: raise ValueError( - f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}" + f"Hidden size {hidden_size} must be divisible by num_heads {num_heads}" ) - pe_dim = params.hidden_size // params.num_heads - if sum(params.axes_dim) != pe_dim: + pe_dim = hidden_size // num_heads + if sum(axes_dim) != pe_dim: raise ValueError( - f"Got {params.axes_dim} but expected positional dim {pe_dim}" + f"Got {axes_dim} but expected positional dim {pe_dim}" ) - self.hidden_size = params.hidden_size - self.num_heads = params.num_heads - self.pe_embedder = EmbedND( - dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim - ) + self.hidden_size = hidden_size + self.num_heads = num_heads + self.pe_embedder = EmbedND(dim=pe_dim, theta=theta, axes_dim=axes_dim) self.img_in = keras.layers.Dense(self.hidden_size, use_bias=True) self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) - self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size) + self.vector_in = MLPEmbedder(vec_in_dim, self.hidden_size) self.guidance_in = ( MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) - if params.guidance_embed + if guidance_embed else keras.layers.Identity() ) self.txt_in = keras.layers.Dense(self.hidden_size) @@ -65,17 +58,17 @@ def __init__(self, params: FluxParams): DoubleStreamBlock( self.hidden_size, self.num_heads, - mlp_ratio=params.mlp_ratio, - qkv_bias=params.qkv_bias, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, ) - for _ in range(params.depth) + for _ in range(depth) ] self.single_blocks = [ SingleStreamBlock( - self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio + self.hidden_size, self.num_heads, mlp_ratio=mlp_ratio ) - for _ in range(params.depth_single_blocks) + for _ in range(depth_single_blocks) ] self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels) @@ -118,8 +111,6 @@ def call( img = block(img, vec=vec, pe=pe) img = img[:, txt.shape[1] :, ...] 
- print("img, vec", img.shape, vec.shape) - img = self.final_layer( img, vec ) # (N, T, patch_size ** 2 * out_channels) From 13d46c40d4a0bbd1ab3a635479f6a6204ad729fa Mon Sep 17 00:00:00 2001 From: David Landup Date: Sat, 5 Oct 2024 16:18:01 +0900 Subject: [PATCH 13/68] refactor layer usage --- keras_hub/src/models/flux/flux_layers.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/keras_hub/src/models/flux/flux_layers.py b/keras_hub/src/models/flux/flux_layers.py index 13b3fb9c64..085718477b 100644 --- a/keras_hub/src/models/flux/flux_layers.py +++ b/keras_hub/src/models/flux/flux_layers.py @@ -20,8 +20,8 @@ from keras import layers from keras import ops -from keras_hub.src.models.flux.flux_maths import attention -from keras_hub.src.models.flux.flux_maths import rope +from keras_hub.src.models.flux.flux_maths import FluxRoPEAttention +from keras_hub.src.models.flux.flux_maths import RotaryPositionalEmbedding class EmbedND(keras.Model): @@ -30,12 +30,13 @@ def __init__(self, dim: int, theta: int, axes_dim: list[int]): self.dim = dim self.theta = theta self.axes_dim = axes_dim + self.rope = RotaryPositionalEmbedding() def call(self, ids): n_axes = ids.shape[-1] emb = keras.ops.concatenate( [ - rope(ids[..., i], self.axes_dim[i], self.theta) + self.rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes) ], dim=-3, @@ -163,6 +164,7 @@ def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False): self.qkv = layers.Dense(dim * 3, use_bias=qkv_bias) self.norm = QKNorm(head_dim) self.proj = layers.Dense(dim) + self.attention = FluxRoPEAttention() def call(self, x, pe): qkv = self.qkv(x) @@ -170,7 +172,7 @@ def call(self, x, pe): qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads ) q, k = self.norm(q, k) - x = attention(q, k, v, pe=pe) + x = self.attention(q, k, v, pe=pe) x = self.proj(x) return x @@ -245,6 +247,7 @@ def __init__( keras.layers.Dense(hidden_size, use_bias=True), ] ) + self.attention = FluxRoPEAttention() def call(self, img, txt, vec, pe): img_mod1, img_mod2 = self.img_mod(vec) @@ -273,7 +276,7 @@ def call(self, img, txt, vec, pe): k = keras.ops.concatenate((txt_k, img_k), axis=2) v = keras.ops.concatenate((txt_v, img_v), axis=2) - attn = attention(q, k, v, pe=pe) + attn = self.attention(q, k, v, pe=pe) txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :] # calculate the img bloks @@ -320,6 +323,7 @@ def __init__( self.hidden_size = hidden_size self.pre_norm = keras.layers.LayerNormalization(epsilon=1e-6) self.modulation = Modulation(hidden_size, double=False) + self.attention = FluxRoPEAttention() def call(self, x, vec, pe): mod, _ = self.modulation(vec) @@ -335,9 +339,8 @@ def call(self, x, vec, pe): print(q.shape, k.shape, v.shape, pe.shape) # compute attention - attn = attention(q, k, v, pe=pe) + attn = self.attention(q, k, v, pe=pe) # compute activation in mlp stream, cat again and run second linear layer - print(mlp.shape, attn.shape) output = self.linear2( keras.ops.concatenate( (attn, keras.activations.gelu(mlp, approximate=True)), 2 From c00c6a57a548000ff0c969f889196e74056682d0 Mon Sep 17 00:00:00 2001 From: David Landup Date: Sat, 5 Oct 2024 16:18:47 +0900 Subject: [PATCH 14/68] refactor layer usage --- keras_hub/src/models/flux/flux_model.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/keras_hub/src/models/flux/flux_model.py b/keras_hub/src/models/flux/flux_model.py index dd12f53ec1..694a2411e3 100644 --- a/keras_hub/src/models/flux/flux_model.py +++ 
b/keras_hub/src/models/flux/flux_model.py @@ -5,7 +5,7 @@ from keras_hub.src.models.flux.flux_layers import LastLayer from keras_hub.src.models.flux.flux_layers import MLPEmbedder from keras_hub.src.models.flux.flux_layers import SingleStreamBlock -from keras_hub.src.models.flux.flux_maths import timestep_embedding +from keras_hub.src.models.flux.flux_maths import TimestepEmbedding class Flux(keras.Model): @@ -72,6 +72,7 @@ def __init__( ] self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels) + self.timestep_embedding = TimestepEmbedding() def call( self, @@ -90,13 +91,13 @@ def call( # running on sequences img img = self.img_in(img) - vec = self.time_in(timestep_embedding(timesteps, 256)) + vec = self.time_in(self.timestep_embedding(timesteps, 256)) if self.params.guidance_embed: if guidance is None: raise ValueError( "Didn't get guidance strength for guidance distilled model." ) - vec = vec + self.guidance_in(timestep_embedding(guidance, 256)) + vec = vec + self.guidance_in(self.timestep_embedding(guidance, 256)) vec = vec + self.vector_in(y) txt = self.txt_in(txt) From 05a1e3fe617a8d7d0c56d8d30f9099a5a498a475 Mon Sep 17 00:00:00 2001 From: David Landup Date: Sat, 5 Oct 2024 16:27:02 +0900 Subject: [PATCH 15/68] position math args in call() --- keras_hub/src/models/flux/flux_maths.py | 40 ++++++++----------------- 1 file changed, 13 insertions(+), 27 deletions(-) diff --git a/keras_hub/src/models/flux/flux_maths.py b/keras_hub/src/models/flux/flux_maths.py index 7f313334ef..5fcdb88816 100644 --- a/keras_hub/src/models/flux/flux_maths.py +++ b/keras_hub/src/models/flux/flux_maths.py @@ -19,38 +19,31 @@ class TimestepEmbedding(keras.layers.Layer): """ Creates sinusoidal timestep embeddings. - Args: - dim (int): The dimension of the output. - max_period (int, optional): Controls the minimum frequency of the embeddings. Defaults to 10000. - time_factor (float, optional): A scaling factor applied to `t`. Defaults to 1000.0. Call Args: t (KerasTensor): A 1-D tensor of shape (N,), representing N indices, one per batch element. These values may be fractional. + dim (int): The dimension of the output. + max_period (int, optional): Controls the minimum frequency of the embeddings. Defaults to 10000. + time_factor (float, optional): A scaling factor applied to `t`. Defaults to 1000.0. Returns: KerasTensor: A tensor of shape (N, D) representing the positional embeddings, where N is the number of batch elements and D is the specified dimension `dim`. """ - def __init__(self, dim, max_period=10000, time_factor=1000.0): - super(TimestepEmbedding, self).__init__() - self.dim = dim - self.max_period = max_period - self.time_factor = time_factor - - def call(self, t): - t = self.time_factor * t - half_dim = self.dim // 2 + def call(self, t, dim, max_period=10000, time_factor=1000.0): + t = time_factor * t + half_dim = dim // 2 freqs = ops.exp( - -ops.log(self.max_period) + -ops.log(max_period) * ops.arange(half_dim, dtype="float32") / half_dim ) args = t[:, None] * freqs[None] embedding = ops.concatenate([ops.cos(args), ops.sin(args)], axis=-1) - if self.dim % 2 != 0: + if dim % 2 != 0: embedding = ops.concatenate( [embedding, ops.zeros_like(embedding[:, :1])], axis=-1 ) @@ -62,26 +55,19 @@ class RotaryPositionalEmbedding(keras.layers.Layer): """ Applies Rotary Positional Embedding (RoPE) to the input tensor. - Args: - dim (int): The embedding dimension, should be even. - theta (int): The base frequency. Call Args: pos (KerasTensor): The positional tensor with shape (..., n, d). 
+ dim (int): The embedding dimension, should be even. + theta (int): The base frequency. Returns: KerasTensor: The tensor with applied RoPE transformation. """ - def __init__(self, dim, theta): - super(RotaryPositionalEmbedding, self).__init__() - assert dim % 2 == 0 - self.dim = dim - self.theta = theta - - def call(self, pos): - scale = ops.arange(0, self.dim, 2, dtype="float32") / self.dim - omega = 1.0 / (self.theta**scale) + def call(self, pos, dim, theta): + scale = ops.arange(0, dim, 2, dtype="float32") / dim + omega = 1.0 / (theta**scale) out = ops.einsum("...n,d->...nd", pos, omega) out = ops.stack( [ops.cos(out), -ops.sin(out), ops.sin(out), ops.cos(out)], axis=-1 From f076006b64397dbb30094d9d45a3d2a637fa01f0 Mon Sep 17 00:00:00 2001 From: David Landup Date: Sat, 5 Oct 2024 16:54:20 +0900 Subject: [PATCH 16/68] name arguments --- keras_hub/src/models/flux/flux_layers.py | 9 ++++----- keras_hub/src/models/flux/flux_model.py | 6 ++++-- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/keras_hub/src/models/flux/flux_layers.py b/keras_hub/src/models/flux/flux_layers.py index 085718477b..c62811bb20 100644 --- a/keras_hub/src/models/flux/flux_layers.py +++ b/keras_hub/src/models/flux/flux_layers.py @@ -36,7 +36,7 @@ def call(self, ids): n_axes = ids.shape[-1] emb = keras.ops.concatenate( [ - self.rope(ids[..., i], self.axes_dim[i], self.theta) + self.rope(ids[..., i], dim=self.axes_dim[i], theta=self.theta) for i in range(n_axes) ], dim=-3, @@ -172,7 +172,7 @@ def call(self, x, pe): qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads ) q, k = self.norm(q, k) - x = self.attention(q, k, v, pe=pe) + x = self.attention(q=q, k=k, v=v, pe=pe) x = self.proj(x) return x @@ -276,7 +276,7 @@ def call(self, img, txt, vec, pe): k = keras.ops.concatenate((txt_k, img_k), axis=2) v = keras.ops.concatenate((txt_v, img_v), axis=2) - attn = self.attention(q, k, v, pe=pe) + attn = self.attention(q=q, k=k, v=v, pe=pe) txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :] # calculate the img bloks @@ -336,10 +336,9 @@ def call(self, x, vec, pe): qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads ) q, k = self.norm(q, k) - print(q.shape, k.shape, v.shape, pe.shape) # compute attention - attn = self.attention(q, k, v, pe=pe) + attn = self.attention(q, k=k, v=v, pe=pe) # compute activation in mlp stream, cat again and run second linear layer output = self.linear2( keras.ops.concatenate( diff --git a/keras_hub/src/models/flux/flux_model.py b/keras_hub/src/models/flux/flux_model.py index 694a2411e3..66d6b2891e 100644 --- a/keras_hub/src/models/flux/flux_model.py +++ b/keras_hub/src/models/flux/flux_model.py @@ -91,13 +91,15 @@ def call( # running on sequences img img = self.img_in(img) - vec = self.time_in(self.timestep_embedding(timesteps, 256)) + vec = self.time_in(self.timestep_embedding(timesteps, dim=256)) if self.params.guidance_embed: if guidance is None: raise ValueError( "Didn't get guidance strength for guidance distilled model." 
) - vec = vec + self.guidance_in(self.timestep_embedding(guidance, 256)) + vec = vec + self.guidance_in( + self.timestep_embedding(guidance, dim=256) + ) vec = vec + self.vector_in(y) txt = self.txt_in(txt) From f9fc4a42b796860fa478eefe184a901f63964318 Mon Sep 17 00:00:00 2001 From: David Landup Date: Sat, 5 Oct 2024 16:57:57 +0900 Subject: [PATCH 17/68] fix arg name --- keras_hub/src/models/flux/flux_layers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/keras_hub/src/models/flux/flux_layers.py b/keras_hub/src/models/flux/flux_layers.py index c62811bb20..41ce58d70a 100644 --- a/keras_hub/src/models/flux/flux_layers.py +++ b/keras_hub/src/models/flux/flux_layers.py @@ -39,10 +39,10 @@ def call(self, ids): self.rope(ids[..., i], dim=self.axes_dim[i], theta=self.theta) for i in range(n_axes) ], - dim=-3, + axis=-3, ) - return emb.unsqueeze(1) + return keras.ops.expand_dims(emb, axis=1) class MLPEmbedder(keras.Model): From f2f2c967f4f7d698629bb823c64114ed33df0117 Mon Sep 17 00:00:00 2001 From: David Landup Date: Sat, 5 Oct 2024 17:24:56 +0900 Subject: [PATCH 18/68] start adding conversion script utils --- keras_hub/src/models/flux/convert_weights.py | 44 ++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 keras_hub/src/models/flux/convert_weights.py diff --git a/keras_hub/src/models/flux/convert_weights.py b/keras_hub/src/models/flux/convert_weights.py new file mode 100644 index 0000000000..b184080748 --- /dev/null +++ b/keras_hub/src/models/flux/convert_weights.py @@ -0,0 +1,44 @@ +def convert_mlpembedder_weights(pytorch_model, keras_model): + """ + Convert weights from PyTorch MLPEmbedder to Keras MLPEmbedderKeras. + """ + pytorch_in_layer_weight = ( + pytorch_model.in_layer.weight.detach().cpu().numpy() + ) + pytorch_in_layer_bias = pytorch_model.in_layer.bias.detach().cpu().numpy() + + pytorch_out_layer_weight = ( + pytorch_model.out_layer.weight.detach().cpu().numpy() + ) + pytorch_out_layer_bias = pytorch_model.out_layer.bias.detach().cpu().numpy() + + keras_model.in_layer.set_weights( + [pytorch_in_layer_weight.T, pytorch_in_layer_bias] + ) + keras_model.out_layer.set_weights( + [pytorch_out_layer_weight.T, pytorch_out_layer_bias] + ) + + +def convert_selfattention_weights(pytorch_model, keras_model): + """ + Convert weights from PyTorch SelfAttention to Keras SelfAttentionKeras. 
+ """ + + # Extract PyTorch weights + pytorch_qkv_weight = pytorch_model.qkv.weight.detach().cpu().numpy() + pytorch_qkv_bias = ( + pytorch_model.qkv.bias.detach().cpu().numpy() + if pytorch_model.qkv.bias is not None + else None + ) + + pytorch_proj_weight = pytorch_model.proj.weight.detach().cpu().numpy() + pytorch_proj_bias = pytorch_model.proj.bias.detach().cpu().numpy() + + # Set Keras weights (Dense layers use [weight, bias] format) + keras_model.qkv.set_weights( + [pytorch_qkv_weight.T] + + ([pytorch_qkv_bias] if pytorch_qkv_bias is not None else []) + ) + keras_model.proj.set_weights([pytorch_proj_weight.T, pytorch_proj_bias]) From 311d34211d021d00ab986cdf483695864d6a96ab Mon Sep 17 00:00:00 2001 From: David Landup Date: Sun, 6 Oct 2024 13:12:01 +0900 Subject: [PATCH 19/68] change reshape into rearrange --- keras_hub/src/models/flux/flux_maths.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/keras_hub/src/models/flux/flux_maths.py b/keras_hub/src/models/flux/flux_maths.py index 5fcdb88816..357f42ab33 100644 --- a/keras_hub/src/models/flux/flux_maths.py +++ b/keras_hub/src/models/flux/flux_maths.py @@ -148,7 +148,8 @@ def call(self, q, k, v, pe): ops.shape(x)[2], ops.shape(x)[3], ) - return ops.reshape(x, (B, L, H * D)) + x = rearrange(x, "B H L D -> B L (H D)") + return x # TODO: This is probably already implemented in several places, but is needed to ensure numeric equivalence to the original From db14c01195855f784dcb4bc8e128c5041c81ada2 Mon Sep 17 00:00:00 2001 From: David Landup Date: Sun, 6 Oct 2024 19:34:56 +0900 Subject: [PATCH 20/68] add rest of weight conversion and remove redundant shape extraction --- keras_hub/src/models/flux/convert_weights.py | 143 +++++++++++++++++-- keras_hub/src/models/flux/flux_maths.py | 7 - 2 files changed, 135 insertions(+), 15 deletions(-) diff --git a/keras_hub/src/models/flux/convert_weights.py b/keras_hub/src/models/flux/convert_weights.py index b184080748..0cb02850da 100644 --- a/keras_hub/src/models/flux/convert_weights.py +++ b/keras_hub/src/models/flux/convert_weights.py @@ -1,7 +1,4 @@ def convert_mlpembedder_weights(pytorch_model, keras_model): - """ - Convert weights from PyTorch MLPEmbedder to Keras MLPEmbedderKeras. - """ pytorch_in_layer_weight = ( pytorch_model.in_layer.weight.detach().cpu().numpy() ) @@ -21,11 +18,7 @@ def convert_mlpembedder_weights(pytorch_model, keras_model): def convert_selfattention_weights(pytorch_model, keras_model): - """ - Convert weights from PyTorch SelfAttention to Keras SelfAttentionKeras. 
- """ - # Extract PyTorch weights pytorch_qkv_weight = pytorch_model.qkv.weight.detach().cpu().numpy() pytorch_qkv_bias = ( pytorch_model.qkv.bias.detach().cpu().numpy() @@ -36,9 +29,143 @@ def convert_selfattention_weights(pytorch_model, keras_model): pytorch_proj_weight = pytorch_model.proj.weight.detach().cpu().numpy() pytorch_proj_bias = pytorch_model.proj.bias.detach().cpu().numpy() - # Set Keras weights (Dense layers use [weight, bias] format) keras_model.qkv.set_weights( [pytorch_qkv_weight.T] + ([pytorch_qkv_bias] if pytorch_qkv_bias is not None else []) ) keras_model.proj.set_weights([pytorch_proj_weight.T, pytorch_proj_bias]) + + +def convert_modulation_weights(pytorch_model, keras_model): + pytorch_weight = pytorch_model.lin.weight.detach().cpu().numpy() + pytorch_bias = pytorch_model.lin.bias.detach().cpu().numpy() + + keras_model.lin.set_weights([pytorch_weight.T, pytorch_bias]) + + +def convert_doublestreamblock_weights(pytorch_model, keras_model): + # Convert img_mod weights + convert_modulation_weights(pytorch_model.img_mod, keras_model.img_mod) + + # Convert txt_mod weights + convert_modulation_weights(pytorch_model.txt_mod, keras_model.txt_mod) + + # Convert img_attn weights + convert_selfattention_weights(pytorch_model.img_attn, keras_model.img_attn) + + # Convert txt_attn weights + convert_selfattention_weights(pytorch_model.txt_attn, keras_model.txt_attn) + + # Convert img_mlp weights (2 Linear layers in PyTorch -> 2 Dense layers in Keras) + keras_model.img_mlp.layers[0].set_weights( + [ + pytorch_model.img_mlp[0].weight.detach().cpu().numpy().T, + pytorch_model.img_mlp[0].bias.detach().cpu().numpy(), + ] + ) + keras_model.img_mlp.layers[2].set_weights( + [ + pytorch_model.img_mlp[2].weight.detach().cpu().numpy().T, + pytorch_model.img_mlp[2].bias.detach().cpu().numpy(), + ] + ) + + # Convert txt_mlp weights (2 Linear layers in PyTorch -> 2 Dense layers in Keras) + keras_model.txt_mlp.layers[0].set_weights( + [ + pytorch_model.txt_mlp[0].weight.detach().cpu().numpy().T, + pytorch_model.txt_mlp[0].bias.detach().cpu().numpy(), + ] + ) + keras_model.txt_mlp.layers[2].set_weights( + [ + pytorch_model.txt_mlp[2].weight.detach().cpu().numpy().T, + pytorch_model.txt_mlp[2].bias.detach().cpu().numpy(), + ] + ) + + +def convert_singlestreamblock_weights(pytorch_model, keras_model): + convert_modulation_weights(pytorch_model.modulation, keras_model.modulation) + + # Convert linear1 (Dense) weights + keras_model.linear1.set_weights( + [ + pytorch_model.linear1.weight.detach().cpu().numpy().T, + pytorch_model.linear1.bias.detach().cpu().numpy(), + ] + ) + + # Convert linear2 (Dense) weights + keras_model.linear2.set_weights( + [ + pytorch_model.linear2.weight.detach().cpu().numpy().T, + pytorch_model.linear2.bias.detach().cpu().numpy(), + ] + ) + + +def convert_lastlayer_weights(pytorch_model, keras_model): + + # Convert linear (Dense) weights + keras_model.linear.set_weights( + [ + pytorch_model.linear.weight.detach().cpu().numpy().T, + pytorch_model.linear.bias.detach().cpu().numpy(), + ] + ) + + # Convert adaLN_modulation (Sequential) weights + keras_model.adaLN_modulation.layers[1].set_weights( + [ + pytorch_model.adaLN_modulation[1].weight.detach().cpu().numpy().T, + pytorch_model.adaLN_modulation[1].bias.detach().cpu().numpy(), + ] + ) + + +def convert_flux_weights(pytorch_model, keras_model): + # Convert img_in (Dense) weights + keras_model.img_in.set_weights( + [ + pytorch_model.img_in.weight.detach().cpu().numpy().T, + pytorch_model.img_in.bias.detach().cpu().numpy(), + ] 
+ ) + + # Convert time_in (MLPEmbedder) weights + convert_mlpembedder_weights(pytorch_model.time_in, keras_model.time_in) + + # Convert vector_in (MLPEmbedder) weights + convert_mlpembedder_weights(pytorch_model.vector_in, keras_model.vector_in) + + # Convert guidance_in (if present) + if keras_model.params.guidance_embed: + convert_mlpembedder_weights( + pytorch_model.guidance_in, keras_model.guidance_in + ) + + # Convert txt_in (Dense) weights + keras_model.txt_in.set_weights( + [ + pytorch_model.txt_in.weight.detach().cpu().numpy().T, + pytorch_model.txt_in.bias.detach().cpu().numpy(), + ] + ) + + # Convert double_blocks (DoubleStreamBlock) weights + for pt_block, keras_block in zip( + pytorch_model.double_blocks, keras_model.double_blocks + ): + convert_doublestreamblock_weights(pt_block, keras_block) + + # Convert single_blocks (SingleStreamBlock) weights + for pt_block, keras_block in zip( + pytorch_model.single_blocks, keras_model.single_blocks + ): + convert_singlestreamblock_weights(pt_block, keras_block) + + # Convert final_layer (LastLayer) weights + convert_lastlayer_weights( + pytorch_model.final_layer, keras_model.final_layer + ) diff --git a/keras_hub/src/models/flux/flux_maths.py b/keras_hub/src/models/flux/flux_maths.py index 357f42ab33..f344a919b9 100644 --- a/keras_hub/src/models/flux/flux_maths.py +++ b/keras_hub/src/models/flux/flux_maths.py @@ -141,13 +141,6 @@ def call(self, q, k, v, pe): q, k, v, dropout_p=self.dropout_p, is_causal=self.is_causal ) - # Reshape the output - B, H, L, D = ( - ops.shape(x)[0], - ops.shape(x)[1], - ops.shape(x)[2], - ops.shape(x)[3], - ) x = rearrange(x, "B H L D -> B L (H D)") return x From c5b37c6b4cbd7d2d6ad5f9fdfad791e6a5bd4105 Mon Sep 17 00:00:00 2001 From: David Landup Date: Sun, 6 Oct 2024 19:52:28 +0900 Subject: [PATCH 21/68] fix mlpembedder arg --- keras_hub/src/models/flux/flux_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/keras_hub/src/models/flux/flux_model.py b/keras_hub/src/models/flux/flux_model.py index 66d6b2891e..069f3e8d82 100644 --- a/keras_hub/src/models/flux/flux_model.py +++ b/keras_hub/src/models/flux/flux_model.py @@ -45,10 +45,10 @@ def __init__( self.num_heads = num_heads self.pe_embedder = EmbedND(dim=pe_dim, theta=theta, axes_dim=axes_dim) self.img_in = keras.layers.Dense(self.hidden_size, use_bias=True) - self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) + self.time_in = MLPEmbedder(hidden_dim=self.hidden_size) self.vector_in = MLPEmbedder(vec_in_dim, self.hidden_size) self.guidance_in = ( - MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) + MLPEmbedder(hidden_dim=self.hidden_size) if guidance_embed else keras.layers.Identity() ) From 8d3a385873c7146134e00780eb228c76feab5282 Mon Sep 17 00:00:00 2001 From: David Landup Date: Sun, 6 Oct 2024 20:09:23 +0900 Subject: [PATCH 22/68] remove redundant args --- keras_hub/src/models/flux/flux_model.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/keras_hub/src/models/flux/flux_model.py b/keras_hub/src/models/flux/flux_model.py index 069f3e8d82..f1baebef21 100644 --- a/keras_hub/src/models/flux/flux_model.py +++ b/keras_hub/src/models/flux/flux_model.py @@ -16,8 +16,6 @@ class Flux(keras.Model): def __init__( self, in_channels: int, - vec_in_dim: int, - context_in_dim: int, hidden_size: int, mlp_ratio: float, num_heads: int, @@ -46,7 +44,7 @@ def __init__( self.pe_embedder = EmbedND(dim=pe_dim, theta=theta, axes_dim=axes_dim) self.img_in = keras.layers.Dense(self.hidden_size, use_bias=True) 
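
Dropping vec_in_dim in the lines below works because keras.layers.Dense creates its kernel lazily from the first input it sees, so MLPEmbedder only ever needs its output width. A minimal illustration of that lazy build (the 768 input width is illustrative, not a Flux dimension):

import numpy as np
from keras import layers

dense = layers.Dense(3072)                  # no input dim declared
_ = dense(np.zeros((1, 768), "float32"))    # input dim inferred as 768
assert dense.kernel.shape == (768, 3072)
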
self.time_in = MLPEmbedder(hidden_dim=self.hidden_size) - self.vector_in = MLPEmbedder(vec_in_dim, self.hidden_size) + self.vector_in = MLPEmbedder(hidden_dim=self.hidden_size) self.guidance_in = ( MLPEmbedder(hidden_dim=self.hidden_size) if guidance_embed From fa5379e73e2b296cd90d1a3a16729251439f45a9 Mon Sep 17 00:00:00 2001 From: David Landup Date: Sun, 6 Oct 2024 20:19:58 +0900 Subject: [PATCH 23/68] fix params. to self. --- keras_hub/src/models/flux/flux_model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/keras_hub/src/models/flux/flux_model.py b/keras_hub/src/models/flux/flux_model.py index f1baebef21..ffa89958ba 100644 --- a/keras_hub/src/models/flux/flux_model.py +++ b/keras_hub/src/models/flux/flux_model.py @@ -71,6 +71,7 @@ def __init__( self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels) self.timestep_embedding = TimestepEmbedding() + self.guidance_embed = guidance_embed def call( self, @@ -90,7 +91,7 @@ def call( # running on sequences img img = self.img_in(img) vec = self.time_in(self.timestep_embedding(timesteps, dim=256)) - if self.params.guidance_embed: + if self.guidance_embed: if guidance is None: raise ValueError( "Didn't get guidance strength for guidance distilled model." From 34e2477cbbb0eb208a40f462bd402a3bcd8df9c6 Mon Sep 17 00:00:00 2001 From: David Landup Date: Sun, 6 Oct 2024 20:51:22 +0900 Subject: [PATCH 24/68] add license --- keras_hub/src/models/flux/convert_weights.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/keras_hub/src/models/flux/convert_weights.py b/keras_hub/src/models/flux/convert_weights.py index 0cb02850da..dc3e124956 100644 --- a/keras_hub/src/models/flux/convert_weights.py +++ b/keras_hub/src/models/flux/convert_weights.py @@ -1,3 +1,16 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
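
The params-to-attribute change above also clarifies the calling convention: a guidance strength is only consumed when the model was built with guidance_embed=True (a guidance-distilled checkpoint), and passing None in that case raises the ValueError shown earlier. A hedged sketch, with tensor names taken from the conversion script later in this series and model assumed to be a constructed Flux instance:

pred = model(
    img=img,
    txt=txt,
    img_ids=img_ids,
    txt_ids=txt_ids,
    timesteps=timesteps,
    y=y,
    guidance=guidance if model.guidance_embed else None,
)
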
+ + def convert_mlpembedder_weights(pytorch_model, keras_model): pytorch_in_layer_weight = ( pytorch_model.in_layer.weight.detach().cpu().numpy() @@ -140,7 +153,7 @@ def convert_flux_weights(pytorch_model, keras_model): convert_mlpembedder_weights(pytorch_model.vector_in, keras_model.vector_in) # Convert guidance_in (if present) - if keras_model.params.guidance_embed: + if keras_model.guidance_embed: convert_mlpembedder_weights( pytorch_model.guidance_in, keras_model.guidance_in ) From cdd397a61f43f9a8dbcfb7f4199392b53a665778 Mon Sep 17 00:00:00 2001 From: David Landup Date: Sun, 6 Oct 2024 20:51:29 +0900 Subject: [PATCH 25/68] add einops --- requirements-common.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements-common.txt b/requirements-common.txt index 2bdc4a5720..b21dc49b1f 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -19,3 +19,4 @@ sentencepiece tensorflow-datasets safetensors pillow +einops From 8169aa49f397a95ace0c730e0de7b90560c500a6 Mon Sep 17 00:00:00 2001 From: David Landup Date: Sun, 6 Oct 2024 21:42:47 +0900 Subject: [PATCH 26/68] fix default arg --- keras_hub/src/models/flux/convert_weights.py | 2 +- keras_hub/src/models/flux/flux_layers.py | 74 +++++++++++++++++++- 2 files changed, 74 insertions(+), 2 deletions(-) diff --git a/keras_hub/src/models/flux/convert_weights.py b/keras_hub/src/models/flux/convert_weights.py index dc3e124956..70a7b0046c 100644 --- a/keras_hub/src/models/flux/convert_weights.py +++ b/keras_hub/src/models/flux/convert_weights.py @@ -68,7 +68,7 @@ def convert_doublestreamblock_weights(pytorch_model, keras_model): # Convert txt_attn weights convert_selfattention_weights(pytorch_model.txt_attn, keras_model.txt_attn) - + # Convert img_mlp weights (2 Linear layers in PyTorch -> 2 Dense layers in Keras) keras_model.img_mlp.layers[0].set_weights( [ diff --git a/keras_hub/src/models/flux/flux_layers.py b/keras_hub/src/models/flux/flux_layers.py index 41ce58d70a..367b9e9fe4 100644 --- a/keras_hub/src/models/flux/flux_layers.py +++ b/keras_hub/src/models/flux/flux_layers.py @@ -25,7 +25,21 @@ class EmbedND(keras.Model): + """ + Embedding layer for N-dimensional inputs using Rotary Positional Embedding (RoPE). + + This layer applies RoPE embeddings across multiple axes of the input tensor and + concatenates the embeddings along a specified axis. + """ def __init__(self, dim: int, theta: int, axes_dim: list[int]): + """ + Initializes the EmbedND layer. + + Args: + dim (int): Dimensionality of the embedding. + theta (int): Rotational angle parameter for RoPE. + axes_dim (list[int]): Dimensionality for each axis of the input tensor. + """ super().__init__() self.dim = dim self.theta = theta @@ -33,6 +47,15 @@ def __init__(self, dim: int, theta: int, axes_dim: list[int]): self.rope = RotaryPositionalEmbedding() def call(self, ids): + """ + Computes the positional embeddings for each axis and concatenates them. + + Args: + ids (KerasTensor): Input tensor of shape (..., num_axes). + + Returns: + KerasTensor: Positional embeddings of shape (..., concatenated_dim, 1, ...). + """ n_axes = ids.shape[-1] emb = keras.ops.concatenate( [ @@ -156,7 +179,22 @@ def call( class SelfAttention(keras.Model): + """ + Multi-head self-attention layer with RoPE embeddings and RMS normalization. + + This layer performs self-attention over the input sequence and applies RMS + normalization to the query and key tensors before computing the attention scores. 
+ """ def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False): + """ + Initializes the SelfAttention layer. + + Args: + dim (int): Dimensionality of the input tensor. + num_heads (int): Number of attention heads. Default is 8. + qkv_bias (bool): Whether to use bias in the query, key, value projection layers. + Default is False. + """ super().__init__() self.num_heads = num_heads head_dim = dim // num_heads @@ -167,6 +205,16 @@ def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False): self.attention = FluxRoPEAttention() def call(self, x, pe): + """ + Applies self-attention with RoPE embeddings. + + Args: + x (KerasTensor): Input tensor of shape (batch_size, seq_len, dim). + pe (KerasTensor): Positional encoding tensor for RoPE. + + Returns: + KerasTensor: Output tensor after self-attention and projection. + """ qkv = self.qkv(x) q, k, v = rearrange( qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads @@ -185,13 +233,37 @@ class ModulationOut: class Modulation(keras.Model): + """ + Modulation layer that produces shift, scale, and gate tensors. + + This layer applies a SiLU activation to the input tensor followed by a linear + transformation to generate modulation parameters. It can optionally generate two + sets of modulation parameters. + """ def __init__(self, dim, double): + """ + Initializes the Modulation layer. + + Args: + dim (int): Dimensionality of the modulation output. + double (bool): Whether to generate two sets of modulation parameters. + """ super().__init__() self.is_double = double self.multiplier = 6 if double else 3 self.lin = keras.layers.Dense(self.multiplier * dim, use_bias=True) def call(self, x): + """ + Generates modulation parameters from the input tensor. + + Args: + x (KerasTensor): Input tensor. + + Returns: + tuple[ModulationOut, ModulationOut | None]: A tuple containing the shift, + scale, and gate tensors. If `double` is True, returns two sets of modulation parameters. + """ x = keras.layers.Activation("silu")(x) out = self.lin(x) out = keras.ops.split( @@ -304,7 +376,7 @@ def __init__( hidden_size: int, num_heads: int, mlp_ratio: float = 4.0, - qk_scale: float | None = None, + qk_scale: float = None, ): super().__init__() self.hidden_dim = hidden_size From b1caa7faebdf921a15be781ce944f79f6e66fd42 Mon Sep 17 00:00:00 2001 From: David Landup Date: Sun, 6 Oct 2024 21:46:03 +0900 Subject: [PATCH 27/68] expand docstrings --- keras_hub/src/models/flux/convert_weights.py | 2 +- keras_hub/src/models/flux/flux_layers.py | 72 ++++++++++++++++++-- 2 files changed, 68 insertions(+), 6 deletions(-) diff --git a/keras_hub/src/models/flux/convert_weights.py b/keras_hub/src/models/flux/convert_weights.py index 70a7b0046c..dc3e124956 100644 --- a/keras_hub/src/models/flux/convert_weights.py +++ b/keras_hub/src/models/flux/convert_weights.py @@ -68,7 +68,7 @@ def convert_doublestreamblock_weights(pytorch_model, keras_model): # Convert txt_attn weights convert_selfattention_weights(pytorch_model.txt_attn, keras_model.txt_attn) - + # Convert img_mlp weights (2 Linear layers in PyTorch -> 2 Dense layers in Keras) keras_model.img_mlp.layers[0].set_weights( [ diff --git a/keras_hub/src/models/flux/flux_layers.py b/keras_hub/src/models/flux/flux_layers.py index 367b9e9fe4..5d8974d178 100644 --- a/keras_hub/src/models/flux/flux_layers.py +++ b/keras_hub/src/models/flux/flux_layers.py @@ -28,9 +28,10 @@ class EmbedND(keras.Model): """ Embedding layer for N-dimensional inputs using Rotary Positional Embedding (RoPE). 
- This layer applies RoPE embeddings across multiple axes of the input tensor and + This layer applies RoPE embeddings across multiple axes of the input tensor and concatenates the embeddings along a specified axis. """ + def __init__(self, dim: int, theta: int, axes_dim: list[int]): """ Initializes the EmbedND layer. @@ -182,9 +183,10 @@ class SelfAttention(keras.Model): """ Multi-head self-attention layer with RoPE embeddings and RMS normalization. - This layer performs self-attention over the input sequence and applies RMS + This layer performs self-attention over the input sequence and applies RMS normalization to the query and key tensors before computing the attention scores. """ + def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False): """ Initializes the SelfAttention layer. @@ -236,10 +238,11 @@ class Modulation(keras.Model): """ Modulation layer that produces shift, scale, and gate tensors. - This layer applies a SiLU activation to the input tensor followed by a linear - transformation to generate modulation parameters. It can optionally generate two + This layer applies a SiLU activation to the input tensor followed by a linear + transformation to generate modulation parameters. It can optionally generate two sets of modulation parameters. """ + def __init__(self, dim, double): """ Initializes the Modulation layer. @@ -261,7 +264,7 @@ def call(self, x): x (KerasTensor): Input tensor. Returns: - tuple[ModulationOut, ModulationOut | None]: A tuple containing the shift, + tuple[ModulationOut, ModulationOut | None]: A tuple containing the shift, scale, and gate tensors. If `double` is True, returns two sets of modulation parameters. """ x = keras.layers.Activation("silu")(x) @@ -277,6 +280,17 @@ def call(self, x): class DoubleStreamBlock(keras.Model): + """ + A block that processes image and text inputs in parallel using + self-attention and MLP layers, with modulation. + + Args: + hidden_size (int): The hidden dimension size for the model. + num_heads (int): The number of attention heads. + mlp_ratio (float): The ratio of the MLP hidden dimension to the hidden size. + qkv_bias (bool, optional): Whether to include bias in QKV projection. Default is False. + """ + def __init__( self, hidden_size: int, @@ -322,6 +336,18 @@ def __init__( self.attention = FluxRoPEAttention() def call(self, img, txt, vec, pe): + """ + Forward pass for the DoubleStreamBlock. + + Args: + img (KerasTensor): Input image tensor. + txt (KerasTensor): Input text tensor. + vec (KerasTensor): Modulation vector. + pe (KerasTensor): Positional encoding tensor. + + Returns: + Tuple[KerasTensor, KerasTensor]: The modified image and text tensors. + """ img_mod1, img_mod2 = self.img_mod(vec) txt_mod1, txt_mod2 = self.txt_mod(vec) @@ -369,6 +395,12 @@ class SingleStreamBlock(keras.Model): """ A DiT block with parallel linear layers as described in https://arxiv.org/abs/2302.05442 and adapted modulation interface. + + Args: + hidden_size (int): The hidden dimension size for the model. + num_heads (int): The number of attention heads. + mlp_ratio (float, optional): The ratio of the MLP hidden dimension to the hidden size. Default is 4.0. + qk_scale (float, optional): Scaling factor for the query-key product. Default is None. """ def __init__( @@ -398,6 +430,17 @@ def __init__( self.attention = FluxRoPEAttention() def call(self, x, vec, pe): + """ + Forward pass for the SingleStreamBlock. + + Args: + x (KerasTensor): Input tensor. + vec (KerasTensor): Modulation vector. 
+ pe (KerasTensor): Positional encoding tensor. + + Returns: + KerasTensor: The modified input tensor after processing. + """ mod, _ = self.modulation(vec) x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift qkv, mlp = keras.ops.split( @@ -421,6 +464,15 @@ def call(self, x, vec, pe): class LastLayer(keras.Model): + """ + Final layer for processing output tensors with adaptive normalization. + + Args: + hidden_size (int): The hidden dimension size for the model. + patch_size (int): The size of each patch. + out_channels (int): The number of output channels. + """ + def __init__(self, hidden_size: int, patch_size: int, out_channels: int): super().__init__() self.norm_final = keras.layers.LayerNormalization(epsilon=1e-6) @@ -435,6 +487,16 @@ def __init__(self, hidden_size: int, patch_size: int, out_channels: int): ) def call(self, x, vec): + """ + Forward pass for the LastLayer. + + Args: + x (KerasTensor): Input tensor. + vec (KerasTensor): Modulation vector. + + Returns: + KerasTensor: The output tensor after final processing. + """ shift, scale = keras.ops.split(self.adaLN_modulation(vec), 2, axis=1) x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :] x = self.linear(x) From 76eae8382ff4341e2a0a62247b4caee2018f3762 Mon Sep 17 00:00:00 2001 From: David Landup Date: Sun, 6 Oct 2024 21:51:11 +0900 Subject: [PATCH 28/68] tanh to gelu --- keras_hub/src/models/flux/flux_layers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/keras_hub/src/models/flux/flux_layers.py b/keras_hub/src/models/flux/flux_layers.py index 5d8974d178..57767c9950 100644 --- a/keras_hub/src/models/flux/flux_layers.py +++ b/keras_hub/src/models/flux/flux_layers.py @@ -314,7 +314,7 @@ def __init__( self.img_mlp = keras.Sequential( [ keras.layers.Dense(mlp_hidden_dim, use_bias=True), - keras.layers.Activation("tanh"), + keras.layers.Activation("gelu"), keras.layers.Dense(hidden_size, use_bias=True), ] ) @@ -329,7 +329,7 @@ def __init__( self.txt_mlp = keras.Sequential( [ keras.layers.Dense(mlp_hidden_dim, use_bias=True), - keras.layers.Activation("tanh"), + keras.layers.Activation("gelu"), keras.layers.Dense(hidden_size, use_bias=True), ] ) From c0236acfbb9fa26f4a8e6ffd085543778761adf9 Mon Sep 17 00:00:00 2001 From: David Landup Date: Sun, 6 Oct 2024 22:34:02 +0900 Subject: [PATCH 29/68] refactor weight conversion into tools --- .../convert_flux_checkpoints.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) rename keras_hub/src/models/flux/convert_weights.py => tools/checkpoint_conversion/convert_flux_checkpoints.py (92%) diff --git a/keras_hub/src/models/flux/convert_weights.py b/tools/checkpoint_conversion/convert_flux_checkpoints.py similarity index 92% rename from keras_hub/src/models/flux/convert_weights.py rename to tools/checkpoint_conversion/convert_flux_checkpoints.py index dc3e124956..3c5bba3dd3 100644 --- a/keras_hub/src/models/flux/convert_weights.py +++ b/tools/checkpoint_conversion/convert_flux_checkpoints.py @@ -10,6 +10,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
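
Everything in this script hinges on one layout convention: PyTorch nn.Linear stores its kernel as (out_features, in_features), while keras.layers.Dense expects (in_features, out_features), hence the .T on every weight matrix below. A minimal sketch with illustrative shapes:

import numpy as np
from keras import layers

dense = layers.Dense(3072)
dense.build((None, 64))

pt_weight = np.random.rand(3072, 64).astype("float32")  # torch: (out, in)
pt_bias = np.random.rand(3072).astype("float32")
dense.set_weights([pt_weight.T, pt_bias])               # keras: (in, out)
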
+import torch +from diffusers import FluxPipeline + +from keras_hub.src.models.flux.flux_model import Flux + def convert_mlpembedder_weights(pytorch_model, keras_model): pytorch_in_layer_weight = ( @@ -182,3 +187,22 @@ def convert_flux_weights(pytorch_model, keras_model): convert_lastlayer_weights( pytorch_model.final_layer, keras_model.final_layer ) + + +def main(_): + model_id = "black-forest-labs/FLUX.1-schnell" + pipe = FluxPipeline.from_pretrained( + "black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16 + ) + pipe.enable_model_cpu_offload() + + original_model = pipe.transformer + keras_model = Flux() + + # for each layer, call the appropriate functions from above + + # save keras model + + +if __name__ == "__main__": + main() From b4186597808f342e6448f5e3bd069a7b7101e40b Mon Sep 17 00:00:00 2001 From: David Landup Date: Mon, 7 Oct 2024 10:44:34 +0900 Subject: [PATCH 30/68] update weight conversion --- .../convert_flux_checkpoints.py | 39 ++++++++++++------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/tools/checkpoint_conversion/convert_flux_checkpoints.py b/tools/checkpoint_conversion/convert_flux_checkpoints.py index 3c5bba3dd3..583367bbde 100644 --- a/tools/checkpoint_conversion/convert_flux_checkpoints.py +++ b/tools/checkpoint_conversion/convert_flux_checkpoints.py @@ -10,8 +10,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import torch -from diffusers import FluxPipeline +# Requires installation of source code from +# https://github.com/black-forest-labs/flux + +from flux import util from keras_hub.src.models.flux.flux_model import Flux @@ -190,18 +192,27 @@ def convert_flux_weights(pytorch_model, keras_model): def main(_): - model_id = "black-forest-labs/FLUX.1-schnell" - pipe = FluxPipeline.from_pretrained( - "black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16 - ) - pipe.enable_model_cpu_offload() - - original_model = pipe.transformer - keras_model = Flux() - - # for each layer, call the appropriate functions from above - - # save keras model + original_flux_model = util.load_flow_model( + name="flux-schnell", device="cpu" + ) + keras_model = Flux( + in_channels=64, + hidden_size=3072, + mlp_ratio=4.0, + num_heads=24, + depth=19, + depth_single_blocks=38, + axes_dim=[16, 56, 56], + theta=10_000, + qkv_bias=True, + guidance_embed=False, + ) + + convert_flux_weights(original_flux_model, keras_model) + + # TODO: + # validation + # save if __name__ == "__main__": From 99839af7d817332adf040fbb45f2cddc766dc0d8 Mon Sep 17 00:00:00 2001 From: David Landup Date: Mon, 7 Oct 2024 10:55:30 +0900 Subject: [PATCH 31/68] add stand-in presets until weights are uploaded --- keras_hub/src/models/flux/__init__.py | 5 +++++ keras_hub/src/models/flux/flux_presets.py | 16 ++++++++++++++++ 2 files changed, 21 insertions(+) create mode 100644 keras_hub/src/models/flux/flux_presets.py diff --git a/keras_hub/src/models/flux/__init__.py b/keras_hub/src/models/flux/__init__.py index e69de29bb2..7d6f4f9637 100644 --- a/keras_hub/src/models/flux/__init__.py +++ b/keras_hub/src/models/flux/__init__.py @@ -0,0 +1,5 @@ +from keras_hub.src.models.flux.flux_model import Flux +from keras_hub.src.models.flux.flux_presets import presets +from keras_hub.src.utils.preset_utils import register_presets + +register_presets(presets, Flux) diff --git a/keras_hub/src/models/flux/flux_presets.py b/keras_hub/src/models/flux/flux_presets.py new file mode 100644 index 0000000000..af66ff074d --- /dev/null +++ 
b/keras_hub/src/models/flux/flux_presets.py @@ -0,0 +1,16 @@ +"""FLUX model preset configurations.""" + +presets = { + "schnell": { + "metadata": { + "description": ( + "A 12 billion parameter rectified flow transformer capable of generating images from text descriptions." + ), + "params": 124439808, + "official_name": "FLUX.1-schnell", + "path": "flux", + "model_card": "https://github.com/black-forest-labs/flux/blob/main/model_cards/FLUX.1-schnell.md", + }, + "kaggle_handle": "TBA", + }, +} From ac5c4b13d3c43fee4a72b0fc50af57d7c5fd66e5 Mon Sep 17 00:00:00 2001 From: David Landup Date: Mon, 7 Oct 2024 11:20:24 +0900 Subject: [PATCH 32/68] set float32 to t.dtype in timestep embedding --- keras_hub/src/models/flux/flux_maths.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keras_hub/src/models/flux/flux_maths.py b/keras_hub/src/models/flux/flux_maths.py index f344a919b9..5470469673 100644 --- a/keras_hub/src/models/flux/flux_maths.py +++ b/keras_hub/src/models/flux/flux_maths.py @@ -37,7 +37,7 @@ def call(self, t, dim, max_period=10000, time_factor=1000.0): half_dim = dim // 2 freqs = ops.exp( -ops.log(max_period) - * ops.arange(half_dim, dtype="float32") + * ops.arange(half_dim, dtype=t.dtype) / half_dim ) args = t[:, None] * freqs[None] From 89dc08c594d03b9d96b92afd14023cc2c79d5d9d Mon Sep 17 00:00:00 2001 From: David Landup Date: Mon, 7 Oct 2024 11:21:51 +0900 Subject: [PATCH 33/68] update more float32s into dynamic types --- keras_hub/src/models/flux/flux_maths.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/keras_hub/src/models/flux/flux_maths.py b/keras_hub/src/models/flux/flux_maths.py index 5470469673..3d72c59b9f 100644 --- a/keras_hub/src/models/flux/flux_maths.py +++ b/keras_hub/src/models/flux/flux_maths.py @@ -66,14 +66,14 @@ class RotaryPositionalEmbedding(keras.layers.Layer): """ def call(self, pos, dim, theta): - scale = ops.arange(0, dim, 2, dtype="float32") / dim + scale = ops.arange(0, dim, 2, dtype=pos.dtype) / dim omega = 1.0 / (theta**scale) out = ops.einsum("...n,d->...nd", pos, omega) out = ops.stack( [ops.cos(out), -ops.sin(out), ops.sin(out), ops.cos(out)], axis=-1 ) out = rearrange(out, "... n d (i j) -> ... 
n d i j", i=2, j=2) - return ops.cast(out, dtype="float32") + return ops.cast(out, pos.dtype) class ApplyRoPE(keras.layers.Layer): @@ -90,12 +90,8 @@ class ApplyRoPE(keras.layers.Layer): """ def call(self, xq, xk, freqs_cis): - xq_ = ops.reshape( - ops.cast(xq, "float32"), (*ops.shape(xq)[:-1], -1, 1, 2) - ) - xk_ = ops.reshape( - ops.cast(xk, "float32"), (*ops.shape(xk)[:-1], -1, 1, 2) - ) + xq_ = ops.reshape(xq, (*ops.shape(xq)[:-1], -1, 1, 2)) + xk_ = ops.reshape(xq, (*ops.shape(xk)[:-1], -1, 1, 2)) xq_out = ( freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1] @@ -173,9 +169,7 @@ def scaled_dot_product_attention( """ L, S = ops.shape(query)[-2], ops.shape(key)[-2] scale_factor = ( - 1 / ops.sqrt(ops.cast(ops.shape(query)[-1], "float32")) - if scale is None - else scale + 1 / ops.sqrt(ops.shape(query)[-1]) if scale is None else scale ) attn_bias = ops.zeros((L, S), dtype=query.dtype) From d3de26bd582aa46cc54b16948b987b20b271f327 Mon Sep 17 00:00:00 2001 From: David Landup Date: Mon, 7 Oct 2024 11:27:50 +0900 Subject: [PATCH 34/68] dtype --- keras_hub/src/models/flux/flux_maths.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keras_hub/src/models/flux/flux_maths.py b/keras_hub/src/models/flux/flux_maths.py index 3d72c59b9f..6f0243316c 100644 --- a/keras_hub/src/models/flux/flux_maths.py +++ b/keras_hub/src/models/flux/flux_maths.py @@ -36,7 +36,7 @@ def call(self, t, dim, max_period=10000, time_factor=1000.0): t = time_factor * t half_dim = dim // 2 freqs = ops.exp( - -ops.log(max_period) + ops.cast(-ops.log(max_period), dtype=t.dtype) * ops.arange(half_dim, dtype=t.dtype) / half_dim ) From 9d4aa222bf7ecc51a886a8491e57d305bb6bfa3c Mon Sep 17 00:00:00 2001 From: David Landup Date: Mon, 7 Oct 2024 11:30:25 +0900 Subject: [PATCH 35/68] dtype --- keras_hub/src/models/flux/flux_maths.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/keras_hub/src/models/flux/flux_maths.py b/keras_hub/src/models/flux/flux_maths.py index 6f0243316c..5470469673 100644 --- a/keras_hub/src/models/flux/flux_maths.py +++ b/keras_hub/src/models/flux/flux_maths.py @@ -36,7 +36,7 @@ def call(self, t, dim, max_period=10000, time_factor=1000.0): t = time_factor * t half_dim = dim // 2 freqs = ops.exp( - ops.cast(-ops.log(max_period), dtype=t.dtype) + -ops.log(max_period) * ops.arange(half_dim, dtype=t.dtype) / half_dim ) @@ -66,14 +66,14 @@ class RotaryPositionalEmbedding(keras.layers.Layer): """ def call(self, pos, dim, theta): - scale = ops.arange(0, dim, 2, dtype=pos.dtype) / dim + scale = ops.arange(0, dim, 2, dtype="float32") / dim omega = 1.0 / (theta**scale) out = ops.einsum("...n,d->...nd", pos, omega) out = ops.stack( [ops.cos(out), -ops.sin(out), ops.sin(out), ops.cos(out)], axis=-1 ) out = rearrange(out, "... n d (i j) -> ... 
n d i j", i=2, j=2) - return ops.cast(out, pos.dtype) + return ops.cast(out, dtype="float32") class ApplyRoPE(keras.layers.Layer): @@ -90,8 +90,12 @@ class ApplyRoPE(keras.layers.Layer): """ def call(self, xq, xk, freqs_cis): - xq_ = ops.reshape(xq, (*ops.shape(xq)[:-1], -1, 1, 2)) - xk_ = ops.reshape(xq, (*ops.shape(xk)[:-1], -1, 1, 2)) + xq_ = ops.reshape( + ops.cast(xq, "float32"), (*ops.shape(xq)[:-1], -1, 1, 2) + ) + xk_ = ops.reshape( + ops.cast(xk, "float32"), (*ops.shape(xk)[:-1], -1, 1, 2) + ) xq_out = ( freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1] @@ -169,7 +173,9 @@ def scaled_dot_product_attention( """ L, S = ops.shape(query)[-2], ops.shape(key)[-2] scale_factor = ( - 1 / ops.sqrt(ops.shape(query)[-1]) if scale is None else scale + 1 / ops.sqrt(ops.cast(ops.shape(query)[-1], "float32")) + if scale is None + else scale ) attn_bias = ops.zeros((L, S), dtype=query.dtype) From dbddde76f600a6407da93c988bd3692418557581 Mon Sep 17 00:00:00 2001 From: David Landup Date: Mon, 7 Oct 2024 11:49:19 +0900 Subject: [PATCH 36/68] enable float16 mode --- keras_hub/src/models/flux/flux_maths.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/keras_hub/src/models/flux/flux_maths.py b/keras_hub/src/models/flux/flux_maths.py index 5470469673..5796a670f5 100644 --- a/keras_hub/src/models/flux/flux_maths.py +++ b/keras_hub/src/models/flux/flux_maths.py @@ -36,7 +36,7 @@ def call(self, t, dim, max_period=10000, time_factor=1000.0): t = time_factor * t half_dim = dim // 2 freqs = ops.exp( - -ops.log(max_period) + ops.cast(-ops.log(max_period), dtype=t.dtype) * ops.arange(half_dim, dtype=t.dtype) / half_dim ) @@ -90,12 +90,8 @@ class ApplyRoPE(keras.layers.Layer): """ def call(self, xq, xk, freqs_cis): - xq_ = ops.reshape( - ops.cast(xq, "float32"), (*ops.shape(xq)[:-1], -1, 1, 2) - ) - xk_ = ops.reshape( - ops.cast(xk, "float32"), (*ops.shape(xk)[:-1], -1, 1, 2) - ) + xq_ = ops.reshape(xq, (*ops.shape(xq)[:-1], -1, 1, 2)) + xk_ = ops.reshape(xk, (*ops.shape(xk)[:-1], -1, 1, 2)) xq_out = ( freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1] @@ -173,7 +169,7 @@ def scaled_dot_product_attention( """ L, S = ops.shape(query)[-2], ops.shape(key)[-2] scale_factor = ( - 1 / ops.sqrt(ops.cast(ops.shape(query)[-1], "float32")) + 1 / ops.sqrt(ops.cast(ops.shape(query)[-1], dtype=query.dtype)) if scale is None else scale ) From b3c75a918effc199edd18afcef5282736093eb40 Mon Sep 17 00:00:00 2001 From: David Landup Date: Mon, 7 Oct 2024 12:11:50 +0900 Subject: [PATCH 37/68] update conversion script to not require flux repo --- .../convert_flux_checkpoints.py | 230 ++++++++++-------- 1 file changed, 131 insertions(+), 99 deletions(-) diff --git a/tools/checkpoint_conversion/convert_flux_checkpoints.py b/tools/checkpoint_conversion/convert_flux_checkpoints.py index 583367bbde..42c52f5abd 100644 --- a/tools/checkpoint_conversion/convert_flux_checkpoints.py +++ b/tools/checkpoint_conversion/convert_flux_checkpoints.py @@ -13,188 +13,199 @@ # Requires installation of source code from # https://github.com/black-forest-labs/flux -from flux import util +import os + +import keras +import numpy as np +from safetensors import safe_open from keras_hub.src.models.flux.flux_model import Flux +DOWNLOAD_URL = "https://huggingface.co/black-forest-labs/FLUX.1-schnell/resolve/main/flux1-schnell.safetensors" +keras.config.set_dtype_policy("mixed_float16") -def convert_mlpembedder_weights(pytorch_model, keras_model): - pytorch_in_layer_weight = ( - 
pytorch_model.in_layer.weight.detach().cpu().numpy() - ) - pytorch_in_layer_bias = pytorch_model.in_layer.bias.detach().cpu().numpy() - pytorch_out_layer_weight = ( - pytorch_model.out_layer.weight.detach().cpu().numpy() - ) - pytorch_out_layer_bias = pytorch_model.out_layer.bias.detach().cpu().numpy() +def convert_mlpembedder_weights(weights_dict, keras_model, prefix): + in_layer_weight = weights_dict[f"{prefix}.in_layer.weight"].T + in_layer_bias = weights_dict[f"{prefix}.in_layer.bias"] - keras_model.in_layer.set_weights( - [pytorch_in_layer_weight.T, pytorch_in_layer_bias] - ) - keras_model.out_layer.set_weights( - [pytorch_out_layer_weight.T, pytorch_out_layer_bias] - ) + out_layer_weight = weights_dict[f"{prefix}.out_layer.weight"].T + out_layer_bias = weights_dict[f"{prefix}.out_layer.bias"] + keras_model.in_layer.set_weights([in_layer_weight, in_layer_bias]) + keras_model.out_layer.set_weights([out_layer_weight, out_layer_bias]) -def convert_selfattention_weights(pytorch_model, keras_model): - pytorch_qkv_weight = pytorch_model.qkv.weight.detach().cpu().numpy() - pytorch_qkv_bias = ( - pytorch_model.qkv.bias.detach().cpu().numpy() - if pytorch_model.qkv.bias is not None - else None - ) +def convert_selfattention_weights(weights_dict, keras_model, prefix): + qkv_weight = weights_dict[f"{prefix}.qkv.weight"].T + qkv_bias = weights_dict.get(f"{prefix}.qkv.bias") - pytorch_proj_weight = pytorch_model.proj.weight.detach().cpu().numpy() - pytorch_proj_bias = pytorch_model.proj.bias.detach().cpu().numpy() + proj_weight = weights_dict[f"{prefix}.proj.weight"].T + proj_bias = weights_dict[f"{prefix}.proj.bias"] keras_model.qkv.set_weights( - [pytorch_qkv_weight.T] - + ([pytorch_qkv_bias] if pytorch_qkv_bias is not None else []) + [qkv_weight] + ([qkv_bias] if qkv_bias is not None else []) ) - keras_model.proj.set_weights([pytorch_proj_weight.T, pytorch_proj_bias]) + keras_model.proj.set_weights([proj_weight, proj_bias]) -def convert_modulation_weights(pytorch_model, keras_model): - pytorch_weight = pytorch_model.lin.weight.detach().cpu().numpy() - pytorch_bias = pytorch_model.lin.bias.detach().cpu().numpy() +def convert_modulation_weights(weights_dict, keras_model, prefix): + lin_weight = weights_dict[f"{prefix}.lin.weight"].T + lin_bias = weights_dict[f"{prefix}.lin.bias"] - keras_model.lin.set_weights([pytorch_weight.T, pytorch_bias]) + keras_model.lin.set_weights([lin_weight, lin_bias]) -def convert_doublestreamblock_weights(pytorch_model, keras_model): +def convert_doublestreamblock_weights(weights_dict, keras_model, block_idx): # Convert img_mod weights - convert_modulation_weights(pytorch_model.img_mod, keras_model.img_mod) + convert_modulation_weights( + weights_dict, keras_model.img_mod, f"double_blocks.{block_idx}.img_mod" + ) # Convert txt_mod weights - convert_modulation_weights(pytorch_model.txt_mod, keras_model.txt_mod) + convert_modulation_weights( + weights_dict, keras_model.txt_mod, f"double_blocks.{block_idx}.txt_mod" + ) # Convert img_attn weights - convert_selfattention_weights(pytorch_model.img_attn, keras_model.img_attn) + convert_selfattention_weights( + weights_dict, + keras_model.img_attn, + f"double_blocks.{block_idx}.img_attn", + ) # Convert txt_attn weights - convert_selfattention_weights(pytorch_model.txt_attn, keras_model.txt_attn) + convert_selfattention_weights( + weights_dict, + keras_model.txt_attn, + f"double_blocks.{block_idx}.txt_attn", + ) - # Convert img_mlp weights (2 Linear layers in PyTorch -> 2 Dense layers in Keras) + # Convert img_mlp weights (2 
layers) keras_model.img_mlp.layers[0].set_weights( [ - pytorch_model.img_mlp[0].weight.detach().cpu().numpy().T, - pytorch_model.img_mlp[0].bias.detach().cpu().numpy(), + weights_dict[f"double_blocks.{block_idx}.img_mlp.0.weight"].T, + weights_dict[f"double_blocks.{block_idx}.img_mlp.0.bias"], ] ) keras_model.img_mlp.layers[2].set_weights( [ - pytorch_model.img_mlp[2].weight.detach().cpu().numpy().T, - pytorch_model.img_mlp[2].bias.detach().cpu().numpy(), + weights_dict[f"double_blocks.{block_idx}.img_mlp.2.weight"].T, + weights_dict[f"double_blocks.{block_idx}.img_mlp.2.bias"], ] ) - # Convert txt_mlp weights (2 Linear layers in PyTorch -> 2 Dense layers in Keras) + # Convert txt_mlp weights (2 layers) keras_model.txt_mlp.layers[0].set_weights( [ - pytorch_model.txt_mlp[0].weight.detach().cpu().numpy().T, - pytorch_model.txt_mlp[0].bias.detach().cpu().numpy(), + weights_dict[f"double_blocks.{block_idx}.txt_mlp.0.weight"].T, + weights_dict[f"double_blocks.{block_idx}.txt_mlp.0.bias"], ] ) keras_model.txt_mlp.layers[2].set_weights( [ - pytorch_model.txt_mlp[2].weight.detach().cpu().numpy().T, - pytorch_model.txt_mlp[2].bias.detach().cpu().numpy(), + weights_dict[f"double_blocks.{block_idx}.txt_mlp.2.weight"].T, + weights_dict[f"double_blocks.{block_idx}.txt_mlp.2.bias"], ] ) -def convert_singlestreamblock_weights(pytorch_model, keras_model): - convert_modulation_weights(pytorch_model.modulation, keras_model.modulation) +def convert_singlestreamblock_weights(weights_dict, keras_model, block_idx): + convert_modulation_weights( + weights_dict, + keras_model.modulation, + f"single_blocks.{block_idx}.modulation", + ) - # Convert linear1 (Dense) weights + # Convert linear1 weights keras_model.linear1.set_weights( [ - pytorch_model.linear1.weight.detach().cpu().numpy().T, - pytorch_model.linear1.bias.detach().cpu().numpy(), + weights_dict[f"single_blocks.{block_idx}.linear1.weight"].T, + weights_dict[f"single_blocks.{block_idx}.linear1.bias"], ] ) - # Convert linear2 (Dense) weights + # Convert linear2 weights keras_model.linear2.set_weights( [ - pytorch_model.linear2.weight.detach().cpu().numpy().T, - pytorch_model.linear2.bias.detach().cpu().numpy(), + weights_dict[f"single_blocks.{block_idx}.linear2.weight"].T, + weights_dict[f"single_blocks.{block_idx}.linear2.bias"], ] ) -def convert_lastlayer_weights(pytorch_model, keras_model): - - # Convert linear (Dense) weights +def convert_lastlayer_weights(weights_dict, keras_model): + # Convert linear weights keras_model.linear.set_weights( [ - pytorch_model.linear.weight.detach().cpu().numpy().T, - pytorch_model.linear.bias.detach().cpu().numpy(), + weights_dict["final_layer.linear.weight"].T, + weights_dict["final_layer.linear.bias"], ] ) - # Convert adaLN_modulation (Sequential) weights + # Convert adaLN_modulation weights keras_model.adaLN_modulation.layers[1].set_weights( [ - pytorch_model.adaLN_modulation[1].weight.detach().cpu().numpy().T, - pytorch_model.adaLN_modulation[1].bias.detach().cpu().numpy(), + weights_dict["final_layer.adaLN_modulation.1.weight"].T, + weights_dict["final_layer.adaLN_modulation.1.bias"], ] ) -def convert_flux_weights(pytorch_model, keras_model): - # Convert img_in (Dense) weights +def convert_flux_weights(weights_dict, keras_model): + # Convert img_in weights keras_model.img_in.set_weights( - [ - pytorch_model.img_in.weight.detach().cpu().numpy().T, - pytorch_model.img_in.bias.detach().cpu().numpy(), - ] + [weights_dict["img_in.weight"].T, weights_dict["img_in.bias"]] ) - # Convert time_in (MLPEmbedder) weights - 
convert_mlpembedder_weights(pytorch_model.time_in, keras_model.time_in) + # Convert time_in weights (MLPEmbedder) + convert_mlpembedder_weights(weights_dict, keras_model.time_in, "time_in") - # Convert vector_in (MLPEmbedder) weights - convert_mlpembedder_weights(pytorch_model.vector_in, keras_model.vector_in) + # Convert vector_in weights (MLPEmbedder) + convert_mlpembedder_weights( + weights_dict, keras_model.vector_in, "vector_in" + ) - # Convert guidance_in (if present) - if keras_model.guidance_embed: + # Convert guidance_in weights (if present) + if hasattr(keras_model, "guidance_embed"): convert_mlpembedder_weights( - pytorch_model.guidance_in, keras_model.guidance_in + weights_dict, keras_model.guidance_in, "guidance_in" ) - # Convert txt_in (Dense) weights + # Convert txt_in weights keras_model.txt_in.set_weights( - [ - pytorch_model.txt_in.weight.detach().cpu().numpy().T, - pytorch_model.txt_in.bias.detach().cpu().numpy(), - ] + [weights_dict["txt_in.weight"].T, weights_dict["txt_in.bias"]] ) - # Convert double_blocks (DoubleStreamBlock) weights - for pt_block, keras_block in zip( - pytorch_model.double_blocks, keras_model.double_blocks - ): - convert_doublestreamblock_weights(pt_block, keras_block) + # Convert double_blocks weights + for block_idx in range(len(keras_model.double_blocks)): + convert_doublestreamblock_weights( + weights_dict, keras_model.double_blocks[block_idx], block_idx + ) - # Convert single_blocks (SingleStreamBlock) weights - for pt_block, keras_block in zip( - pytorch_model.single_blocks, keras_model.single_blocks - ): - convert_singlestreamblock_weights(pt_block, keras_block) + # Convert single_blocks weights + for block_idx in range(len(keras_model.single_blocks)): + convert_singlestreamblock_weights( + weights_dict, keras_model.single_blocks[block_idx], block_idx + ) - # Convert final_layer (LastLayer) weights - convert_lastlayer_weights( - pytorch_model.final_layer, keras_model.final_layer - ) + # Convert final_layer weights + convert_lastlayer_weights(weights_dict, keras_model.final_layer) def main(_): - original_flux_model = util.load_flow_model( - name="flux-schnell", device="cpu" - ) + # get the original weights + print("Downloading weights") + + os.system(f"wget {DOWNLOAD_URL}") + + flux_weights = {} + with safe_open( + "flux1-schnell.safetensors", framework="pt", device="cpu" + ) as f: + for key in f.keys(): + flux_weights[key] = f.get_tensor(key) + keras_model = Flux( in_channels=64, hidden_size=3072, @@ -208,12 +219,33 @@ def main(_): guidance_embed=False, ) - convert_flux_weights(original_flux_model, keras_model) + # Run dummy input to build model + img = np.random.rand(1, 96, 64).astype(np.float16) + txt = np.random.rand(1, 96, 64).astype(np.float16) + img_ids = np.random.randint(0, 100, (1, 96, 3)).astype(np.float16) + txt_ids = np.random.randint(0, 100, (1, 96, 3)).astype(np.float16) + timesteps = np.random.rand(32).astype(np.float16) + y = np.random.rand(1, 64).astype(np.float16) + guidance = np.random.rand(32).astype(np.float16) + + keras_model( + img=img, + txt=txt, + img_ids=img_ids, + txt_ids=txt_ids, + timesteps=timesteps, + y=y, + guidance=guidance, + ) + + convert_flux_weights(flux_weights, keras_model) # TODO: # validation # save + os.remove("flux1-schnell.safetensors") + if __name__ == "__main__": main() From 4333bab872ff0092e15c6f99a122826e57100bee Mon Sep 17 00:00:00 2001 From: David Landup Date: Mon, 7 Oct 2024 13:15:55 +0900 Subject: [PATCH 38/68] add build() methods to avoid running dummy input through model --- 
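
The dummy forward pass in main() above exists only to force weight creation: subclassed Keras layers build lazily, and set_weights() fails on an unbuilt layer. The build() methods added in this commit make that materialization explicit and shape-only. A sketch of the two options on a single layer (shapes are illustrative):

import numpy as np
from keras import layers

layer = layers.Dense(8)

# Option used by the conversion script: one dummy forward pass.
_ = layer(np.zeros((1, 4), "float32"))

# Option added in this commit: build from shapes alone, no data needed.
# layer.build((None, 4))

layer.set_weights([np.ones((4, 8), "float32"), np.zeros(8, "float32")])
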
keras_hub/src/models/flux/flux_layers.py | 54 ++++++++++++++++++++++++ keras_hub/src/models/flux/flux_model.py | 51 ++++++++++++++++++++++ 2 files changed, 105 insertions(+) diff --git a/keras_hub/src/models/flux/flux_layers.py b/keras_hub/src/models/flux/flux_layers.py index 57767c9950..a369d55399 100644 --- a/keras_hub/src/models/flux/flux_layers.py +++ b/keras_hub/src/models/flux/flux_layers.py @@ -47,6 +47,11 @@ def __init__(self, dim: int, theta: int, axes_dim: list[int]): self.axes_dim = axes_dim self.rope = RotaryPositionalEmbedding() + def build(self, input_shape): + n_axes = input_shape[-1] + for i in range(n_axes): + self.rope.build((input_shape[:-1] + (self.axes_dim[i],))) + def call(self, ids): """ Computes the positional embeddings for each axis and concatenates them. @@ -89,6 +94,10 @@ def __init__(self, hidden_dim: int): self.silu = layers.Activation("silu") self.out_layer = layers.Dense(hidden_dim, use_bias=True) + def build(self, input_shape): + self.in_layer.build(input_shape) + self.out_layer.build((input_shape[0], self.in_layer.units)) + def call(self, x: KerasTensor) -> KerasTensor: """ Applies the MLP embedding to the input tensor. @@ -161,6 +170,10 @@ def __init__(self, dim: int): self.query_norm = RMSNorm(dim) self.key_norm = RMSNorm(dim) + def build(self, input_shape): + self.query_norm.build(input_shape) + self.key_norm.build(input_shape) + def call( self, q: KerasTensor, k: KerasTensor ) -> tuple[KerasTensor, KerasTensor]: @@ -206,6 +219,12 @@ def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False): self.proj = layers.Dense(dim) self.attention = FluxRoPEAttention() + def build(self, input_shape): + self.qkv.build(input_shape) + head_dim = input_shape[-1] // self.num_heads + self.norm.build((None, input_shape[1], head_dim)) + self.proj.build((None, input_shape[1], input_shape[-1])) + def call(self, x, pe): """ Applies self-attention with RoPE embeddings. @@ -256,6 +275,9 @@ def __init__(self, dim, double): self.multiplier = 6 if double else 3 self.lin = keras.layers.Dense(self.multiplier * dim, use_bias=True) + def build(self, input_shape): + self.lin.build(input_shape) + def call(self, x): """ Generates modulation parameters from the input tensor. @@ -335,6 +357,25 @@ def __init__( ) self.attention = FluxRoPEAttention() + def build(self, input_shape): + # Build components for image and text streams + img_input_shape, txt_input_shape, vec_shape, pe_shape = input_shape + self.img_mod.build(vec_shape) + self.img_norm1.build(img_input_shape) + self.img_attn.build( + (img_input_shape[0], img_input_shape[1], self.hidden_size) + ) + self.img_norm2.build(img_input_shape) + self.img_mlp.build(img_input_shape) + + self.txt_mod.build(vec_shape) + self.txt_norm1.build(txt_input_shape) + self.txt_attn.build( + (txt_input_shape[0], txt_input_shape[1], self.hidden_size) + ) + self.txt_norm2.build(txt_input_shape) + self.txt_mlp.build(txt_input_shape) + def call(self, img, txt, vec, pe): """ Forward pass for the DoubleStreamBlock. @@ -429,6 +470,13 @@ def __init__( self.modulation = Modulation(hidden_size, double=False) self.attention = FluxRoPEAttention() + def build(self, input_shape): + x_shape, vec_shape, pe_shape = input_shape + self.modulation.build(vec_shape) + self.pre_norm.build(x_shape) + self.linear1.build(x_shape) + self.linear2.build((x_shape[0], x_shape[1], self.hidden_size)) + def call(self, x, vec, pe): """ Forward pass for the SingleStreamBlock. 
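Note for reviewers: the `build()` methods added throughout this patch follow the standard Keras 3 idiom of creating variables from shapes alone, so checkpoint weights can be assigned without pushing a dummy batch through the network. A toy sketch of the idiom (names are illustrative):

```python
import keras

class TwoLayerBlock(keras.layers.Layer):
    """Toy block mirroring the explicit-build style used in this patch."""

    def __init__(self, hidden_dim, **kwargs):
        super().__init__(**kwargs)
        self.dense1 = keras.layers.Dense(hidden_dim)
        self.dense2 = keras.layers.Dense(hidden_dim)

    def build(self, input_shape):
        # Build sublayers from shapes alone; no data needs to flow through.
        self.dense1.build(input_shape)
        self.dense2.build((input_shape[0], self.dense1.units))
        self.built = True

    def call(self, x):
        return self.dense2(self.dense1(x))

block = TwoLayerBlock(32)
block.build((None, 16))
print(len(block.weights))  # 4 variables exist before any forward pass
```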
@@ -486,6 +534,12 @@ def __init__(self, hidden_size: int, patch_size: int, out_channels: int): ] ) + def build(self, input_shape): + x_shape, vec_shape = input_shape + self.norm_final.build(x_shape) + self.linear.build((x_shape[0], x_shape[1], x_shape[2] * x_shape[3])) + self.adaLN_modulation.build(vec_shape) + def call(self, x, vec): """ Forward pass for the LastLayer. diff --git a/keras_hub/src/models/flux/flux_model.py b/keras_hub/src/models/flux/flux_model.py index ffa89958ba..4951dc936c 100644 --- a/keras_hub/src/models/flux/flux_model.py +++ b/keras_hub/src/models/flux/flux_model.py @@ -73,6 +73,57 @@ def __init__( self.timestep_embedding = TimestepEmbedding() self.guidance_embed = guidance_embed + def build(self, input_shape): + ( + img_shape, + img_ids_shape, + txt_shape, + txt_ids_shape, + timestep_shape, + y_shape, + guidance_shape, + ) = input_shape + + # Build input layers + self.img_in.build(img_shape) + self.txt_in.build(txt_shape) + + # Build timestep embedding, vector inputs + self.timestep_embedding.build(timestep_shape) + self.time_in.build((None, 256)) + self.vector_in.build(y_shape) + + if self.guidance_embed: + if guidance_shape is None: + raise ValueError( + "Guidance shape must be provided for guidance-distilled model." + ) + self.guidance_in.build( + (None, 256) + ) + + # Build positional embedder + ids_shape = ( + None, + img_ids_shape[1] + txt_ids_shape[1], + img_ids_shape[2], + ) + self.pe_embedder.build(ids_shape) + + # Build double stream blocks + for block in self.double_blocks: + block.build((img_shape, txt_shape, (None, 256), ids_shape)) + + # Build single stream blocks + concat_img_shape = (None, img_shape[1] + txt_shape[1], img_shape[2]) + for block in self.single_blocks: + block.build((concat_img_shape, (None, 256), ids_shape)) + + # Build final layer + self.final_layer.build((None, img_shape[1], self.hidden_size)) + + self.built = True + def call( self, img, From 199ba1ce3a1f3df86f69a85fc4e25df84e4195dc Mon Sep 17 00:00:00 2001 From: David Landup Date: Mon, 7 Oct 2024 13:21:31 +0900 Subject: [PATCH 39/68] update build call --- keras_hub/src/models/flux/flux_layers.py | 2 +- keras_hub/src/models/flux/flux_model.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/keras_hub/src/models/flux/flux_layers.py b/keras_hub/src/models/flux/flux_layers.py index a369d55399..37298e90c0 100644 --- a/keras_hub/src/models/flux/flux_layers.py +++ b/keras_hub/src/models/flux/flux_layers.py @@ -535,7 +535,7 @@ def __init__(self, hidden_size: int, patch_size: int, out_channels: int): ) def build(self, input_shape): - x_shape, vec_shape = input_shape + batch_size, x_shape, vec_shape = input_shape self.norm_final.build(x_shape) self.linear.build((x_shape[0], x_shape[1], x_shape[2] * x_shape[3])) self.adaLN_modulation.build(vec_shape) diff --git a/keras_hub/src/models/flux/flux_model.py b/keras_hub/src/models/flux/flux_model.py index 4951dc936c..fb80c7c591 100644 --- a/keras_hub/src/models/flux/flux_model.py +++ b/keras_hub/src/models/flux/flux_model.py @@ -98,9 +98,7 @@ def build(self, input_shape): raise ValueError( "Guidance shape must be provided for guidance-distilled model." 
) - self.guidance_in.build( - (None, 256) - ) + self.guidance_in.build((None, 256)) # Build positional embedder ids_shape = ( From a8de665e83afd5c9e6b2752a2a8f09b51e01ca6e Mon Sep 17 00:00:00 2001 From: David Landup Date: Mon, 7 Oct 2024 13:40:37 +0900 Subject: [PATCH 40/68] fix build calls --- keras_hub/src/models/flux/flux_layers.py | 8 ++++---- keras_hub/src/models/flux/flux_model.py | 21 +++++++++++++++------ 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/keras_hub/src/models/flux/flux_layers.py b/keras_hub/src/models/flux/flux_layers.py index 37298e90c0..fbb141b6fe 100644 --- a/keras_hub/src/models/flux/flux_layers.py +++ b/keras_hub/src/models/flux/flux_layers.py @@ -535,10 +535,10 @@ def __init__(self, hidden_size: int, patch_size: int, out_channels: int): ) def build(self, input_shape): - batch_size, x_shape, vec_shape = input_shape - self.norm_final.build(x_shape) - self.linear.build((x_shape[0], x_shape[1], x_shape[2] * x_shape[3])) - self.adaLN_modulation.build(vec_shape) + batch_size, seq_length, features = input_shape + + self.linear.build((None, features)) + self.built = True def call(self, x, vec): """ diff --git a/keras_hub/src/models/flux/flux_model.py b/keras_hub/src/models/flux/flux_model.py index fb80c7c591..e615e0f134 100644 --- a/keras_hub/src/models/flux/flux_model.py +++ b/keras_hub/src/models/flux/flux_model.py @@ -88,9 +88,9 @@ def build(self, input_shape): self.img_in.build(img_shape) self.txt_in.build(txt_shape) - # Build timestep embedding, vector inputs + # Build timestep embedding and vector inputs self.timestep_embedding.build(timestep_shape) - self.time_in.build((None, 256)) + self.time_in.build((None, 256)) # timestep embedding size is 256 self.vector_in.build(y_shape) if self.guidance_embed: @@ -98,7 +98,9 @@ def build(self, input_shape): raise ValueError( "Guidance shape must be provided for guidance-distilled model." ) - self.guidance_in.build((None, 256)) + self.guidance_in.build( + (None, 256) + ) # guidance embedding size is 256 # Build positional embedder ids_shape = ( @@ -113,14 +115,21 @@ def build(self, input_shape): block.build((img_shape, txt_shape, (None, 256), ids_shape)) # Build single stream blocks - concat_img_shape = (None, img_shape[1] + txt_shape[1], img_shape[2]) + concat_img_shape = ( + None, + img_shape[1] + txt_shape[1], + self.hidden_size, + ) # Concatenated shape for block in self.single_blocks: block.build((concat_img_shape, (None, 256), ids_shape)) # Build final layer - self.final_layer.build((None, img_shape[1], self.hidden_size)) + # Adjusted to match expected input shape for the final layer + self.final_layer.build( + (None, img_shape[1] + txt_shape[1], self.hidden_size) + ) # Concatenated shape - self.built = True + self.built = True # Mark as built def call( self, From efe993a7ed6c2a1c81b13432c97d58cbe281e10d Mon Sep 17 00:00:00 2001 From: David Landup Date: Mon, 7 Oct 2024 14:01:47 +0900 Subject: [PATCH 41/68] style --- keras_hub/src/models/flux/flux_layers.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/keras_hub/src/models/flux/flux_layers.py b/keras_hub/src/models/flux/flux_layers.py index fbb141b6fe..c54b36398b 100644 --- a/keras_hub/src/models/flux/flux_layers.py +++ b/keras_hub/src/models/flux/flux_layers.py @@ -90,6 +90,7 @@ def __init__(self, hidden_dim: int): hidden_dim (int): The dimensionality of the hidden layer. 
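Note for reviewers on the `style` changes in this hunk: storing constructor arguments as attributes (`self.hidden_dim`, `self.dim`, added just below) is, beyond readability, the usual prerequisite for shape math in `build()` and for config round-trips. A hedged sketch of that pattern:

```python
import keras

class Embedder(keras.layers.Layer):
    def __init__(self, hidden_dim, **kwargs):
        super().__init__(**kwargs)
        self.hidden_dim = hidden_dim  # keep the constructor arg around
        self.dense = keras.layers.Dense(hidden_dim)

    def get_config(self):
        # Stored attributes make the layer reconstructible from config.
        config = super().get_config()
        config.update({"hidden_dim": self.hidden_dim})
        return config

layer = Embedder(32)
clone = Embedder.from_config(layer.get_config())
print(clone.hidden_dim)  # 32
```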
""" super().__init__() + self.hidden_dim = hidden_dim self.in_layer = layers.Dense(hidden_dim, use_bias=True) self.silu = layers.Activation("silu") self.out_layer = layers.Dense(hidden_dim, use_bias=True) @@ -213,6 +214,7 @@ def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False): super().__init__() self.num_heads = num_heads head_dim = dim // num_heads + self.dim = dim self.qkv = layers.Dense(dim * 3, use_bias=qkv_bias) self.norm = QKNorm(head_dim) @@ -271,6 +273,7 @@ def __init__(self, dim, double): double (bool): Whether to generate two sets of modulation parameters. """ super().__init__() + self.dim = dim self.is_double = double self.multiplier = 6 if double else 3 self.lin = keras.layers.Dense(self.multiplier * dim, use_bias=True) From ff118bb0c33297fc700cc271a7c9afde16649d57 Mon Sep 17 00:00:00 2001 From: David Landup Date: Mon, 7 Oct 2024 14:11:31 +0900 Subject: [PATCH 42/68] change dummy call into build() call --- .../convert_flux_checkpoints.py | 37 ++++++++++--------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/tools/checkpoint_conversion/convert_flux_checkpoints.py b/tools/checkpoint_conversion/convert_flux_checkpoints.py index 42c52f5abd..e44813745a 100644 --- a/tools/checkpoint_conversion/convert_flux_checkpoints.py +++ b/tools/checkpoint_conversion/convert_flux_checkpoints.py @@ -219,23 +219,26 @@ def main(_): guidance_embed=False, ) - # Run dummy input to build model - img = np.random.rand(1, 96, 64).astype(np.float16) - txt = np.random.rand(1, 96, 64).astype(np.float16) - img_ids = np.random.randint(0, 100, (1, 96, 3)).astype(np.float16) - txt_ids = np.random.randint(0, 100, (1, 96, 3)).astype(np.float16) - timesteps = np.random.rand(32).astype(np.float16) - y = np.random.rand(1, 64).astype(np.float16) - guidance = np.random.rand(32).astype(np.float16) - - keras_model( - img=img, - txt=txt, - img_ids=img_ids, - txt_ids=txt_ids, - timesteps=timesteps, - y=y, - guidance=guidance, + # Define input shapes + img_shape = (1, 96, 64) + txt_shape = (1, 96, 64) + img_ids_shape = (1, 96, 3) + txt_ids_shape = (1, 96, 3) + timestep_shape = (32,) + y_shape = (1, 64) + guidance_shape = (32,) + + # Build the model + keras_model.build( + ( + img_shape, + img_ids_shape, + txt_shape, + txt_ids_shape, + timestep_shape, + y_shape, + guidance_shape, + ) ) convert_flux_weights(flux_weights, keras_model) From a3ccf6d7255e2e543bf0e5bb662a82a5e56cc86d Mon Sep 17 00:00:00 2001 From: David Landup Date: Tue, 8 Oct 2024 19:46:42 +0900 Subject: [PATCH 43/68] reference einops issue --- requirements-common.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/requirements-common.txt b/requirements-common.txt index b21dc49b1f..c935b10f23 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -19,4 +19,6 @@ sentencepiece tensorflow-datasets safetensors pillow -einops +# Will be replaced once https://github.com/keras-team/keras/issues/20332 +# is resolved +einops From f88e1e90ef2d8424d3716ef9bf937ecb0f9ed690 Mon Sep 17 00:00:00 2001 From: David Landup Date: Tue, 8 Oct 2024 20:01:02 +0900 Subject: [PATCH 44/68] address docstring comments in flux layers --- keras_hub/src/models/flux/flux_layers.py | 136 ++++++++++------------- 1 file changed, 56 insertions(+), 80 deletions(-) diff --git a/keras_hub/src/models/flux/flux_layers.py b/keras_hub/src/models/flux/flux_layers.py index c54b36398b..53d9a0afa9 100644 --- a/keras_hub/src/models/flux/flux_layers.py +++ b/keras_hub/src/models/flux/flux_layers.py @@ -1,5 +1,3 @@ -# Copyright 2024 The 
KerasHub Authors -# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -30,19 +28,14 @@ class EmbedND(keras.Model): This layer applies RoPE embeddings across multiple axes of the input tensor and concatenates the embeddings along a specified axis. - """ - def __init__(self, dim: int, theta: int, axes_dim: list[int]): - """ - Initializes the EmbedND layer. + Args: + theta: int. Rotational angle parameter for RoPE. + axes_dim: list[int]. Dimensionality for each axis of the input tensor. + """ - Args: - dim (int): Dimensionality of the embedding. - theta (int): Rotational angle parameter for RoPE. - axes_dim (list[int]): Dimensionality for each axis of the input tensor. - """ + def __init__(self, theta: int, axes_dim: list[int]): super().__init__() - self.dim = dim self.theta = theta self.axes_dim = axes_dim self.rope = RotaryPositionalEmbedding() @@ -57,7 +50,7 @@ def call(self, ids): Computes the positional embeddings for each axis and concatenates them. Args: - ids (KerasTensor): Input tensor of shape (..., num_axes). + ids: KerasTensor. Input tensor of shape (..., num_axes). Returns: KerasTensor: Positional embeddings of shape (..., concatenated_dim, 1, ...). @@ -80,15 +73,12 @@ class MLPEmbedder(keras.Model): This model applies a linear transformation followed by the SiLU activation function and another linear transformation to the input tensor. + + Args: + hidden_dim: int. The dimensionality of the hidden layer. """ def __init__(self, hidden_dim: int): - """ - Initializes the MLPEmbedder. - - Args: - hidden_dim (int): The dimensionality of the hidden layer. - """ super().__init__() self.hidden_dim = hidden_dim self.in_layer = layers.Dense(hidden_dim, use_bias=True) @@ -99,12 +89,12 @@ def build(self, input_shape): self.in_layer.build(input_shape) self.out_layer.build((input_shape[0], self.in_layer.units)) - def call(self, x: KerasTensor) -> KerasTensor: + def call(self, x): """ Applies the MLP embedding to the input tensor. Args: - x (KerasTensor): Input tensor of shape (batch_size, in_dim). + x: KerasTensor. Input tensor of shape (batch_size, in_dim). Returns: KerasTensor: Output tensor of shape (batch_size, hidden_dim) after applying @@ -122,26 +112,23 @@ class RMSNorm(keras.layers.Layer): This layer normalizes the input tensor based on its RMS value and applies a learned scaling factor. + + Args: + dim: int. The dimensionality of the input tensor. """ def __init__(self, dim: int): - """ - Initializes the RMSNorm layer. - - Args: - dim (int): The dimensionality of the input tensor. - """ super().__init__() self.scale = self.add_weight( name="scale", shape=(dim,), initializer="ones" ) - def call(self, x: KerasTensor) -> KerasTensor: + def call(self, x): """ Applies RMS normalization to the input tensor. Args: - x (KerasTensor): Input tensor of shape (batch_size, dim). + x: KerasTensor. Input tensor of shape (batch_size, dim). Returns: KerasTensor: The RMS-normalized tensor of the same shape (batch_size, dim), @@ -158,15 +145,12 @@ class QKNorm(keras.layers.Layer): This layer normalizes the input query and key tensors using separate RMSNorm layers for each. + + Args: + dim: int. The dimensionality of the input query and key tensors. """ def __init__(self, dim: int): - """ - Initializes the QKNorm layer. - - Args: - dim (int): The dimensionality of the input query and key tensors. 
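Note for reviewers: a runnable sketch of the query/key normalization the `QKNorm` docstring above describes, written with plain `keras.ops` and omitting the learned scale (the epsilon and shapes are assumptions):

```python
import numpy as np
from keras import ops

def rms_norm(x, eps=1e-6):
    # Normalize by the root mean square over the channel axis.
    return x * ops.rsqrt(ops.mean(ops.square(x), axis=-1, keepdims=True) + eps)

q = ops.convert_to_tensor(np.random.rand(2, 4, 8).astype("float32"))
k = ops.convert_to_tensor(np.random.rand(2, 4, 8).astype("float32"))
q, k = rms_norm(q), rms_norm(k)
print(ops.mean(ops.square(q), axis=-1)[0, 0])  # ~1.0 per position afterwards
```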
- """ super().__init__() self.query_norm = RMSNorm(dim) self.key_norm = RMSNorm(dim) @@ -175,15 +159,13 @@ def build(self, input_shape): self.query_norm.build(input_shape) self.key_norm.build(input_shape) - def call( - self, q: KerasTensor, k: KerasTensor - ) -> tuple[KerasTensor, KerasTensor]: + def call(self, q, k): """ Applies RMS normalization to the query and key tensors. Args: - q (KerasTensor): The query tensor of shape (batch_size, dim). - k (KerasTensor): The key tensor of shape (batch_size, dim). + q: KerasTensor. The query tensor of shape (batch_size, dim). + k: KerasTensor. The key tensor of shape (batch_size, dim). Returns: tuple[KerasTensor, KerasTensor]: A tuple containing the normalized query and key tensors. @@ -199,18 +181,15 @@ class SelfAttention(keras.Model): This layer performs self-attention over the input sequence and applies RMS normalization to the query and key tensors before computing the attention scores. + + Args: + dim: int. Dimensionality of the input tensor. + num_heads: int. Number of attention heads. Default is 8. + qkv_bias: bool. Whether to use bias in the query, key, value projection layers. + Default is False. """ def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False): - """ - Initializes the SelfAttention layer. - - Args: - dim (int): Dimensionality of the input tensor. - num_heads (int): Number of attention heads. Default is 8. - qkv_bias (bool): Whether to use bias in the query, key, value projection layers. - Default is False. - """ super().__init__() self.num_heads = num_heads head_dim = dim // num_heads @@ -232,8 +211,8 @@ def call(self, x, pe): Applies self-attention with RoPE embeddings. Args: - x (KerasTensor): Input tensor of shape (batch_size, seq_len, dim). - pe (KerasTensor): Positional encoding tensor for RoPE. + x: KerasTensor. Input tensor of shape (batch_size, seq_len, dim). + pe: KerasTensor. Positional encoding tensor for RoPE. Returns: KerasTensor: Output tensor after self-attention and projection. @@ -262,16 +241,13 @@ class Modulation(keras.Model): This layer applies a SiLU activation to the input tensor followed by a linear transformation to generate modulation parameters. It can optionally generate two sets of modulation parameters. + + Args: + dim: int. Dimensionality of the modulation output. + double: bool. Whether to generate two sets of modulation parameters. """ def __init__(self, dim, double): - """ - Initializes the Modulation layer. - - Args: - dim (int): Dimensionality of the modulation output. - double (bool): Whether to generate two sets of modulation parameters. - """ super().__init__() self.dim = dim self.is_double = double @@ -286,7 +262,7 @@ def call(self, x): Generates modulation parameters from the input tensor. Args: - x (KerasTensor): Input tensor. + x: KerasTensor. Input tensor. Returns: tuple[ModulationOut, ModulationOut | None]: A tuple containing the shift, @@ -310,10 +286,10 @@ class DoubleStreamBlock(keras.Model): self-attention and MLP layers, with modulation. Args: - hidden_size (int): The hidden dimension size for the model. - num_heads (int): The number of attention heads. - mlp_ratio (float): The ratio of the MLP hidden dimension to the hidden size. - qkv_bias (bool, optional): Whether to include bias in QKV projection. Default is False. + hidden_size: int. The hidden dimension size for the model. + num_heads: int. The number of attention heads. + mlp_ratio: float. The ratio of the MLP hidden dimension to the hidden size. + qkv_bias: bool, optional. 
Whether to include bias in QKV projection. Default is False. """ def __init__( @@ -384,10 +360,10 @@ def call(self, img, txt, vec, pe): Forward pass for the DoubleStreamBlock. Args: - img (KerasTensor): Input image tensor. - txt (KerasTensor): Input text tensor. - vec (KerasTensor): Modulation vector. - pe (KerasTensor): Positional encoding tensor. + img: KerasTensor. Input image tensor. + txt: KerasTensor. Input text tensor. + vec: KerasTensor. Modulation vector. + pe: KerasTensor. Positional encoding tensor. Returns: Tuple[KerasTensor, KerasTensor]: The modified image and text tensors. @@ -441,10 +417,10 @@ class SingleStreamBlock(keras.Model): https://arxiv.org/abs/2302.05442 and adapted modulation interface. Args: - hidden_size (int): The hidden dimension size for the model. - num_heads (int): The number of attention heads. - mlp_ratio (float, optional): The ratio of the MLP hidden dimension to the hidden size. Default is 4.0. - qk_scale (float, optional): Scaling factor for the query-key product. Default is None. + hidden_size: int. The hidden dimension size for the model. + num_heads: int. The number of attention heads. + mlp_ratio: float, optional. The ratio of the MLP hidden dimension to the hidden size. Default is 4.0. + qk_scale: float, optional. Scaling factor for the query-key product. Default is None. """ def __init__( @@ -485,9 +461,9 @@ def call(self, x, vec, pe): Forward pass for the SingleStreamBlock. Args: - x (KerasTensor): Input tensor. - vec (KerasTensor): Modulation vector. - pe (KerasTensor): Positional encoding tensor. + x: KerasTensor. Input tensor. + vec: KerasTensor. Modulation vector. + pe: KerasTensor. Positional encoding tensor. Returns: KerasTensor: The modified input tensor after processing. @@ -519,9 +495,9 @@ class LastLayer(keras.Model): Final layer for processing output tensors with adaptive normalization. Args: - hidden_size (int): The hidden dimension size for the model. - patch_size (int): The size of each patch. - out_channels (int): The number of output channels. + hidden_size: int. The hidden dimension size for the model. + patch_size: int. The size of each patch. + out_channels: int. The number of output channels. """ def __init__(self, hidden_size: int, patch_size: int, out_channels: int): @@ -538,7 +514,7 @@ def __init__(self, hidden_size: int, patch_size: int, out_channels: int): ) def build(self, input_shape): - batch_size, seq_length, features = input_shape + _, _, features = input_shape self.linear.build((None, features)) self.built = True @@ -548,8 +524,8 @@ def call(self, x, vec): Forward pass for the LastLayer. Args: - x (KerasTensor): Input tensor. - vec (KerasTensor): Modulation vector. + x: KerasTensor. Input tensor. + vec: KerasTensor. Modulation vector. Returns: KerasTensor: The output tensor after final processing. From 6e2c320f0c5ddee4b45ec1fcaf38271a80f9d433 Mon Sep 17 00:00:00 2001 From: David Landup Date: Tue, 8 Oct 2024 20:03:45 +0900 Subject: [PATCH 45/68] address docstring comments in flux maths --- keras_hub/src/models/flux/flux_maths.py | 56 ++++++++++++------------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/keras_hub/src/models/flux/flux_maths.py b/keras_hub/src/models/flux/flux_maths.py index 5796a670f5..340a7f454f 100644 --- a/keras_hub/src/models/flux/flux_maths.py +++ b/keras_hub/src/models/flux/flux_maths.py @@ -19,13 +19,12 @@ class TimestepEmbedding(keras.layers.Layer): """ Creates sinusoidal timestep embeddings. 
- - Call Args: - t (KerasTensor): A 1-D tensor of shape (N,), representing N indices, one per batch element. - These values may be fractional. - dim (int): The dimension of the output. - max_period (int, optional): Controls the minimum frequency of the embeddings. Defaults to 10000. - time_factor (float, optional): A scaling factor applied to `t`. Defaults to 1000.0. + Args: + t: KerasTensor of shape (N,), representing N indices, one per batch element. + These values may be fractional. + dim: int. The dimension of the output. + max_period: int, optional. Controls the minimum frequency of the embeddings. Defaults to 10000. + time_factor: float, optional. A scaling factor applied to `t`. Defaults to 1000.0. Returns: KerasTensor: A tensor of shape (N, D) representing the positional embeddings, @@ -55,11 +54,10 @@ class RotaryPositionalEmbedding(keras.layers.Layer): """ Applies Rotary Positional Embedding (RoPE) to the input tensor. - - Call Args: - pos (KerasTensor): The positional tensor with shape (..., n, d). - dim (int): The embedding dimension, should be even. - theta (int): The base frequency. + Args: + pos: KerasTensor. The positional tensor with shape (..., n, d). + dim: int. The embedding dimension, should be even. + theta: int. The base frequency. Returns: KerasTensor: The tensor with applied RoPE transformation. @@ -80,10 +78,10 @@ class ApplyRoPE(keras.layers.Layer): """ Applies the RoPE transformation to the query and key tensors. - Call Args: - xq (KerasTensor): The query tensor of shape (..., L, D). - xk (KerasTensor): The key tensor of shape (..., L, D). - freqs_cis (KerasTensor): The frequency complex numbers tensor with shape (..., 2). + Args: + xq: KerasTensor. The query tensor of shape (..., L, D). + xk: KerasTensor. The key tensor of shape (..., L, D). + freqs_cis: KerasTensor. The frequency complex numbers tensor with shape (..., 2). Returns: tuple[KerasTensor, KerasTensor]: The transformed query and key tensors. @@ -110,14 +108,14 @@ class FluxRoPEAttention(keras.layers.Layer): Computes the attention mechanism with the RoPE transformation applied to the query and key tensors. Args: - dropout_p (float, optional): Dropout probability. Defaults to 0.0. - is_causal (bool, optional): If True, applies causal masking. Defaults to False. + dropout_p: float, optional. Dropout probability. Defaults to 0.0. + is_causal: bool, optional. If True, applies causal masking. Defaults to False. Call Args: - q (KerasTensor): Query tensor of shape (..., L, D). - k (KerasTensor): Key tensor of shape (..., S, D). - v (KerasTensor): Value tensor of shape (..., S, D). - pe (KerasTensor): Positional encoding tensor. + q: KerasTensor. Query tensor of shape (..., L, D). + k: KerasTensor. Key tensor of shape (..., S, D). + v: KerasTensor. Value tensor of shape (..., S, D). + pe: KerasTensor. Positional encoding tensor. Returns: KerasTensor: The resulting tensor from the attention mechanism. @@ -156,13 +154,13 @@ def scaled_dot_product_attention( Computes the scaled dot-product attention. Args: - query (KerasTensor): Query tensor of shape (..., L, D). - key (KerasTensor): Key tensor of shape (..., S, D). - value (KerasTensor): Value tensor of shape (..., S, D). - attn_mask (KerasTensor, optional): Attention mask tensor. Defaults to None. - dropout_p (float, optional): Dropout probability. Defaults to 0.0. - is_causal (bool, optional): If True, applies causal masking. Defaults to False. - scale (float, optional): Scale factor for attention. Defaults to None. + query: KerasTensor. 
Query tensor of shape (..., L, D). + key: KerasTensor. Key tensor of shape (..., S, D). + value: KerasTensor. Value tensor of shape (..., S, D). + attn_mask: KerasTensor, optional. Attention mask tensor. Defaults to None. + dropout_p: float, optional. Dropout probability. Defaults to 0.0. + is_causal: bool, optional. If True, applies causal masking. Defaults to False. + scale: float, optional. Scale factor for attention. Defaults to None. Returns: KerasTensor: The output tensor from the attention mechanism. From b407ffc3d9cb1d497ef0054bf68275dd989ea810 Mon Sep 17 00:00:00 2001 From: David Landup Date: Tue, 8 Oct 2024 20:03:50 +0900 Subject: [PATCH 46/68] remove numpy --- tools/checkpoint_conversion/convert_flux_checkpoints.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/checkpoint_conversion/convert_flux_checkpoints.py b/tools/checkpoint_conversion/convert_flux_checkpoints.py index e44813745a..aa5074360c 100644 --- a/tools/checkpoint_conversion/convert_flux_checkpoints.py +++ b/tools/checkpoint_conversion/convert_flux_checkpoints.py @@ -16,7 +16,6 @@ import os import keras -import numpy as np from safetensors import safe_open from keras_hub.src.models.flux.flux_model import Flux From ac430817e650e57b4d04ef7aa6aed079643fc225 Mon Sep 17 00:00:00 2001 From: David Landup Date: Tue, 8 Oct 2024 20:06:43 +0900 Subject: [PATCH 47/68] add docstrings for flux model --- keras_hub/src/models/flux/flux_model.py | 44 ++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/keras_hub/src/models/flux/flux_model.py b/keras_hub/src/models/flux/flux_model.py index e615e0f134..53e9d5619c 100644 --- a/keras_hub/src/models/flux/flux_model.py +++ b/keras_hub/src/models/flux/flux_model.py @@ -10,7 +10,29 @@ class Flux(keras.Model): """ - Transformer model for flow matching on sequences. + Transformer model for flow matching on sequences, + utilizing a double-stream and single-stream block structure. + + The model processes image and text data with associated positional and timestep + embeddings, and optionally applies guidance embedding. Double-stream blocks + handle separate image and text streams, while single-stream blocks combine + these streams. Ported from: https://github.com/black-forest-labs/flux + + Args: + in_channels: int. The number of input channels. + hidden_size: int. The hidden size of the transformer, must be divisible by `num_heads`. + mlp_ratio: float. The ratio of the MLP dimension to the hidden size. + num_heads: int. The number of attention heads. + depth: int. The number of double-stream blocks. + depth_single_blocks: int. The number of single-stream blocks. + axes_dim: list[int]. A list of dimensions for the positional embedding axes. + theta: int. The base frequency for positional embeddings. + qkv_bias: bool. Whether to apply bias to the query, key, and value projections. + guidance_embed: bool. If True, applies guidance embedding in the model. + + Raises: + ValueError: If `hidden_size` is not divisible by `num_heads`, or if `sum(axes_dim)` is not equal to the + positional embedding dimension. """ def __init__( @@ -141,6 +163,26 @@ def call( y, guidance=None, ): + """ + Forward pass through the Flux model. + + Args: + img: KerasTensor. Image input tensor of shape (N, L, D) where N is the batch size, + L is the sequence length, and D is the feature dimension. + img_ids: KerasTensor. Image ID input tensor of shape (N, L, D) corresponding + to the image sequences. + txt: KerasTensor. Text input tensor of shape (N, L, D). + txt_ids: KerasTensor. 
Text ID input tensor of shape (N, L, D) corresponding + to the text sequences. + timesteps: KerasTensor. Timestep tensor used to compute positional embeddings. + y: KerasTensor. Additional vector input, such as target values. + guidance: KerasTensor, optional. Guidance input tensor used + in guidance-embedded models. + + Returns: + KerasTensor: The output tensor of the model, processed through + double and single stream blocks and the final layer. + """ if img.ndim != 3 or txt.ndim != 3: raise ValueError( "Input img and txt tensors must have 3 dimensions." From 4b585a07a2b23a297460804e83aec78f645780ee Mon Sep 17 00:00:00 2001 From: David Landup Date: Tue, 8 Oct 2024 20:08:47 +0900 Subject: [PATCH 48/68] qkv bias -> use_bias --- keras_hub/src/models/flux/flux_layers.py | 14 +++++++------- keras_hub/src/models/flux/flux_model.py | 6 +++--- .../convert_flux_checkpoints.py | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/keras_hub/src/models/flux/flux_layers.py b/keras_hub/src/models/flux/flux_layers.py index 53d9a0afa9..fc088f673a 100644 --- a/keras_hub/src/models/flux/flux_layers.py +++ b/keras_hub/src/models/flux/flux_layers.py @@ -185,17 +185,17 @@ class SelfAttention(keras.Model): Args: dim: int. Dimensionality of the input tensor. num_heads: int. Number of attention heads. Default is 8. - qkv_bias: bool. Whether to use bias in the query, key, value projection layers. + use_bias: bool. Whether to use bias in the query, key, value projection layers. Default is False. """ - def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False): + def __init__(self, dim: int, num_heads: int = 8, use_bias: bool = False): super().__init__() self.num_heads = num_heads head_dim = dim // num_heads self.dim = dim - self.qkv = layers.Dense(dim * 3, use_bias=qkv_bias) + self.qkv = layers.Dense(dim * 3, use_bias=use_bias) self.norm = QKNorm(head_dim) self.proj = layers.Dense(dim) self.attention = FluxRoPEAttention() @@ -289,7 +289,7 @@ class DoubleStreamBlock(keras.Model): hidden_size: int. The hidden dimension size for the model. num_heads: int. The number of attention heads. mlp_ratio: float. The ratio of the MLP hidden dimension to the hidden size. - qkv_bias: bool, optional. Whether to include bias in QKV projection. Default is False. + use_bias: bool, optional. Whether to include bias in QKV projection. Default is False. 
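Note for reviewers: the rename is mechanical, but it aligns the flag with `keras.layers.Dense`'s own argument name, to which it is forwarded unchanged for the fused QKV projection. A quick shape check with toy sizes:

```python
import keras

dim = 64
qkv = keras.layers.Dense(dim * 3, use_bias=True)
qkv.build((None, 10, dim))
print([tuple(w.shape) for w in qkv.weights])  # [(64, 192), (192,)]
```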
""" def __init__( @@ -297,7 +297,7 @@ def __init__( hidden_size: int, num_heads: int, mlp_ratio: float, - qkv_bias: bool = False, + use_bias: bool = False, ): super().__init__() @@ -308,7 +308,7 @@ def __init__( self.img_mod = Modulation(hidden_size, double=True) self.img_norm1 = keras.layers.LayerNormalization(epsilon=1e-6) self.img_attn = SelfAttention( - dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias + dim=hidden_size, num_heads=num_heads, use_bias=use_bias ) self.img_norm2 = keras.layers.LayerNormalization(epsilon=1e-6) @@ -323,7 +323,7 @@ def __init__( self.txt_mod = Modulation(hidden_size, double=True) self.txt_norm1 = keras.layers.LayerNormalization(epsilon=1e-6) self.txt_attn = SelfAttention( - dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias + dim=hidden_size, num_heads=num_heads, use_bias=use_bias ) self.txt_norm2 = keras.layers.LayerNormalization(epsilon=1e-6) diff --git a/keras_hub/src/models/flux/flux_model.py b/keras_hub/src/models/flux/flux_model.py index 53e9d5619c..8dc1ce12dc 100644 --- a/keras_hub/src/models/flux/flux_model.py +++ b/keras_hub/src/models/flux/flux_model.py @@ -27,7 +27,7 @@ class Flux(keras.Model): depth_single_blocks: int. The number of single-stream blocks. axes_dim: list[int]. A list of dimensions for the positional embedding axes. theta: int. The base frequency for positional embeddings. - qkv_bias: bool. Whether to apply bias to the query, key, and value projections. + use_bias: bool. Whether to apply bias to the query, key, and value projections. guidance_embed: bool. If True, applies guidance embedding in the model. Raises: @@ -45,7 +45,7 @@ def __init__( depth_single_blocks: int, axes_dim: list[int], theta: int, - qkv_bias: bool, + use_bias: bool, guidance_embed: bool, ): super().__init__() @@ -79,7 +79,7 @@ def __init__( self.hidden_size, self.num_heads, mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, + use_bias=use_bias, ) for _ in range(depth) ] diff --git a/tools/checkpoint_conversion/convert_flux_checkpoints.py b/tools/checkpoint_conversion/convert_flux_checkpoints.py index aa5074360c..73edb0af91 100644 --- a/tools/checkpoint_conversion/convert_flux_checkpoints.py +++ b/tools/checkpoint_conversion/convert_flux_checkpoints.py @@ -214,7 +214,7 @@ def main(_): depth_single_blocks=38, axes_dim=[16, 56, 56], theta=10_000, - qkv_bias=True, + use_bias=True, guidance_embed=False, ) From a2facb27dd4e1363672ab5ac648087175992b05b Mon Sep 17 00:00:00 2001 From: David Landup Date: Tue, 8 Oct 2024 20:11:36 +0900 Subject: [PATCH 49/68] docstring updates --- keras_hub/src/models/flux/flux_layers.py | 18 +++--------------- keras_hub/src/models/flux/flux_maths.py | 20 ++++---------------- keras_hub/src/models/flux/flux_model.py | 6 +++--- 3 files changed, 10 insertions(+), 34 deletions(-) diff --git a/keras_hub/src/models/flux/flux_layers.py b/keras_hub/src/models/flux/flux_layers.py index fc088f673a..8be56657f0 100644 --- a/keras_hub/src/models/flux/flux_layers.py +++ b/keras_hub/src/models/flux/flux_layers.py @@ -1,15 +1,3 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - from dataclasses import dataclass import keras @@ -497,14 +485,14 @@ class LastLayer(keras.Model): Args: hidden_size: int. The hidden dimension size for the model. patch_size: int. The size of each patch. - out_channels: int. The number of output channels. + output_channels: int. The number of output channels. """ - def __init__(self, hidden_size: int, patch_size: int, out_channels: int): + def __init__(self, hidden_size: int, patch_size: int, output_channels: int): super().__init__() self.norm_final = keras.layers.LayerNormalization(epsilon=1e-6) self.linear = keras.layers.Dense( - patch_size * patch_size * out_channels, use_bias=True + patch_size * patch_size * output_channels, use_bias=True ) self.adaLN_modulation = keras.Sequential( [ diff --git a/keras_hub/src/models/flux/flux_maths.py b/keras_hub/src/models/flux/flux_maths.py index 340a7f454f..e6f0647324 100644 --- a/keras_hub/src/models/flux/flux_maths.py +++ b/keras_hub/src/models/flux/flux_maths.py @@ -1,15 +1,3 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - import keras from einops import rearrange from keras import ops @@ -19,7 +7,7 @@ class TimestepEmbedding(keras.layers.Layer): """ Creates sinusoidal timestep embeddings. - Args: + Call arguments: t: KerasTensor of shape (N,), representing N indices, one per batch element. These values may be fractional. dim: int. The dimension of the output. @@ -54,7 +42,7 @@ class RotaryPositionalEmbedding(keras.layers.Layer): """ Applies Rotary Positional Embedding (RoPE) to the input tensor. - Args: + Call arguments: pos: KerasTensor. The positional tensor with shape (..., n, d). dim: int. The embedding dimension, should be even. theta: int. The base frequency. @@ -78,7 +66,7 @@ class ApplyRoPE(keras.layers.Layer): """ Applies the RoPE transformation to the query and key tensors. - Args: + Call arguments: xq: KerasTensor. The query tensor of shape (..., L, D). xk: KerasTensor. The key tensor of shape (..., L, D). freqs_cis: KerasTensor. The frequency complex numbers tensor with shape (..., 2). @@ -111,7 +99,7 @@ class FluxRoPEAttention(keras.layers.Layer): dropout_p: float, optional. Dropout probability. Defaults to 0.0. is_causal: bool, optional. If True, applies causal masking. Defaults to False. - Call Args: + Call arguments: q: KerasTensor. Query tensor of shape (..., L, D). k: KerasTensor. Key tensor of shape (..., S, D). v: KerasTensor. Value tensor of shape (..., S, D). 
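Note for reviewers: the docstrings above describe the RoPE transform only in prose, so here is a compact sketch of the rotation in plain `keras.ops`. It is an intuition aid under assumed conventions (in particular the even/odd channel pairing), not the module's exact implementation:

```python
import numpy as np
from keras import ops

def rope_tables(pos, dim, theta=10000):
    # Per-pair frequencies theta^(-2i/dim) for i = 0 .. dim/2 - 1.
    scale = ops.arange(0, dim, 2, dtype="float32") / dim
    omega = 1.0 / ops.power(float(theta), scale)
    angles = pos[..., None] * omega  # (..., n, dim // 2)
    return ops.cos(angles), ops.sin(angles)

def apply_rope(x, cos, sin):
    # Rotate each (even, odd) channel pair by a position-dependent angle.
    x0, x1 = x[..., 0::2], x[..., 1::2]
    return ops.concatenate([x0 * cos - x1 * sin, x0 * sin + x1 * cos], axis=-1)

pos = ops.convert_to_tensor(np.arange(8, dtype="float32"))[None, :]  # (1, 8)
cos, sin = rope_tables(pos, dim=16)
q = ops.convert_to_tensor(np.random.rand(1, 8, 16).astype("float32"))
print(apply_rope(q, cos, sin).shape)  # (1, 8, 16)
```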
diff --git a/keras_hub/src/models/flux/flux_model.py b/keras_hub/src/models/flux/flux_model.py index 8dc1ce12dc..94280d39a8 100644 --- a/keras_hub/src/models/flux/flux_model.py +++ b/keras_hub/src/models/flux/flux_model.py @@ -51,7 +51,7 @@ def __init__( super().__init__() self.in_channels = in_channels - self.out_channels = self.in_channels + self.output_channels = self.in_channels if hidden_size % num_heads != 0: raise ValueError( f"Hidden size {hidden_size} must be divisible by num_heads {num_heads}" @@ -91,7 +91,7 @@ def __init__( for _ in range(depth_single_blocks) ] - self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels) + self.final_layer = LastLayer(self.hidden_size, 1, self.output_channels) self.timestep_embedding = TimestepEmbedding() self.guidance_embed = guidance_embed @@ -215,5 +215,5 @@ def call( img = self.final_layer( img, vec - ) # (N, T, patch_size ** 2 * out_channels) + ) # (N, T, patch_size ** 2 * output_channels) return img From bd2ebe212152d3bfef0189c5bfc88331403a7213 Mon Sep 17 00:00:00 2001 From: David Landup Date: Tue, 8 Oct 2024 20:12:42 +0900 Subject: [PATCH 50/68] remove type hints --- keras_hub/src/models/flux/flux_model.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/keras_hub/src/models/flux/flux_model.py b/keras_hub/src/models/flux/flux_model.py index 94280d39a8..09e26b89f6 100644 --- a/keras_hub/src/models/flux/flux_model.py +++ b/keras_hub/src/models/flux/flux_model.py @@ -37,16 +37,16 @@ class Flux(keras.Model): def __init__( self, - in_channels: int, - hidden_size: int, - mlp_ratio: float, - num_heads: int, - depth: int, - depth_single_blocks: int, - axes_dim: list[int], - theta: int, - use_bias: bool, - guidance_embed: bool, + in_channels, + hidden_size, + mlp_ratio, + num_heads, + depth, + depth_single_blocks, + axes_dim, + theta, + use_bias, + guidance_embed, ): super().__init__() From f48bbd2fd0f685ca6c39c9def26b7e048f148dd0 Mon Sep 17 00:00:00 2001 From: David Landup Date: Tue, 8 Oct 2024 20:19:27 +0900 Subject: [PATCH 51/68] all img->image, txt->text --- keras_hub/src/models/flux/flux_layers.py | 117 ++++++++++++----------- keras_hub/src/models/flux/flux_model.py | 112 ++++++++++++---------- 2 files changed, 122 insertions(+), 107 deletions(-) diff --git a/keras_hub/src/models/flux/flux_layers.py b/keras_hub/src/models/flux/flux_layers.py index 8be56657f0..d3bcfd4fd8 100644 --- a/keras_hub/src/models/flux/flux_layers.py +++ b/keras_hub/src/models/flux/flux_layers.py @@ -293,14 +293,14 @@ def __init__( self.num_heads = num_heads self.hidden_size = hidden_size - self.img_mod = Modulation(hidden_size, double=True) - self.img_norm1 = keras.layers.LayerNormalization(epsilon=1e-6) - self.img_attn = SelfAttention( + self.image_mod = Modulation(hidden_size, double=True) + self.image_norm1 = keras.layers.LayerNormalization(epsilon=1e-6) + self.image_attn = SelfAttention( dim=hidden_size, num_heads=num_heads, use_bias=use_bias ) - self.img_norm2 = keras.layers.LayerNormalization(epsilon=1e-6) - self.img_mlp = keras.Sequential( + self.image_norm2 = keras.layers.LayerNormalization(epsilon=1e-6) + self.image_mlp = keras.Sequential( [ keras.layers.Dense(mlp_hidden_dim, use_bias=True), keras.layers.Activation("gelu"), @@ -308,14 +308,14 @@ def __init__( ] ) - self.txt_mod = Modulation(hidden_size, double=True) - self.txt_norm1 = keras.layers.LayerNormalization(epsilon=1e-6) - self.txt_attn = SelfAttention( + self.text_mod = Modulation(hidden_size, double=True) + self.text_norm1 = 
keras.layers.LayerNormalization(epsilon=1e-6) + self.text_attn = SelfAttention( dim=hidden_size, num_heads=num_heads, use_bias=use_bias ) - self.txt_norm2 = keras.layers.LayerNormalization(epsilon=1e-6) - self.txt_mlp = keras.Sequential( + self.text_norm2 = keras.layers.LayerNormalization(epsilon=1e-6) + self.text_mlp = keras.Sequential( [ keras.layers.Dense(mlp_hidden_dim, use_bias=True), keras.layers.Activation("gelu"), @@ -326,77 +326,84 @@ def __init__( def build(self, input_shape): # Build components for image and text streams - img_input_shape, txt_input_shape, vec_shape, pe_shape = input_shape - self.img_mod.build(vec_shape) - self.img_norm1.build(img_input_shape) - self.img_attn.build( - (img_input_shape[0], img_input_shape[1], self.hidden_size) + image_input_shape, text_input_shape, vec_shape, pe_shape = input_shape + self.image_mod.build(vec_shape) + self.image_norm1.build(image_input_shape) + self.image_attn.build( + (image_input_shape[0], image_input_shape[1], self.hidden_size) ) - self.img_norm2.build(img_input_shape) - self.img_mlp.build(img_input_shape) + self.image_norm2.build(image_input_shape) + self.image_mlp.build(image_input_shape) - self.txt_mod.build(vec_shape) - self.txt_norm1.build(txt_input_shape) - self.txt_attn.build( - (txt_input_shape[0], txt_input_shape[1], self.hidden_size) + self.text_mod.build(vec_shape) + self.text_norm1.build(text_input_shape) + self.text_attn.build( + (text_input_shape[0], text_input_shape[1], self.hidden_size) ) - self.txt_norm2.build(txt_input_shape) - self.txt_mlp.build(txt_input_shape) + self.text_norm2.build(text_input_shape) + self.text_mlp.build(text_input_shape) - def call(self, img, txt, vec, pe): + def call(self, image, text, vec, pe): """ Forward pass for the DoubleStreamBlock. Args: - img: KerasTensor. Input image tensor. - txt: KerasTensor. Input text tensor. + image: KerasTensor. Input image tensor. + text: KerasTensor. Input text tensor. vec: KerasTensor. Modulation vector. pe: KerasTensor. Positional encoding tensor. Returns: Tuple[KerasTensor, KerasTensor]: The modified image and text tensors. 
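Note for reviewers: a shape-level sketch of the joint attention this docstring describes, with toy sizes. The real block additionally applies modulation, QKNorm, and RoPE before the attention call:

```python
import numpy as np
from keras import ops

B, H, L_text, L_image, D = 1, 2, 3, 5, 4
text_q = ops.convert_to_tensor(np.random.rand(B, H, L_text, D).astype("float32"))
image_q = ops.convert_to_tensor(np.random.rand(B, H, L_image, D).astype("float32"))

# Both modalities share a single attention call over the concatenated sequence.
q = ops.concatenate((text_q, image_q), axis=2)  # (1, 2, 8, 4)

# Stand-in for the attention output, already merged back to (B, L, H * D) ...
attn = ops.convert_to_tensor(
    np.random.rand(B, L_text + L_image, H * D).astype("float32")
)

# ... which call() then splits again at the text sequence length.
text_attn, image_attn = attn[:, :L_text], attn[:, L_text:]
print(text_attn.shape, image_attn.shape)  # (1, 3, 8) (1, 5, 8)
```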
""" - img_mod1, img_mod2 = self.img_mod(vec) - txt_mod1, txt_mod2 = self.txt_mod(vec) + image_mod1, image_mod2 = self.image_mod(vec) + text_mod1, text_mod2 = self.text_mod(vec) # prepare image for attention - img_modulated = self.img_norm1(img) - img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift - img_qkv = self.img_attn.qkv(img_modulated) - img_q, img_k, img_v = rearrange( - img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads + image_modulated = self.image_norm1(image) + image_modulated = ( + 1 + image_mod1.scale + ) * image_modulated + image_mod1.shift + image_qkv = self.image_attn.qkv(image_modulated) + image_q, image_k, image_v = rearrange( + image_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads ) - img_q, img_k = self.img_attn.norm(img_q, img_k) - - # prepare txt for attention - txt_modulated = self.txt_norm1(txt) - txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift - txt_qkv = self.txt_attn.qkv(txt_modulated) - txt_q, txt_k, txt_v = rearrange( - txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads + image_q, image_k = self.image_attn.norm(image_q, image_k) + + # prepare text for attention + text_modulated = self.text_norm1(text) + text_modulated = ( + 1 + text_mod1.scale + ) * text_modulated + text_mod1.shift + text_qkv = self.text_attn.qkv(text_modulated) + text_q, text_k, text_v = rearrange( + text_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads ) - txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k) + text_q, text_k = self.text_attn.norm(text_q, text_k) # run actual attention - q = keras.ops.concatenate((txt_q, img_q), axis=2) - k = keras.ops.concatenate((txt_k, img_k), axis=2) - v = keras.ops.concatenate((txt_v, img_v), axis=2) + q = keras.ops.concatenate((text_q, image_q), axis=2) + k = keras.ops.concatenate((text_k, image_k), axis=2) + v = keras.ops.concatenate((text_v, image_v), axis=2) attn = self.attention(q=q, k=k, v=v, pe=pe) - txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :] + text_attn, image_attn = ( + attn[:, : text.shape[1]], + attn[:, text.shape[1] :], + ) - # calculate the img bloks - img = img + img_mod1.gate * self.img_attn.proj(img_attn) - img = img + img_mod2.gate * self.img_mlp( - (1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift + # calculate the image bloks + image = image + image_mod1.gate * self.image_attn.proj(image_attn) + image = image + image_mod2.gate * self.image_mlp( + (1 + image_mod2.scale) * self.image_norm2(image) + image_mod2.shift ) - # calculate the txt bloks - txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn) - txt = txt + txt_mod2.gate * self.txt_mlp( - (1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift + # calculate the text bloks + text = text + text_mod1.gate * self.text_attn.proj(text_attn) + text = text + text_mod2.gate * self.text_mlp( + (1 + text_mod2.scale) * self.text_norm2(text) + text_mod2.shift ) - return img, txt + return image, text class SingleStreamBlock(keras.Model): diff --git a/keras_hub/src/models/flux/flux_model.py b/keras_hub/src/models/flux/flux_model.py index 09e26b89f6..fb4040a131 100644 --- a/keras_hub/src/models/flux/flux_model.py +++ b/keras_hub/src/models/flux/flux_model.py @@ -19,7 +19,7 @@ class Flux(keras.Model): these streams. Ported from: https://github.com/black-forest-labs/flux Args: - in_channels: int. The number of input channels. + input_channels: int. The number of input channels. hidden_size: int. The hidden size of the transformer, must be divisible by `num_heads`. mlp_ratio: float. 
The ratio of the MLP dimension to the hidden size. num_heads: int. The number of attention heads. @@ -37,7 +37,7 @@ class Flux(keras.Model): def __init__( self, - in_channels, + input_channels, hidden_size, mlp_ratio, num_heads, @@ -50,8 +50,8 @@ def __init__( ): super().__init__() - self.in_channels = in_channels - self.output_channels = self.in_channels + self.input_channels = input_channels + self.output_channels = self.input_channels if hidden_size % num_heads != 0: raise ValueError( f"Hidden size {hidden_size} must be divisible by num_heads {num_heads}" @@ -63,16 +63,20 @@ def __init__( ) self.hidden_size = hidden_size self.num_heads = num_heads - self.pe_embedder = EmbedND(dim=pe_dim, theta=theta, axes_dim=axes_dim) - self.img_in = keras.layers.Dense(self.hidden_size, use_bias=True) - self.time_in = MLPEmbedder(hidden_dim=self.hidden_size) - self.vector_in = MLPEmbedder(hidden_dim=self.hidden_size) - self.guidance_in = ( + self.positional_embedder = EmbedND( + dim=pe_dim, theta=theta, axes_dim=axes_dim + ) + self.image_input_embedder = keras.layers.Dense( + self.hidden_size, use_bias=True + ) + self.time_input_embedder = MLPEmbedder(hidden_dim=self.hidden_size) + self.vector_embedder = MLPEmbedder(hidden_dim=self.hidden_size) + self.guidance_input_embedder = ( MLPEmbedder(hidden_dim=self.hidden_size) if guidance_embed else keras.layers.Identity() ) - self.txt_in = keras.layers.Dense(self.hidden_size) + self.text_input_embedder = keras.layers.Dense(self.hidden_size) self.double_blocks = [ DoubleStreamBlock( @@ -97,68 +101,70 @@ def __init__( def build(self, input_shape): ( - img_shape, - img_ids_shape, - txt_shape, - txt_ids_shape, + image_shape, + image_ids_shape, + text_shape, + text_ids_shape, timestep_shape, y_shape, guidance_shape, ) = input_shape # Build input layers - self.img_in.build(img_shape) - self.txt_in.build(txt_shape) + self.image_input_embedder.build(image_shape) + self.text_input_embedder.build(text_shape) # Build timestep embedding and vector inputs self.timestep_embedding.build(timestep_shape) - self.time_in.build((None, 256)) # timestep embedding size is 256 - self.vector_in.build(y_shape) + self.time_input_embedder.build( + (None, 256) + ) # timestep embedding size is 256 + self.vector_embedder.build(y_shape) if self.guidance_embed: if guidance_shape is None: raise ValueError( "Guidance shape must be provided for guidance-distilled model." 
) - self.guidance_in.build( + self.guidance_input_embedder.build( (None, 256) ) # guidance embedding size is 256 # Build positional embedder ids_shape = ( None, - img_ids_shape[1] + txt_ids_shape[1], - img_ids_shape[2], + image_ids_shape[1] + text_ids_shape[1], + image_ids_shape[2], ) - self.pe_embedder.build(ids_shape) + self.positional_embedder.build(ids_shape) # Build double stream blocks for block in self.double_blocks: - block.build((img_shape, txt_shape, (None, 256), ids_shape)) + block.build((image_shape, text_shape, (None, 256), ids_shape)) # Build single stream blocks - concat_img_shape = ( + concat_image_shape = ( None, - img_shape[1] + txt_shape[1], + image_shape[1] + text_shape[1], self.hidden_size, ) # Concatenated shape for block in self.single_blocks: - block.build((concat_img_shape, (None, 256), ids_shape)) + block.build((concat_image_shape, (None, 256), ids_shape)) # Build final layer # Adjusted to match expected input shape for the final layer self.final_layer.build( - (None, img_shape[1] + txt_shape[1], self.hidden_size) + (None, image_shape[1] + text_shape[1], self.hidden_size) ) # Concatenated shape self.built = True # Mark as built def call( self, - img, - img_ids, - txt, - txt_ids, + image, + image_ids, + text, + text_ids, timesteps, y, guidance=None, @@ -167,12 +173,12 @@ def call( Forward pass through the Flux model. Args: - img: KerasTensor. Image input tensor of shape (N, L, D) where N is the batch size, + image: KerasTensor. Image input tensor of shape (N, L, D) where N is the batch size, L is the sequence length, and D is the feature dimension. - img_ids: KerasTensor. Image ID input tensor of shape (N, L, D) corresponding + image_ids: KerasTensor. Image ID input tensor of shape (N, L, D) corresponding to the image sequences. - txt: KerasTensor. Text input tensor of shape (N, L, D). - txt_ids: KerasTensor. Text ID input tensor of shape (N, L, D) corresponding + text: KerasTensor. Text input tensor of shape (N, L, D). + text_ids: KerasTensor. Text ID input tensor of shape (N, L, D) corresponding to the text sequences. timesteps: KerasTensor. Timestep tensor used to compute positional embeddings. y: KerasTensor. Additional vector input, such as target values. @@ -183,37 +189,39 @@ def call( KerasTensor: The output tensor of the model, processed through double and single stream blocks and the final layer. """ - if img.ndim != 3 or txt.ndim != 3: + if image.ndim != 3 or text.ndim != 3: raise ValueError( - "Input img and txt tensors must have 3 dimensions." + "Input image and text tensors must have 3 dimensions." ) - # running on sequences img - img = self.img_in(img) - vec = self.time_in(self.timestep_embedding(timesteps, dim=256)) + # running on sequences image + image = self.image_input_embedder(image) + vec = self.time_input_embedder( + self.timestep_embedding(timesteps, dim=256) + ) if self.guidance_embed: if guidance is None: raise ValueError( "Didn't get guidance strength for guidance distilled model." 
) - vec = vec + self.guidance_in( + vec = vec + self.guidance_input_embedder( self.timestep_embedding(guidance, dim=256) ) - vec = vec + self.vector_in(y) - txt = self.txt_in(txt) + vec = vec + self.vector_embedder(y) + text = self.text_input_embedder(text) - ids = keras.ops.concatenate((txt_ids, img_ids), axis=1) - pe = self.pe_embedder(ids) + ids = keras.ops.concatenate((text_ids, image_ids), axis=1) + pe = self.positional_embedder(ids) for block in self.double_blocks: - img, txt = block(img=img, txt=txt, vec=vec, pe=pe) + image, text = block(image=image, text=text, vec=vec, pe=pe) - img = keras.ops.concatenate((txt, img), axis=1) + image = keras.ops.concatenate((text, image), axis=1) for block in self.single_blocks: - img = block(img, vec=vec, pe=pe) - img = img[:, txt.shape[1] :, ...] + image = block(image, vec=vec, pe=pe) + image = image[:, text.shape[1] :, ...] - img = self.final_layer( - img, vec + image = self.final_layer( + image, vec ) # (N, T, patch_size ** 2 * output_channels) - return img + return image From cbad326b9c40bf595242af33384a363e8a857b98 Mon Sep 17 00:00:00 2001 From: David Landup Date: Mon, 14 Oct 2024 23:54:40 +0900 Subject: [PATCH 52/68] functional subclassing model --- keras_hub/api/models/__init__.py | 1 + keras_hub/src/models/flux/__init__.py | 4 +- keras_hub/src/models/flux/flux_layers.py | 68 +++++---- keras_hub/src/models/flux/flux_model.py | 179 ++++++++++++----------- 4 files changed, 134 insertions(+), 118 deletions(-) diff --git a/keras_hub/api/models/__init__.py b/keras_hub/api/models/__init__.py index 1450ddceb3..e0e8e9ad26 100644 --- a/keras_hub/api/models/__init__.py +++ b/keras_hub/api/models/__init__.py @@ -153,6 +153,7 @@ ) from keras_hub.src.models.falcon.falcon_tokenizer import FalconTokenizer from keras_hub.src.models.feature_pyramid_backbone import FeaturePyramidBackbone +from keras_hub.src.models.flux.flux_model import FluxBackbone from keras_hub.src.models.gemma.gemma_backbone import GemmaBackbone from keras_hub.src.models.gemma.gemma_causal_lm import GemmaCausalLM from keras_hub.src.models.gemma.gemma_causal_lm_preprocessor import ( diff --git a/keras_hub/src/models/flux/__init__.py b/keras_hub/src/models/flux/__init__.py index 7d6f4f9637..02dffc1c4f 100644 --- a/keras_hub/src/models/flux/__init__.py +++ b/keras_hub/src/models/flux/__init__.py @@ -1,5 +1,5 @@ -from keras_hub.src.models.flux.flux_model import Flux +from keras_hub.src.models.flux.flux_model import FluxBackbone from keras_hub.src.models.flux.flux_presets import presets from keras_hub.src.utils.preset_utils import register_presets -register_presets(presets, Flux) +register_presets(presets, FluxBackbone) diff --git a/keras_hub/src/models/flux/flux_layers.py b/keras_hub/src/models/flux/flux_layers.py index d3bcfd4fd8..be1ed27739 100644 --- a/keras_hub/src/models/flux/flux_layers.py +++ b/keras_hub/src/models/flux/flux_layers.py @@ -282,10 +282,10 @@ class DoubleStreamBlock(keras.Model): def __init__( self, - hidden_size: int, - num_heads: int, - mlp_ratio: float, - use_bias: bool = False, + hidden_size, + num_heads, + mlp_ratio, + use_bias = False, ): super().__init__() @@ -324,25 +324,6 @@ def __init__( ) self.attention = FluxRoPEAttention() - def build(self, input_shape): - # Build components for image and text streams - image_input_shape, text_input_shape, vec_shape, pe_shape = input_shape - self.image_mod.build(vec_shape) - self.image_norm1.build(image_input_shape) - self.image_attn.build( - (image_input_shape[0], image_input_shape[1], self.hidden_size) - ) - 
self.image_norm2.build(image_input_shape) - self.image_mlp.build(image_input_shape) - - self.text_mod.build(vec_shape) - self.text_norm1.build(text_input_shape) - self.text_attn.build( - (text_input_shape[0], text_input_shape[1], self.hidden_size) - ) - self.text_norm2.build(text_input_shape) - self.text_mlp.build(text_input_shape) - def call(self, image, text, vec, pe): """ Forward pass for the DoubleStreamBlock. @@ -365,9 +346,14 @@ def call(self, image, text, vec, pe): 1 + image_mod1.scale ) * image_modulated + image_mod1.shift image_qkv = self.image_attn.qkv(image_modulated) - image_q, image_k, image_v = rearrange( - image_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads - ) + + B, L, _ = keras.ops.shape(image_qkv) + D = self.hidden_size // self.num_heads + + image_qkv = keras.ops.reshape(image_qkv, (B, L, 3, self.num_heads, D)) + image_q = image_qkv[:, :, 0] + image_k = image_qkv[:, :, 1] + image_v = image_qkv[:, :, 2] image_q, image_k = self.image_attn.norm(image_q, image_k) # prepare text for attention @@ -376,9 +362,12 @@ def call(self, image, text, vec, pe): 1 + text_mod1.scale ) * text_modulated + text_mod1.shift text_qkv = self.text_attn.qkv(text_modulated) - text_q, text_k, text_v = rearrange( - text_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads - ) + # Reshape the QKV tensor into Q, K, and V for text + text_qkv = keras.ops.reshape(text_qkv, (B, L, 3, self.num_heads, D)) + text_q = text_qkv[:, :, 0] + text_k = text_qkv[:, :, 1] + text_v = text_qkv[:, :, 2] + text_q, text_k = self.text_attn.norm(text_q, text_k) # run actual attention @@ -404,6 +393,27 @@ def call(self, image, text, vec, pe): (1 + text_mod2.scale) * self.text_norm2(text) + text_mod2.shift ) return image, text + + + def build(self, image_shape, text_shape, vec_shape, pe_shape): + # Build components for image and text streams + self.image_mod.build(vec_shape) + #self.image_norm1.build(image_input_shape) + self.image_attn.build( + (image_shape[0], image_shape[1], self.hidden_size) + ) + self.image_norm2.build(image_shape) + self.image_mlp.build(image_shape) + + self.text_mod.build(vec_shape) + #self.text_norm1.build(text_input_shape) + self.text_attn.build( + (text_shape[0], text_shape[1], self.hidden_size) + ) + #self.text_norm2.build(text_input_shape) + #self.text_mlp.build(text_input_shape) + + class SingleStreamBlock(keras.Model): diff --git a/keras_hub/src/models/flux/flux_model.py b/keras_hub/src/models/flux/flux_model.py index fb4040a131..1794714550 100644 --- a/keras_hub/src/models/flux/flux_model.py +++ b/keras_hub/src/models/flux/flux_model.py @@ -1,5 +1,7 @@ import keras +from keras_hub.src.api_export import keras_hub_export +from keras_hub.src.models.backbone import Backbone from keras_hub.src.models.flux.flux_layers import DoubleStreamBlock from keras_hub.src.models.flux.flux_layers import EmbedND from keras_hub.src.models.flux.flux_layers import LastLayer @@ -8,10 +10,10 @@ from keras_hub.src.models.flux.flux_maths import TimestepEmbedding -class Flux(keras.Model): +@keras_hub_export("keras_hub.models.FluxBackbone") +class FluxBackbone(Backbone): """ - Transformer model for flow matching on sequences, - utilizing a double-stream and single-stream block structure. + Transformer model for flow matching on sequences. The model processes image and text data with associated positional and timestep embeddings, and optionally applies guidance embedding. Double-stream blocks @@ -30,6 +32,18 @@ class Flux(keras.Model): use_bias: bool. 
Whether to apply bias to the query, key, and value projections. guidance_embed: bool. If True, applies guidance embedding in the model. + Call arguments: + image: KerasTensor. Image input tensor of shape (N, L, D) where N is the batch size, + L is the sequence length, and D is the feature dimension. + image_ids: KerasTensor. Image ID input tensor of shape (N, L, D) corresponding + to the image sequences. + text: KerasTensor. Text input tensor of shape (N, L, D). + text_ids: KerasTensor. Text ID input tensor of shape (N, L, D) corresponding + to the text sequences. + timesteps: KerasTensor. Timestep tensor used to compute positional embeddings. + y: KerasTensor. Additional vector input, such as target values. + guidance: KerasTensor, optional. Guidance input tensor used + in guidance-embedded models. Raises: ValueError: If `hidden_size` is not divisible by `num_heads`, or if `sum(axes_dim)` is not equal to the positional embedding dimension. @@ -46,42 +60,47 @@ def __init__( axes_dim, theta, use_bias, - guidance_embed, + guidance_embed=False, + # These will be inferred from the CLIP/T5 encoders later + image_shape=(None, 768, 3072), + text_shape=(None, 768, 3072), + image_ids_shape=(None, 768, 3072), + text_ids_shape=(None, 768, 3072), + y_shape=(128,), + timestep_shape=(256,), + guidance_shape=(256,), + **kwargs, ): super().__init__() - self.input_channels = input_channels - self.output_channels = self.input_channels if hidden_size % num_heads != 0: raise ValueError( f"Hidden size {hidden_size} must be divisible by num_heads {num_heads}" ) pe_dim = hidden_size // num_heads + if sum(axes_dim) != pe_dim: raise ValueError( f"Got {axes_dim} but expected positional dim {pe_dim}" ) - self.hidden_size = hidden_size - self.num_heads = num_heads - self.positional_embedder = EmbedND( - dim=pe_dim, theta=theta, axes_dim=axes_dim - ) + # === Layers === + self.positional_embedder = EmbedND(theta=theta, axes_dim=axes_dim) self.image_input_embedder = keras.layers.Dense( - self.hidden_size, use_bias=True + hidden_size, use_bias=True ) - self.time_input_embedder = MLPEmbedder(hidden_dim=self.hidden_size) - self.vector_embedder = MLPEmbedder(hidden_dim=self.hidden_size) + self.time_input_embedder = MLPEmbedder(hidden_dim=hidden_size) + self.vector_embedder = MLPEmbedder(hidden_dim=hidden_size) self.guidance_input_embedder = ( - MLPEmbedder(hidden_dim=self.hidden_size) + MLPEmbedder(hidden_dim=hidden_size) if guidance_embed else keras.layers.Identity() ) - self.text_input_embedder = keras.layers.Dense(self.hidden_size) + self.text_input_embedder = keras.layers.Dense(hidden_size) self.double_blocks = [ DoubleStreamBlock( - self.hidden_size, - self.num_heads, + hidden_size, + num_heads, mlp_ratio=mlp_ratio, use_bias=use_bias, ) @@ -90,14 +109,67 @@ def __init__( self.single_blocks = [ SingleStreamBlock( - self.hidden_size, self.num_heads, mlp_ratio=mlp_ratio + hidden_size, num_heads, mlp_ratio=mlp_ratio ) for _ in range(depth_single_blocks) ] - self.final_layer = LastLayer(self.hidden_size, 1, self.output_channels) + self.final_layer = LastLayer(hidden_size, 1, input_channels) self.timestep_embedding = TimestepEmbedding() self.guidance_embed = guidance_embed + # TODO: these come from external models + self.timesteps = keras.ops.arange(timestep_shape[0], dtype=float) + self.guidance = keras.ops.arange(guidance_shape[0], dtype=float) + + # === Functional Model === + image = keras.Input(shape=image_shape, name="image") + image_ids = keras.Input(shape=image_ids_shape, name="image_ids") + text = 
keras.Input(shape=text_shape, name="text") + text_ids = keras.Input(shape=text_ids_shape, name="text_ids") + y = keras.Input(shape=y_shape, name="y") + + # running on sequences image + image = self.image_input_embedder(image) + vec = self.time_input_embedder( + self.timestep_embedding(self.timesteps, dim=256) + ) + if self.guidance_embed: + if self.guidance is None: + raise ValueError( + "Didn't get guidance strength for guidance distilled model." + ) + vec = vec + self.guidance_input_embedder( + self.timestep_embedding(self.guidance, dim=256) + ) + vec = vec + self.vector_embedder(y) + text = self.text_input_embedder(text) + + ids = keras.ops.concatenate((text_ids, image_ids), axis=1) + pe = self.positional_embedder(ids) + + for block in self.double_blocks: + image, text = block(image=image, text=text, vec=vec, pe=pe) + + image = keras.ops.concatenate((text, image), axis=1) + for block in self.single_blocks: + image = block(image, vec=vec, pe=pe) + image = image[:, text.shape[1] :, ...] + + image = self.final_layer( + image, vec + ) # (N, T, patch_size ** 2 * output_channels) + + super().__init__( + inputs=[image, image_ids, text, text_ids, self.timesteps, y, self.guidance], + outputs=image, + **kwargs, + ) + + # === Config === + self.input_channels = input_channels + self.output_channels = self.input_channels + self.hidden_size = hidden_size + self.num_heads = num_heads def build(self, input_shape): ( @@ -158,70 +230,3 @@ def build(self, input_shape): ) # Concatenated shape self.built = True # Mark as built - - def call( - self, - image, - image_ids, - text, - text_ids, - timesteps, - y, - guidance=None, - ): - """ - Forward pass through the Flux model. - - Args: - image: KerasTensor. Image input tensor of shape (N, L, D) where N is the batch size, - L is the sequence length, and D is the feature dimension. - image_ids: KerasTensor. Image ID input tensor of shape (N, L, D) corresponding - to the image sequences. - text: KerasTensor. Text input tensor of shape (N, L, D). - text_ids: KerasTensor. Text ID input tensor of shape (N, L, D) corresponding - to the text sequences. - timesteps: KerasTensor. Timestep tensor used to compute positional embeddings. - y: KerasTensor. Additional vector input, such as target values. - guidance: KerasTensor, optional. Guidance input tensor used - in guidance-embedded models. - - Returns: - KerasTensor: The output tensor of the model, processed through - double and single stream blocks and the final layer. - """ - if image.ndim != 3 or text.ndim != 3: - raise ValueError( - "Input image and text tensors must have 3 dimensions." - ) - - # running on sequences image - image = self.image_input_embedder(image) - vec = self.time_input_embedder( - self.timestep_embedding(timesteps, dim=256) - ) - if self.guidance_embed: - if guidance is None: - raise ValueError( - "Didn't get guidance strength for guidance distilled model." - ) - vec = vec + self.guidance_input_embedder( - self.timestep_embedding(guidance, dim=256) - ) - vec = vec + self.vector_embedder(y) - text = self.text_input_embedder(text) - - ids = keras.ops.concatenate((text_ids, image_ids), axis=1) - pe = self.positional_embedder(ids) - - for block in self.double_blocks: - image, text = block(image=image, text=text, vec=vec, pe=pe) - - image = keras.ops.concatenate((text, image), axis=1) - for block in self.single_blocks: - image = block(image, vec=vec, pe=pe) - image = image[:, text.shape[1] :, ...] 
- - image = self.final_layer( - image, vec - ) # (N, T, patch_size ** 2 * output_channels) - return image From eeb8e0dd7a3d94538cf1d02d49e5cfd281e24738 Mon Sep 17 00:00:00 2001 From: David Landup Date: Tue, 15 Oct 2024 16:34:10 +0900 Subject: [PATCH 53/68] shape fixes --- keras_hub/src/models/flux/flux_layers.py | 75 +++++++++++++++--------- keras_hub/src/models/flux/flux_model.py | 12 ++-- 2 files changed, 54 insertions(+), 33 deletions(-) diff --git a/keras_hub/src/models/flux/flux_layers.py b/keras_hub/src/models/flux/flux_layers.py index be1ed27739..28a8ed0916 100644 --- a/keras_hub/src/models/flux/flux_layers.py +++ b/keras_hub/src/models/flux/flux_layers.py @@ -1,7 +1,6 @@ from dataclasses import dataclass import keras -from einops import rearrange from keras import KerasTensor from keras import layers from keras import ops @@ -206,9 +205,17 @@ def call(self, x, pe): KerasTensor: Output tensor after self-attention and projection. """ qkv = self.qkv(x) - q, k, v = rearrange( - qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads - ) + + # Mimics rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) + B, L, _ = keras.ops.shape(qkv) + D = self.hidden_size // self.num_heads + + qkv = keras.ops.reshape(qkv, (B, L, 3, self.num_heads, D)) + qkv = keras.ops.transpose(qkv, (2, 0, 3, 1, 4)) + q = qkv[:, :, 0] + k = qkv[:, :, 1] + v = qkv[:, :, 2] + q, k = self.norm(q, k) x = self.attention(q=q, k=k, v=v, pe=pe) x = self.proj(x) @@ -285,7 +292,7 @@ def __init__( hidden_size, num_heads, mlp_ratio, - use_bias = False, + use_bias=False, ): super().__init__() @@ -350,7 +357,9 @@ def call(self, image, text, vec, pe): B, L, _ = keras.ops.shape(image_qkv) D = self.hidden_size // self.num_heads + # Mimics rearrange(image_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) image_qkv = keras.ops.reshape(image_qkv, (B, L, 3, self.num_heads, D)) + image_qkv = keras.ops.transpose(image_qkv, (2, 0, 3, 1, 4)) image_q = image_qkv[:, :, 0] image_k = image_qkv[:, :, 1] image_v = image_qkv[:, :, 2] @@ -362,8 +371,11 @@ def call(self, image, text, vec, pe): 1 + text_mod1.scale ) * text_modulated + text_mod1.shift text_qkv = self.text_attn.qkv(text_modulated) - # Reshape the QKV tensor into Q, K, and V for text + + # Mimics rearrange(text_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) text_qkv = keras.ops.reshape(text_qkv, (B, L, 3, self.num_heads, D)) + text_qkv = keras.ops.transpose(text_qkv, (2, 0, 3, 1, 4)) + text_q = text_qkv[:, :, 0] text_k = text_qkv[:, :, 1] text_v = text_qkv[:, :, 2] @@ -381,24 +393,22 @@ def call(self, image, text, vec, pe): attn[:, text.shape[1] :], ) - # calculate the image bloks + # calculate the image blocks image = image + image_mod1.gate * self.image_attn.proj(image_attn) image = image + image_mod2.gate * self.image_mlp( (1 + image_mod2.scale) * self.image_norm2(image) + image_mod2.shift ) - # calculate the text bloks + # calculate the text blocks text = text + text_mod1.gate * self.text_attn.proj(text_attn) text = text + text_mod2.gate * self.text_mlp( (1 + text_mod2.scale) * self.text_norm2(text) + text_mod2.shift ) return image, text - - def build(self, image_shape, text_shape, vec_shape, pe_shape): + def build(self, image_shape, text_shape, vec_shape): # Build components for image and text streams self.image_mod.build(vec_shape) - #self.image_norm1.build(image_input_shape) self.image_attn.build( (image_shape[0], image_shape[1], self.hidden_size) ) @@ -406,14 +416,7 @@ def build(self, image_shape, text_shape, vec_shape, pe_shape): 
self.image_mlp.build(image_shape) self.text_mod.build(vec_shape) - #self.text_norm1.build(text_input_shape) - self.text_attn.build( - (text_shape[0], text_shape[1], self.hidden_size) - ) - #self.text_norm2.build(text_input_shape) - #self.text_mlp.build(text_input_shape) - - + self.text_attn.build((text_shape[0], text_shape[1], self.hidden_size)) class SingleStreamBlock(keras.Model): @@ -454,12 +457,22 @@ def __init__( self.modulation = Modulation(hidden_size, double=False) self.attention = FluxRoPEAttention() - def build(self, input_shape): - x_shape, vec_shape, pe_shape = input_shape - self.modulation.build(vec_shape) - self.pre_norm.build(x_shape) + def build(self, x_shape, vec_shape, pe_shape): self.linear1.build(x_shape) - self.linear2.build((x_shape[0], x_shape[1], self.hidden_size)) + self.linear2.build( + (x_shape[0], x_shape[1], self.hidden_size + self.mlp_hidden_dim) + ) + + self.modulation.build(vec_shape) # Build the modulation layer + + self.norm.build( + ( + x_shape[0], + self.num_heads, + x_shape[1], + x_shape[-1] // self.num_heads, + ) + ) def call(self, x, vec, pe): """ @@ -479,9 +492,17 @@ def call(self, x, vec, pe): self.linear1(x_mod), [3 * self.hidden_size], axis=-1 ) - q, k, v = rearrange( - qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads - ) + # Mimics rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) + B, L, _ = keras.ops.shape(qkv) + D = self.hidden_size // self.num_heads + + qkv = keras.ops.reshape(qkv, (B, L, 3, self.num_heads, D)) + qkv = keras.ops.transpose(qkv, (2, 0, 3, 1, 4)) + + q = qkv[:, :, 0] + k = qkv[:, :, 1] + v = qkv[:, :, 2] + q, k = self.norm(q, k) # compute attention diff --git a/keras_hub/src/models/flux/flux_model.py b/keras_hub/src/models/flux/flux_model.py index 1794714550..885da9c4d7 100644 --- a/keras_hub/src/models/flux/flux_model.py +++ b/keras_hub/src/models/flux/flux_model.py @@ -62,11 +62,11 @@ def __init__( use_bias, guidance_embed=False, # These will be inferred from the CLIP/T5 encoders later - image_shape=(None, 768, 3072), + image_shape=(None, 768, 3072), text_shape=(None, 768, 3072), image_ids_shape=(None, 768, 3072), text_ids_shape=(None, 768, 3072), - y_shape=(128,), + y_shape=(None, 128), timestep_shape=(256,), guidance_shape=(256,), **kwargs, @@ -108,15 +108,14 @@ def __init__( ] self.single_blocks = [ - SingleStreamBlock( - hidden_size, num_heads, mlp_ratio=mlp_ratio - ) + SingleStreamBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio) for _ in range(depth_single_blocks) ] self.final_layer = LastLayer(hidden_size, 1, input_channels) self.timestep_embedding = TimestepEmbedding() self.guidance_embed = guidance_embed + # TODO: these come from external models self.timesteps = keras.ops.arange(timestep_shape[0], dtype=float) self.guidance = keras.ops.arange(guidance_shape[0], dtype=float) @@ -141,6 +140,7 @@ def __init__( vec = vec + self.guidance_input_embedder( self.timestep_embedding(self.guidance, dim=256) ) + vec = vec + self.vector_embedder(y) text = self.text_input_embedder(text) @@ -160,7 +160,7 @@ def __init__( ) # (N, T, patch_size ** 2 * output_channels) super().__init__( - inputs=[image, image_ids, text, text_ids, self.timesteps, y, self.guidance], + inputs=[image, image_ids, text, text_ids, y], outputs=image, **kwargs, ) From 330ed70a4dc74e9e00428bba906b67ce040be799 Mon Sep 17 00:00:00 2001 From: David Landup Date: Tue, 15 Oct 2024 16:45:52 +0900 Subject: [PATCH 54/68] format --- keras_hub/src/models/flux/flux_layers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/keras_hub/src/models/flux/flux_layers.py b/keras_hub/src/models/flux/flux_layers.py index 28a8ed0916..b8ec43dabf 100644 --- a/keras_hub/src/models/flux/flux_layers.py +++ b/keras_hub/src/models/flux/flux_layers.py @@ -498,7 +498,7 @@ def call(self, x, vec, pe): qkv = keras.ops.reshape(qkv, (B, L, 3, self.num_heads, D)) qkv = keras.ops.transpose(qkv, (2, 0, 3, 1, 4)) - + q = qkv[:, :, 0] k = qkv[:, :, 1] v = qkv[:, :, 2] From 9233411fe0e5842915c85a43a25187563e8867ab Mon Sep 17 00:00:00 2001 From: David Landup Date: Tue, 15 Oct 2024 17:01:53 +0900 Subject: [PATCH 55/68] self.hidden_size -> self.dim --- keras_hub/src/models/flux/flux_layers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keras_hub/src/models/flux/flux_layers.py b/keras_hub/src/models/flux/flux_layers.py index b8ec43dabf..37d626a299 100644 --- a/keras_hub/src/models/flux/flux_layers.py +++ b/keras_hub/src/models/flux/flux_layers.py @@ -208,7 +208,7 @@ def call(self, x, pe): # Mimics rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) B, L, _ = keras.ops.shape(qkv) - D = self.hidden_size // self.num_heads + D = self.dim // self.num_heads qkv = keras.ops.reshape(qkv, (B, L, 3, self.num_heads, D)) qkv = keras.ops.transpose(qkv, (2, 0, 3, 1, 4)) From ed2badc124d8a16b77426abdca6da867da683147 Mon Sep 17 00:00:00 2001 From: David Landup Date: Tue, 15 Oct 2024 17:08:47 +0900 Subject: [PATCH 56/68] einops rearrange --- keras_hub/src/models/flux/flux_layers.py | 36 +++++++----------------- 1 file changed, 10 insertions(+), 26 deletions(-) diff --git a/keras_hub/src/models/flux/flux_layers.py b/keras_hub/src/models/flux/flux_layers.py index 37d626a299..4c37f1d4da 100644 --- a/keras_hub/src/models/flux/flux_layers.py +++ b/keras_hub/src/models/flux/flux_layers.py @@ -1,6 +1,7 @@ from dataclasses import dataclass import keras +from einops import rearrange from keras import KerasTensor from keras import layers from keras import ops @@ -205,17 +206,9 @@ def call(self, x, pe): KerasTensor: Output tensor after self-attention and projection. 
""" qkv = self.qkv(x) - - # Mimics rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) - B, L, _ = keras.ops.shape(qkv) - D = self.dim // self.num_heads - - qkv = keras.ops.reshape(qkv, (B, L, 3, self.num_heads, D)) - qkv = keras.ops.transpose(qkv, (2, 0, 3, 1, 4)) - q = qkv[:, :, 0] - k = qkv[:, :, 1] - v = qkv[:, :, 2] - + q, k, v = rearrange( + qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads + ) q, k = self.norm(q, k) x = self.attention(q=q, k=k, v=v, pe=pe) x = self.proj(x) @@ -354,15 +347,10 @@ def call(self, image, text, vec, pe): ) * image_modulated + image_mod1.shift image_qkv = self.image_attn.qkv(image_modulated) - B, L, _ = keras.ops.shape(image_qkv) - D = self.hidden_size // self.num_heads - # Mimics rearrange(image_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) - image_qkv = keras.ops.reshape(image_qkv, (B, L, 3, self.num_heads, D)) - image_qkv = keras.ops.transpose(image_qkv, (2, 0, 3, 1, 4)) - image_q = image_qkv[:, :, 0] - image_k = image_qkv[:, :, 1] - image_v = image_qkv[:, :, 2] + image_q, image_k, image_v = rearrange( + image_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads + ) image_q, image_k = self.image_attn.norm(image_q, image_k) # prepare text for attention @@ -372,13 +360,9 @@ def call(self, image, text, vec, pe): ) * text_modulated + text_mod1.shift text_qkv = self.text_attn.qkv(text_modulated) - # Mimics rearrange(text_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) - text_qkv = keras.ops.reshape(text_qkv, (B, L, 3, self.num_heads, D)) - text_qkv = keras.ops.transpose(text_qkv, (2, 0, 3, 1, 4)) - - text_q = text_qkv[:, :, 0] - text_k = text_qkv[:, :, 1] - text_v = text_qkv[:, :, 2] + text_q, text_k, text_v = rearrange( + text_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads + ) text_q, text_k = self.text_attn.norm(text_q, text_k) From a65424b30cb8d3f25852fbc68e1f4069d5dc9ec7 Mon Sep 17 00:00:00 2001 From: David Landup Date: Tue, 15 Oct 2024 17:17:07 +0900 Subject: [PATCH 57/68] remove build method --- keras_hub/src/models/flux/flux_layers.py | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/keras_hub/src/models/flux/flux_layers.py b/keras_hub/src/models/flux/flux_layers.py index 4c37f1d4da..9301591e0a 100644 --- a/keras_hub/src/models/flux/flux_layers.py +++ b/keras_hub/src/models/flux/flux_layers.py @@ -390,23 +390,13 @@ def call(self, image, text, vec, pe): ) return image, text - def build(self, image_shape, text_shape, vec_shape): - # Build components for image and text streams - self.image_mod.build(vec_shape) - self.image_attn.build( - (image_shape[0], image_shape[1], self.hidden_size) - ) - self.image_norm2.build(image_shape) - self.image_mlp.build(image_shape) - - self.text_mod.build(vec_shape) - self.text_attn.build((text_shape[0], text_shape[1], self.hidden_size)) - class SingleStreamBlock(keras.Model): """ - A DiT block with parallel linear layers as described in - https://arxiv.org/abs/2302.05442 and adapted modulation interface. + A DiT block with parallel linear layers. + + As described in https://arxiv.org/abs/2302.05442 and + adapted for the modulation interface. Args: hidden_size: int. The hidden dimension size for the model. 
From cb11e28bedcaa5a7c32bf61edd3125ca501b267e Mon Sep 17 00:00:00 2001 From: David Landup Date: Tue, 15 Oct 2024 17:23:16 +0900 Subject: [PATCH 58/68] ops to rearrange --- keras_hub/src/models/flux/flux_layers.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/keras_hub/src/models/flux/flux_layers.py b/keras_hub/src/models/flux/flux_layers.py index 9301591e0a..1f70120a46 100644 --- a/keras_hub/src/models/flux/flux_layers.py +++ b/keras_hub/src/models/flux/flux_layers.py @@ -466,17 +466,9 @@ def call(self, x, vec, pe): self.linear1(x_mod), [3 * self.hidden_size], axis=-1 ) - # Mimics rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) - B, L, _ = keras.ops.shape(qkv) - D = self.hidden_size // self.num_heads - - qkv = keras.ops.reshape(qkv, (B, L, 3, self.num_heads, D)) - qkv = keras.ops.transpose(qkv, (2, 0, 3, 1, 4)) - - q = qkv[:, :, 0] - k = qkv[:, :, 1] - v = qkv[:, :, 2] - + q, k, v = rearrange( + qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads + ) q, k = self.norm(q, k) # compute attention From f478f3960d237d6ed32e0dd9d8c85b36b14a9bc4 Mon Sep 17 00:00:00 2001 From: David Landup Date: Tue, 15 Oct 2024 17:28:06 +0900 Subject: [PATCH 59/68] remove build --- keras_hub/src/models/flux/flux_layers.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/keras_hub/src/models/flux/flux_layers.py b/keras_hub/src/models/flux/flux_layers.py index 1f70120a46..204e342067 100644 --- a/keras_hub/src/models/flux/flux_layers.py +++ b/keras_hub/src/models/flux/flux_layers.py @@ -505,12 +505,6 @@ def __init__(self, hidden_size: int, patch_size: int, output_channels: int): ] ) - def build(self, input_shape): - _, _, features = input_shape - - self.linear.build((None, features)) - self.built = True - def call(self, x, vec): """ Forward pass for the LastLayer. From 3b5cb4d6cb7a6e903f3847634194bf075b63e2a2 Mon Sep 17 00:00:00 2001 From: David Landup Date: Tue, 15 Oct 2024 19:36:48 +0900 Subject: [PATCH 60/68] rearrange -> symbolic_rearrange --- keras_hub/src/models/flux/flux_layers.py | 19 +++---- keras_hub/src/models/flux/flux_maths.py | 31 +++++++++++ keras_hub/src/models/flux/flux_model.py | 70 ++---------------------- 3 files changed, 43 insertions(+), 77 deletions(-) diff --git a/keras_hub/src/models/flux/flux_layers.py b/keras_hub/src/models/flux/flux_layers.py index 204e342067..5ff3a02d0e 100644 --- a/keras_hub/src/models/flux/flux_layers.py +++ b/keras_hub/src/models/flux/flux_layers.py @@ -1,13 +1,13 @@ from dataclasses import dataclass import keras -from einops import rearrange from keras import KerasTensor from keras import layers from keras import ops from keras_hub.src.models.flux.flux_maths import FluxRoPEAttention from keras_hub.src.models.flux.flux_maths import RotaryPositionalEmbedding +from keras_hub.src.models.flux.flux_maths import rearrange_symbolic_tensors class EmbedND(keras.Model): @@ -206,9 +206,7 @@ def call(self, x, pe): KerasTensor: Output tensor after self-attention and projection. 
""" qkv = self.qkv(x) - q, k, v = rearrange( - qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads - ) + q, k, v = rearrange_symbolic_tensors(qkv, K=3, H=self.num_heads) q, k = self.norm(q, k) x = self.attention(q=q, k=k, v=v, pe=pe) x = self.proj(x) @@ -347,9 +345,8 @@ def call(self, image, text, vec, pe): ) * image_modulated + image_mod1.shift image_qkv = self.image_attn.qkv(image_modulated) - # Mimics rearrange(image_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads) - image_q, image_k, image_v = rearrange( - image_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads + image_q, image_k, image_v = rearrange_symbolic_tensors( + image_qkv, K=3, H=self.num_heads ) image_q, image_k = self.image_attn.norm(image_q, image_k) @@ -360,8 +357,8 @@ def call(self, image, text, vec, pe): ) * text_modulated + text_mod1.shift text_qkv = self.text_attn.qkv(text_modulated) - text_q, text_k, text_v = rearrange( - text_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads + text_q, text_k, text_v = rearrange_symbolic_tensors( + text_qkv, K=3, H=self.num_heads ) text_q, text_k = self.text_attn.norm(text_q, text_k) @@ -466,9 +463,7 @@ def call(self, x, vec, pe): self.linear1(x_mod), [3 * self.hidden_size], axis=-1 ) - q, k, v = rearrange( - qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads - ) + q, k, v = rearrange_symbolic_tensors(qkv, K=3, H=self.num_heads) q, k = self.norm(q, k) # compute attention diff --git a/keras_hub/src/models/flux/flux_maths.py b/keras_hub/src/models/flux/flux_maths.py index e6f0647324..2cfb4133f2 100644 --- a/keras_hub/src/models/flux/flux_maths.py +++ b/keras_hub/src/models/flux/flux_maths.py @@ -186,3 +186,34 @@ def scaled_dot_product_attention( ) return ops.matmul(attn_weight, value) + + +def rearrange_symbolic_tensors(qkv, K, H): + """ + Splits the qkv tensor into query (q), key (k), and value (v) components. + + Mimics rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=num_heads), + for graph-mode TensorFlow support when doing functional subclassing + models. + + Arguments: + qkv: np.ndarray. Input tensor of shape (B, L, K*H*D). + K: int. Number of components (q, k, v). + H: int. Number of attention heads. + + Returns: + tuple: q, k, v tensors of shape (B, H, L, D). 
+ """ + # Get the shape of qkv and calculate L and D + B, L, dim = ops.shape(qkv) + D = dim // (K * H) + + # Reshape and transpose the qkv tensor + qkv_reshaped = ops.reshape(qkv, (B, L, K, H, D)) + qkv_transposed = ops.transpose(qkv_reshaped, (2, 0, 3, 1, 4)) + + # Split q, k, v along the first dimension (K) + qkv_splits = ops.split(qkv_transposed, K, axis=0) + q, k, v = [ops.squeeze(split, 0) for split in qkv_splits] + + return q, k, v diff --git a/keras_hub/src/models/flux/flux_model.py b/keras_hub/src/models/flux/flux_model.py index 885da9c4d7..f09abb2303 100644 --- a/keras_hub/src/models/flux/flux_model.py +++ b/keras_hub/src/models/flux/flux_model.py @@ -121,14 +121,14 @@ def __init__( self.guidance = keras.ops.arange(guidance_shape[0], dtype=float) # === Functional Model === - image = keras.Input(shape=image_shape, name="image") + image_input = keras.Input(shape=image_shape, name="image") image_ids = keras.Input(shape=image_ids_shape, name="image_ids") - text = keras.Input(shape=text_shape, name="text") + text_input = keras.Input(shape=text_shape, name="text") text_ids = keras.Input(shape=text_ids_shape, name="text_ids") y = keras.Input(shape=y_shape, name="y") # running on sequences image - image = self.image_input_embedder(image) + image = self.image_input_embedder(image_input) vec = self.time_input_embedder( self.timestep_embedding(self.timesteps, dim=256) ) @@ -142,7 +142,7 @@ def __init__( ) vec = vec + self.vector_embedder(y) - text = self.text_input_embedder(text) + text = self.text_input_embedder(text_input) ids = keras.ops.concatenate((text_ids, image_ids), axis=1) pe = self.positional_embedder(ids) @@ -160,7 +160,7 @@ def __init__( ) # (N, T, patch_size ** 2 * output_channels) super().__init__( - inputs=[image, image_ids, text, text_ids, y], + inputs=[image_input, image_ids, text_input, text_ids, y], outputs=image, **kwargs, ) @@ -170,63 +170,3 @@ def __init__( self.output_channels = self.input_channels self.hidden_size = hidden_size self.num_heads = num_heads - - def build(self, input_shape): - ( - image_shape, - image_ids_shape, - text_shape, - text_ids_shape, - timestep_shape, - y_shape, - guidance_shape, - ) = input_shape - - # Build input layers - self.image_input_embedder.build(image_shape) - self.text_input_embedder.build(text_shape) - - # Build timestep embedding and vector inputs - self.timestep_embedding.build(timestep_shape) - self.time_input_embedder.build( - (None, 256) - ) # timestep embedding size is 256 - self.vector_embedder.build(y_shape) - - if self.guidance_embed: - if guidance_shape is None: - raise ValueError( - "Guidance shape must be provided for guidance-distilled model." 
- ) - self.guidance_input_embedder.build( - (None, 256) - ) # guidance embedding size is 256 - - # Build positional embedder - ids_shape = ( - None, - image_ids_shape[1] + text_ids_shape[1], - image_ids_shape[2], - ) - self.positional_embedder.build(ids_shape) - - # Build double stream blocks - for block in self.double_blocks: - block.build((image_shape, text_shape, (None, 256), ids_shape)) - - # Build single stream blocks - concat_image_shape = ( - None, - image_shape[1] + text_shape[1], - self.hidden_size, - ) # Concatenated shape - for block in self.single_blocks: - block.build((concat_image_shape, (None, 256), ids_shape)) - - # Build final layer - # Adjusted to match expected input shape for the final layer - self.final_layer.build( - (None, image_shape[1] + text_shape[1], self.hidden_size) - ) # Concatenated shape - - self.built = True # Mark as built From 40178e17cf2b4ec8993151a55dca81a4b367a94f Mon Sep 17 00:00:00 2001 From: David Landup Date: Tue, 15 Oct 2024 20:12:19 +0900 Subject: [PATCH 61/68] turn timesteps and guidance into inputs --- keras_hub/src/models/flux/flux_model.py | 30 ++++++++++++++++++------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/keras_hub/src/models/flux/flux_model.py b/keras_hub/src/models/flux/flux_model.py index f09abb2303..4b78d2e3e1 100644 --- a/keras_hub/src/models/flux/flux_model.py +++ b/keras_hub/src/models/flux/flux_model.py @@ -116,29 +116,35 @@ def __init__( self.timestep_embedding = TimestepEmbedding() self.guidance_embed = guidance_embed - # TODO: these come from external models - self.timesteps = keras.ops.arange(timestep_shape[0], dtype=float) - self.guidance = keras.ops.arange(guidance_shape[0], dtype=float) - # === Functional Model === image_input = keras.Input(shape=image_shape, name="image") image_ids = keras.Input(shape=image_ids_shape, name="image_ids") text_input = keras.Input(shape=text_shape, name="text") text_ids = keras.Input(shape=text_ids_shape, name="text_ids") y = keras.Input(shape=y_shape, name="y") + timesteps_input = keras.Input( + shape=timestep_shape, batch_size=1, name="timesteps" + ) + guidance_input = keras.Input( + shape=guidance_shape, batch_size=1, name="guidance" + ) + + # These should be unbatched. Is there a nicer way to do this in Keras? + timesteps_input = keras.ops.squeeze(timesteps_input, axis=0) + guidance_input = keras.ops.squeeze(guidance_input, axis=0) # running on sequences image image = self.image_input_embedder(image_input) vec = self.time_input_embedder( - self.timestep_embedding(self.timesteps, dim=256) + self.timestep_embedding(timesteps_input, dim=256) ) if self.guidance_embed: - if self.guidance is None: + if guidance_input is None: raise ValueError( "Didn't get guidance strength for guidance distilled model." 
) vec = vec + self.guidance_input_embedder( - self.timestep_embedding(self.guidance, dim=256) + self.timestep_embedding(guidance_input, dim=256) ) vec = vec + self.vector_embedder(y) @@ -160,7 +166,15 @@ def __init__( ) # (N, T, patch_size ** 2 * output_channels) super().__init__( - inputs=[image_input, image_ids, text_input, text_ids, y], + inputs=[ + image_input, + image_ids, + text_input, + text_ids, + y, + timesteps_input, + guidance_input, + ], outputs=image, **kwargs, ) From 078459d62690fabbaca0ee942250311b51859be3 Mon Sep 17 00:00:00 2001 From: David Landup Date: Tue, 15 Oct 2024 20:32:34 +0900 Subject: [PATCH 62/68] basic preprocessor flow --- keras_hub/api/models/__init__.py | 4 + keras_hub/src/models/flux/flux_model.py | 57 +++++++ .../src/models/flux/flux_text_to_image.py | 142 ++++++++++++++++++ .../flux/flux_text_to_image_preprocessor.py | 73 +++++++++ 4 files changed, 276 insertions(+) create mode 100644 keras_hub/src/models/flux/flux_text_to_image.py create mode 100644 keras_hub/src/models/flux/flux_text_to_image_preprocessor.py diff --git a/keras_hub/api/models/__init__.py b/keras_hub/api/models/__init__.py index e0e8e9ad26..00a04a266a 100644 --- a/keras_hub/api/models/__init__.py +++ b/keras_hub/api/models/__init__.py @@ -154,6 +154,10 @@ from keras_hub.src.models.falcon.falcon_tokenizer import FalconTokenizer from keras_hub.src.models.feature_pyramid_backbone import FeaturePyramidBackbone from keras_hub.src.models.flux.flux_model import FluxBackbone +from keras_hub.src.models.flux.flux_text_to_image import FluxTextToImage +from keras_hub.src.models.flux.flux_text_to_image_preprocessor import ( + FluxTextToImagePreprocessor, +) from keras_hub.src.models.gemma.gemma_backbone import GemmaBackbone from keras_hub.src.models.gemma.gemma_causal_lm import GemmaCausalLM from keras_hub.src.models.gemma.gemma_causal_lm_preprocessor import ( diff --git a/keras_hub/src/models/flux/flux_model.py b/keras_hub/src/models/flux/flux_model.py index 4b78d2e3e1..829bbcdd9c 100644 --- a/keras_hub/src/models/flux/flux_model.py +++ b/keras_hub/src/models/flux/flux_model.py @@ -1,4 +1,5 @@ import keras +from keras import ops from keras_hub.src.api_export import keras_hub_export from keras_hub.src.models.backbone import Backbone @@ -184,3 +185,59 @@ def __init__( self.output_channels = self.input_channels self.hidden_size = hidden_size self.num_heads = num_heads + + def encode_text_step(self, token_ids, negative_token_ids): + clip_hidden_dim = self.clip_hidden_dim + t5_hidden_dim = self.t5_hidden_dim + + def encode(token_ids): + clip_l_outputs = self.clip_l(token_ids["clip_l"], training=False) + clip_l_projection = self.clip_l_projection( + clip_l_outputs["sequence_output"], + token_ids["clip_l"], + training=False, + ) + + embeddings = ops.pad( + clip_l_outputs, + [[0, 0], [0, 0], [0, t5_hidden_dim - clip_hidden_dim]], + ) + if self.t5 is not None: + t5_outputs = self.t5(token_ids["t5"], training=False) + embeddings = ops.concatenate([embeddings, t5_outputs], axis=-2) + else: + padded_size = self.clip_l.max_sequence_length + embeddings = ops.pad( + embeddings, [[0, 0], [0, padded_size], [0, 0]] + ) + return embeddings, clip_l_projection + + positive_embeddings, positive_pooled_embeddings = encode(token_ids) + negative_embeddings, negative_pooled_embeddings = encode( + negative_token_ids + ) + return ( + positive_embeddings, + negative_embeddings, + positive_pooled_embeddings, + negative_pooled_embeddings, + ) + + def encode_image_step(self, images): + raise NotImplementedError("Not implemented 
yet") + + def add_noise_step(self, latents, noises, step, num_steps): + raise NotImplementedError("Not implemented yet") + + def denoise_step( + self, + latents, + embeddings, + step, + num_steps, + guidance_scale, + ): + raise NotImplementedError("Not implemented yet") + + def decode_step(self, latents): + raise NotImplementedError("Not implemented yet") diff --git a/keras_hub/src/models/flux/flux_text_to_image.py b/keras_hub/src/models/flux/flux_text_to_image.py new file mode 100644 index 0000000000..792718214c --- /dev/null +++ b/keras_hub/src/models/flux/flux_text_to_image.py @@ -0,0 +1,142 @@ +from keras import ops + +from keras_hub.src.api_export import keras_hub_export +from keras_hub.src.models.flux.flux_model import FluxBackbone +from keras_hub.src.models.flux.flux_text_to_image_preprocessor import ( + FluxTextToImagePreprocessor, +) +from keras_hub.src.models.text_to_image import TextToImage + + +@keras_hub_export("keras_hub.models.FluxTextToImage") +class FluxTextToImage(TextToImage): + """An end-to-end Flux model for text-to-image generation. + + This model has a `generate()` method, which generates image based on a + prompt. + + Args: + backbone: A `keras_hub.models.FluxBackbone` instance. + preprocessor: A + `keras_hub.models.FluxTextToImagePreprocessor` instance. + + Examples: + + Use `generate()` to do image generation. + ```python + text_to_image = keras_hub.models.FluxTextToImage.from_preset( + "TBA", height=512, width=512 + ) + text_to_image.generate( + "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k" + ) + + # Generate with batched prompts. + text_to_image.generate( + ["cute wallpaper art of a cat", "cute wallpaper art of a dog"] + ) + + # Generate with different `num_steps` and `guidance_scale`. + text_to_image.generate( + "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k", + num_steps=50, + guidance_scale=5.0, + ) + + # Generate with `negative_prompts`. + text_to_image.generate( + { + "prompts": "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k", + "negative_prompts": "green color", + } + ) + ``` + """ + + backbone_cls = FluxBackbone + preprocessor_cls = FluxTextToImagePreprocessor + + def __init__( + self, + backbone, + preprocessor, + **kwargs, + ): + # === Layers === + self.backbone = backbone + self.preprocessor = preprocessor + + # === Functional Model === + inputs = backbone.input + outputs = backbone.output + super().__init__( + inputs=inputs, + outputs=outputs, + **kwargs, + ) + + def fit(self, *args, **kwargs): + raise NotImplementedError( + "Currently, `fit` is not supported for " "`FluxTextToImage`." + ) + + def generate_step( + self, + latents, + token_ids, + num_steps, + guidance_scale, + ): + """A compilable generation function for batched of inputs. + + This function represents the inner, XLA-compilable, generation function + for batched inputs. + + Args: + latents: A (batch_size, height, width, channels) tensor + containing the latents to start generation from. Typically, this + tensor is sampled from the Gaussian distribution. + token_ids: A pair of (batch_size, num_tokens) tensor containing the + tokens based on the input prompts and negative prompts. + num_steps: int. The number of diffusion steps to take. + guidance_scale: float. The classifier free guidance scale defined in + [Classifier-Free Diffusion Guidance]( + https://arxiv.org/abs/2207.12598). Higher scale encourages to + generate images that are closely linked to prompts, usually at + the expense of lower image quality. 
+ """ + token_ids, negative_token_ids = token_ids + + # Encode prompts. + embeddings = self.backbone.encode_text_step( + token_ids, negative_token_ids + ) + + # Denoise. + def body_fun(step, latents): + return self.backbone.denoise_step( + latents, + embeddings, + step, + num_steps, + guidance_scale, + ) + + latents = ops.fori_loop(0, num_steps, body_fun, latents) + + # Decode. + return self.backbone.decode_step(latents) + + def generate( + self, + inputs, + num_steps=28, + guidance_scale=7.0, + seed=None, + ): + return super().generate( + inputs, + num_steps=num_steps, + guidance_scale=guidance_scale, + seed=seed, + ) diff --git a/keras_hub/src/models/flux/flux_text_to_image_preprocessor.py b/keras_hub/src/models/flux/flux_text_to_image_preprocessor.py new file mode 100644 index 0000000000..6750850d41 --- /dev/null +++ b/keras_hub/src/models/flux/flux_text_to_image_preprocessor.py @@ -0,0 +1,73 @@ +import keras +from keras import layers + +from keras_hub.src.api_export import keras_hub_export +from keras_hub.src.models.flux.flux_model import FluxBackbone +from keras_hub.src.models.preprocessor import Preprocessor + + +@keras_hub_export("keras_hub.models.FluxTextToImagePreprocessor") +class FluxTextToImagePreprocessor(Preprocessor): + """Flux text-to-image model preprocessor. + + This preprocessing layer is meant for use with + `keras_hub.models.FluxTextToImagePreprocessor`. + + For use with generation, the layer exposes one methods + `generate_preprocess()`. + + Args: + clip_l_preprocessor: A `keras_hub.models.CLIPPreprocessor` instance. + t5_preprocessor: A optional `keras_hub.models.T5Preprocessor` instance. + """ + + backbone_cls = FluxBackbone + + def __init__( + self, + clip_l_preprocessor, + t5_preprocessor=None, + **kwargs, + ): + super().__init__(**kwargs) + self.clip_l_preprocessor = clip_l_preprocessor + self.t5_preprocessor = t5_preprocessor + + @property + def sequence_length(self): + """The padded length of model input sequences.""" + return self.clip_l_preprocessor.sequence_length + + def build(self, input_shape): + self.built = True + + def generate_preprocess(self, x): + token_ids = {} + token_ids["clip_l"] = self.clip_l_preprocessor(x)["token_ids"] + if self.t5_preprocessor is not None: + token_ids["t5"] = self.t5_preprocessor(x)["token_ids"] + return token_ids + + def get_config(self): + config = super().get_config() + config.update( + { + "clip_l_preprocessor": layers.serialize( + self.clip_l_preprocessor + ), + "t5_preprocessor": layers.serialize(self.t5_preprocessor), + } + ) + return config + + @classmethod + def from_config(cls, config): + for layer_name in ( + "clip_l_preprocessor", + "t5_preprocessor", + ): + if layer_name in config and isinstance(config[layer_name], dict): + config[layer_name] = keras.layers.deserialize( + config[layer_name] + ) + return cls(**config) From 0003b08ed987fbd7be0d9280510b1fd84819e2ff Mon Sep 17 00:00:00 2001 From: David Landup Date: Tue, 15 Oct 2024 20:39:42 +0900 Subject: [PATCH 63/68] refactor layer names in conversion script --- .../convert_flux_checkpoints.py | 30 +++++++++++-------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/tools/checkpoint_conversion/convert_flux_checkpoints.py b/tools/checkpoint_conversion/convert_flux_checkpoints.py index 73edb0af91..49d90826d2 100644 --- a/tools/checkpoint_conversion/convert_flux_checkpoints.py +++ b/tools/checkpoint_conversion/convert_flux_checkpoints.py @@ -58,36 +58,38 @@ def convert_modulation_weights(weights_dict, keras_model, prefix): def 
convert_doublestreamblock_weights(weights_dict, keras_model, block_idx): # Convert img_mod weights convert_modulation_weights( - weights_dict, keras_model.img_mod, f"double_blocks.{block_idx}.img_mod" + weights_dict, + keras_model.image_mod, + f"double_blocks.{block_idx}.img_mod", ) # Convert txt_mod weights convert_modulation_weights( - weights_dict, keras_model.txt_mod, f"double_blocks.{block_idx}.txt_mod" + weights_dict, keras_model.text_mod, f"double_blocks.{block_idx}.txt_mod" ) # Convert img_attn weights convert_selfattention_weights( weights_dict, - keras_model.img_attn, + keras_model.image_attn, f"double_blocks.{block_idx}.img_attn", ) # Convert txt_attn weights convert_selfattention_weights( weights_dict, - keras_model.txt_attn, + keras_model.text_attention, f"double_blocks.{block_idx}.txt_attn", ) # Convert img_mlp weights (2 layers) - keras_model.img_mlp.layers[0].set_weights( + keras_model.image_mlp.layers[0].set_weights( [ weights_dict[f"double_blocks.{block_idx}.img_mlp.0.weight"].T, weights_dict[f"double_blocks.{block_idx}.img_mlp.0.bias"], ] ) - keras_model.img_mlp.layers[2].set_weights( + keras_model.image_mlp.layers[2].set_weights( [ weights_dict[f"double_blocks.{block_idx}.img_mlp.2.weight"].T, weights_dict[f"double_blocks.{block_idx}.img_mlp.2.bias"], @@ -95,13 +97,13 @@ def convert_doublestreamblock_weights(weights_dict, keras_model, block_idx): ) # Convert txt_mlp weights (2 layers) - keras_model.txt_mlp.layers[0].set_weights( + keras_model.text_mlp.layers[0].set_weights( [ weights_dict[f"double_blocks.{block_idx}.txt_mlp.0.weight"].T, weights_dict[f"double_blocks.{block_idx}.txt_mlp.0.bias"], ] ) - keras_model.txt_mlp.layers[2].set_weights( + keras_model.text_mlp.layers[2].set_weights( [ weights_dict[f"double_blocks.{block_idx}.txt_mlp.2.weight"].T, weights_dict[f"double_blocks.{block_idx}.txt_mlp.2.bias"], @@ -153,26 +155,28 @@ def convert_lastlayer_weights(weights_dict, keras_model): def convert_flux_weights(weights_dict, keras_model): # Convert img_in weights - keras_model.img_in.set_weights( + keras_model.image_input_embedder.set_weights( [weights_dict["img_in.weight"].T, weights_dict["img_in.bias"]] ) # Convert time_in weights (MLPEmbedder) - convert_mlpembedder_weights(weights_dict, keras_model.time_in, "time_in") + convert_mlpembedder_weights( + weights_dict, keras_model.time_input_embedder, "time_in" + ) # Convert vector_in weights (MLPEmbedder) convert_mlpembedder_weights( - weights_dict, keras_model.vector_in, "vector_in" + weights_dict, keras_model.vector_embedder, "vector_in" ) # Convert guidance_in weights (if present) if hasattr(keras_model, "guidance_embed"): convert_mlpembedder_weights( - weights_dict, keras_model.guidance_in, "guidance_in" + weights_dict, keras_model.guidance_input_embedder, "guidance_in" ) # Convert txt_in weights - keras_model.txt_in.set_weights( + keras_model.text_input_embedder.set_weights( [weights_dict["txt_in.weight"].T, weights_dict["txt_in.bias"]] ) From 71b564f34c3bf3670fe817e81561e2ff08168acb Mon Sep 17 00:00:00 2001 From: David Landup Date: Wed, 16 Oct 2024 02:39:10 +0900 Subject: [PATCH 64/68] add backbone tests --- .../src/models/flux/flux_backbone_test.py | 88 +++++++++++++++++++ keras_hub/src/models/flux/flux_model.py | 50 ++++++++--- 2 files changed, 127 insertions(+), 11 deletions(-) create mode 100644 keras_hub/src/models/flux/flux_backbone_test.py diff --git a/keras_hub/src/models/flux/flux_backbone_test.py b/keras_hub/src/models/flux/flux_backbone_test.py new file mode 100644 index 0000000000..dfbf972942 --- 
/dev/null +++ b/keras_hub/src/models/flux/flux_backbone_test.py @@ -0,0 +1,88 @@ +import pytest +from keras import ops + +from keras_hub.src.models.clip.clip_text_encoder import CLIPTextEncoder +from keras_hub.src.models.flux.flux_model import FluxBackbone +from keras_hub.src.models.vae.vae_backbone import VAEBackbone +from keras_hub.src.tests.test_case import TestCase + + +class FluxBackboneTest(TestCase): + def setUp(self): + vae = VAEBackbone( + [32, 32, 32, 32], + [1, 1, 1, 1], + [32, 32, 32, 32], + [1, 1, 1, 1], + # Use `mode` generate a deterministic output. + sampler_method="mode", + name="vae", + ) + clip_l = CLIPTextEncoder( + 20, 32, 32, 2, 2, 64, "quick_gelu", -2, name="clip_l" + ) + self.init_kwargs = { + "input_channels": 256, + "hidden_size": 1024, + "mlp_ratio": 2.0, + "num_heads": 8, + "depth": 4, + "depth_single_blocks": 8, + "axes_dim": [16, 56, 56], + "theta": 10_000, + "use_bias": True, + "guidance_embed": True, + "image_shape": (32, 256), + "text_shape": (32, 256), + "image_ids_shape": (32, 3), + "text_ids_shape": (32, 3), + "timestep_shape": (128,), + "y_shape": (256,), + "guidance_shape": (128,), + } + + self.pipeline_models = { + "vae": vae, + "clip_l": clip_l, + } + + input_data = { + "image": ops.ones((1, 32, 256)), + "image_ids": ops.ones((1, 32, 3)), + "text": ops.ones((1, 32, 256)), + "text_ids": ops.ones((1, 32, 3)), + "y": ops.ones((1, 256)), + # Name is set but for some reason, it's overriden + "keras_tensor_8CLONE": ops.ones((32,)), + "keras_tensor_9CLONE": ops.ones((32,)), + } + + self.input_data = [ + input_data["image"], + input_data["image_ids"], + input_data["text"], + input_data["text_ids"], + input_data["y"], + input_data["keras_tensor_8CLONE"], + input_data["keras_tensor_9CLONE"], + ] + + # backbone.predict() will complain about data cardinality. + # i.e. all data has a batch size of 1, but the + # timesteps and guidance are unbatched and the cardinality + # thus doesn't match. + def test_backbone_basics(self): + self.run_backbone_test( + cls=FluxBackbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output_shape=[32, 32, 256], + ) + + @pytest.mark.large + def test_saved_model(self): + self.run_model_saving_test( + cls=FluxBackbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + ) diff --git a/keras_hub/src/models/flux/flux_model.py b/keras_hub/src/models/flux/flux_model.py index 829bbcdd9c..3de30ccfec 100644 --- a/keras_hub/src/models/flux/flux_model.py +++ b/keras_hub/src/models/flux/flux_model.py @@ -72,18 +72,7 @@ def __init__( guidance_shape=(256,), **kwargs, ): - super().__init__() - if hidden_size % num_heads != 0: - raise ValueError( - f"Hidden size {hidden_size} must be divisible by num_heads {num_heads}" - ) - pe_dim = hidden_size // num_heads - - if sum(axes_dim) != pe_dim: - raise ValueError( - f"Got {axes_dim} but expected positional dim {pe_dim}" - ) # === Layers === self.positional_embedder = EmbedND(theta=theta, axes_dim=axes_dim) self.image_input_embedder = keras.layers.Dense( @@ -131,6 +120,7 @@ def __init__( ) # These should be unbatched. Is there a nicer way to do this in Keras? + # This also overrides the names above. 
timesteps_input = keras.ops.squeeze(timesteps_input, axis=0) guidance_input = keras.ops.squeeze(guidance_input, axis=0) @@ -185,6 +175,44 @@ def __init__( self.output_channels = self.input_channels self.hidden_size = hidden_size self.num_heads = num_heads + self.image_shape = image_shape + self.text_shape = text_shape + self.image_ids_shape = image_ids_shape + self.text_ids_shape = text_ids_shape + self.y_shape = y_shape + self.timestep_shape = timestep_shape + self.guidance_shape = guidance_shape + self.mlp_ratio = mlp_ratio + self.depth = depth + self.depth_single_blocks = depth_single_blocks + self.axes_dim = axes_dim + self.theta = theta + self.use_bias = use_bias + + def get_config(self): + config = super().get_config() + config.update( + { + "input_channels": self.input_channels, + "hidden_size": self.hidden_size, + "mlp_ratio": self.mlp_ratio, + "num_heads": self.num_heads, + "depth": self.depth, + "depth_single_blocks": self.depth_single_blocks, + "axes_dim": self.axes_dim, + "theta": self.theta, + "use_bias": self.use_bias, + "guidance_embed": self.guidance_embed, + "image_shape": self.image_shape, + "text_shape": self.text_shape, + "image_ids_shape": self.image_ids_shape, + "text_ids_shape": self.text_ids_shape, + "timestep_shape": self.timestep_shape, + "guidance_shape": self.guidance_shape, + "y_shape": self.y_shape, + } + ) + return config def encode_text_step(self, token_ids, negative_token_ids): clip_hidden_dim = self.clip_hidden_dim From 7aa93a27473ed28f85c6b8521878bcff785affdf Mon Sep 17 00:00:00 2001 From: David Landup Date: Wed, 16 Oct 2024 02:42:03 +0900 Subject: [PATCH 65/68] raise not implemented on encode, encode_text, etc. methods --- keras_hub/src/models/flux/flux_model.py | 41 ++----------------------- 1 file changed, 2 insertions(+), 39 deletions(-) diff --git a/keras_hub/src/models/flux/flux_model.py b/keras_hub/src/models/flux/flux_model.py index 3de30ccfec..c6dc7cc61f 100644 --- a/keras_hub/src/models/flux/flux_model.py +++ b/keras_hub/src/models/flux/flux_model.py @@ -1,5 +1,4 @@ import keras -from keras import ops from keras_hub.src.api_export import keras_hub_export from keras_hub.src.models.backbone import Backbone @@ -215,41 +214,10 @@ def get_config(self): return config def encode_text_step(self, token_ids, negative_token_ids): - clip_hidden_dim = self.clip_hidden_dim - t5_hidden_dim = self.t5_hidden_dim + raise NotImplementedError("Not implemented yet") def encode(token_ids): - clip_l_outputs = self.clip_l(token_ids["clip_l"], training=False) - clip_l_projection = self.clip_l_projection( - clip_l_outputs["sequence_output"], - token_ids["clip_l"], - training=False, - ) - - embeddings = ops.pad( - clip_l_outputs, - [[0, 0], [0, 0], [0, t5_hidden_dim - clip_hidden_dim]], - ) - if self.t5 is not None: - t5_outputs = self.t5(token_ids["t5"], training=False) - embeddings = ops.concatenate([embeddings, t5_outputs], axis=-2) - else: - padded_size = self.clip_l.max_sequence_length - embeddings = ops.pad( - embeddings, [[0, 0], [0, padded_size], [0, 0]] - ) - return embeddings, clip_l_projection - - positive_embeddings, positive_pooled_embeddings = encode(token_ids) - negative_embeddings, negative_pooled_embeddings = encode( - negative_token_ids - ) - return ( - positive_embeddings, - negative_embeddings, - positive_pooled_embeddings, - negative_pooled_embeddings, - ) + raise NotImplementedError("Not implemented yet") def encode_image_step(self, images): raise NotImplementedError("Not implemented yet") @@ -259,11 +227,6 @@ def add_noise_step(self, latents, 
noises, step, num_steps): def denoise_step( self, - latents, - embeddings, - step, - num_steps, - guidance_scale, ): raise NotImplementedError("Not implemented yet") From b05c94b56d9fcf2c994efe036ddab704b9484ccd Mon Sep 17 00:00:00 2001 From: David Landup Date: Wed, 16 Oct 2024 03:08:04 +0900 Subject: [PATCH 66/68] styling --- .../flux_text_to_image_preprocessor_test.py | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 keras_hub/src/models/flux/flux_text_to_image_preprocessor_test.py diff --git a/keras_hub/src/models/flux/flux_text_to_image_preprocessor_test.py b/keras_hub/src/models/flux/flux_text_to_image_preprocessor_test.py new file mode 100644 index 0000000000..d9a3a9d0a8 --- /dev/null +++ b/keras_hub/src/models/flux/flux_text_to_image_preprocessor_test.py @@ -0,0 +1,51 @@ +import pytest + +from keras_hub.src.models.clip.clip_preprocessor import CLIPPreprocessor +from keras_hub.src.models.clip.clip_tokenizer import CLIPTokenizer +from keras_hub.src.models.flux.flux_text_to_image_preprocessor import ( + FluxTextToImagePreprocessor, +) +from keras_hub.src.tests.test_case import TestCase + + +class FluxTextToImagePreprocessorTest(TestCase): + def setUp(self): + vocab = ["air", "plane", "port"] + vocab += ["<|endoftext|>", "<|startoftext|>"] + vocab = dict([(token, i) for i, token in enumerate(vocab)]) + merges = ["a i", "p l", "n e", "p o", "r t", "ai r", "pl a"] + merges += ["po rt", "pla ne"] + clip_l_tokenizer = CLIPTokenizer( + vocabulary=vocab, merges=merges, pad_with_end_token=True + ) + clip_l_preprocessor = CLIPPreprocessor( + clip_l_tokenizer, sequence_length=8 + ) + self.init_kwargs = { + "clip_l_preprocessor": clip_l_preprocessor, + } + self.input_data = ["airplane"] + + def test_preprocessor_basics(self): + pytest.skip( + reason="TODO: enable after preprocessor flow is figured out" + ) + self.run_preprocessing_layer_test( + cls=FluxTextToImagePreprocessor, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output=( + { + "token_ids": [[1, 4, 9, 5, 7, 2, 0, 0]], + "padding_mask": [[1, 1, 1, 1, 1, 1, 0, 0]], + }, + [[4, 9, 5, 7, 2, 0, 0, 0]], # Labels shifted. + [[1, 1, 1, 1, 1, 0, 0, 0]], # Zero out unlabeled examples. 
+ ), + ) + + def test_generate_preprocess(self): + preprocessor = FluxTextToImagePreprocessor(**self.init_kwargs) + x = preprocessor.generate_preprocess(self.input_data) + self.assertIn("clip_l", x) + self.assertAllEqual(x["clip_l"][0], [4, 0, 1, 3, 3, 3, 3, 3]) From 94f9ffb1b64892966f4e522e4f8809406d420739 Mon Sep 17 00:00:00 2001 From: David Landup Date: Thu, 17 Oct 2024 01:35:34 +0900 Subject: [PATCH 67/68] fix shape hack with a cleaner alternative --- .../src/models/flux/flux_backbone_test.py | 25 +++---------------- keras_hub/src/models/flux/flux_model.py | 17 ++----------- 2 files changed, 6 insertions(+), 36 deletions(-) diff --git a/keras_hub/src/models/flux/flux_backbone_test.py b/keras_hub/src/models/flux/flux_backbone_test.py index dfbf972942..38d5d0deb7 100644 --- a/keras_hub/src/models/flux/flux_backbone_test.py +++ b/keras_hub/src/models/flux/flux_backbone_test.py @@ -36,9 +36,7 @@ def setUp(self): "text_shape": (32, 256), "image_ids_shape": (32, 3), "text_ids_shape": (32, 3), - "timestep_shape": (128,), "y_shape": (256,), - "guidance_shape": (128,), } self.pipeline_models = { @@ -46,37 +44,22 @@ def setUp(self): "clip_l": clip_l, } - input_data = { + self.input_data = { "image": ops.ones((1, 32, 256)), "image_ids": ops.ones((1, 32, 3)), "text": ops.ones((1, 32, 256)), "text_ids": ops.ones((1, 32, 3)), "y": ops.ones((1, 256)), - # Name is set but for some reason, it's overriden - "keras_tensor_8CLONE": ops.ones((32,)), - "keras_tensor_9CLONE": ops.ones((32,)), + "timestepsCLONE": ops.ones((1)), + "guidanceCLONE": ops.ones((1)), } - self.input_data = [ - input_data["image"], - input_data["image_ids"], - input_data["text"], - input_data["text_ids"], - input_data["y"], - input_data["keras_tensor_8CLONE"], - input_data["keras_tensor_9CLONE"], - ] - - # backbone.predict() will complain about data cardinality. - # i.e. all data has a batch size of 1, but the - # timesteps and guidance are unbatched and the cardinality - # thus doesn't match. def test_backbone_basics(self): self.run_backbone_test( cls=FluxBackbone, init_kwargs=self.init_kwargs, input_data=self.input_data, - expected_output_shape=[32, 32, 256], + expected_output_shape=[1, 32, 256], ) @pytest.mark.large diff --git a/keras_hub/src/models/flux/flux_model.py b/keras_hub/src/models/flux/flux_model.py index c6dc7cc61f..8e9fc9bd52 100644 --- a/keras_hub/src/models/flux/flux_model.py +++ b/keras_hub/src/models/flux/flux_model.py @@ -67,8 +67,6 @@ def __init__( image_ids_shape=(None, 768, 3072), text_ids_shape=(None, 768, 3072), y_shape=(None, 128), - timestep_shape=(256,), - guidance_shape=(256,), **kwargs, ): @@ -111,17 +109,8 @@ def __init__( text_input = keras.Input(shape=text_shape, name="text") text_ids = keras.Input(shape=text_ids_shape, name="text_ids") y = keras.Input(shape=y_shape, name="y") - timesteps_input = keras.Input( - shape=timestep_shape, batch_size=1, name="timesteps" - ) - guidance_input = keras.Input( - shape=guidance_shape, batch_size=1, name="guidance" - ) - - # These should be unbatched. Is there a nicer way to do this in Keras? - # This also overrides the names above. 
- timesteps_input = keras.ops.squeeze(timesteps_input, axis=0) - guidance_input = keras.ops.squeeze(guidance_input, axis=0) + timesteps_input = keras.Input(shape=(), name="timesteps") + guidance_input = keras.Input(shape=(), name="guidance") # running on sequences image image = self.image_input_embedder(image_input) @@ -179,8 +168,6 @@ def __init__( self.image_ids_shape = image_ids_shape self.text_ids_shape = text_ids_shape self.y_shape = y_shape - self.timestep_shape = timestep_shape - self.guidance_shape = guidance_shape self.mlp_ratio = mlp_ratio self.depth = depth self.depth_single_blocks = depth_single_blocks From adeb842bd9ca9f0f702299b1e169ce1b43f4438b Mon Sep 17 00:00:00 2001 From: David Landup Date: Thu, 17 Oct 2024 01:42:55 +0900 Subject: [PATCH 68/68] remove unused attributes, fix tests --- .../src/models/flux/flux_backbone_test.py | 6 ++++-- keras_hub/src/models/flux/flux_model.py | 20 +++++++++---------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/keras_hub/src/models/flux/flux_backbone_test.py b/keras_hub/src/models/flux/flux_backbone_test.py index 38d5d0deb7..766fff8ee1 100644 --- a/keras_hub/src/models/flux/flux_backbone_test.py +++ b/keras_hub/src/models/flux/flux_backbone_test.py @@ -50,8 +50,8 @@ def setUp(self): "text": ops.ones((1, 32, 256)), "text_ids": ops.ones((1, 32, 3)), "y": ops.ones((1, 256)), - "timestepsCLONE": ops.ones((1)), - "guidanceCLONE": ops.ones((1)), + "timesteps": ops.ones((1)), + "guidance": ops.ones((1)), } def test_backbone_basics(self): @@ -60,6 +60,8 @@ def test_backbone_basics(self): init_kwargs=self.init_kwargs, input_data=self.input_data, expected_output_shape=[1, 32, 256], + run_mixed_precision_check=False, + run_quantization_check=False, ) @pytest.mark.large diff --git a/keras_hub/src/models/flux/flux_model.py b/keras_hub/src/models/flux/flux_model.py index 8e9fc9bd52..447964d9e1 100644 --- a/keras_hub/src/models/flux/flux_model.py +++ b/keras_hub/src/models/flux/flux_model.py @@ -145,15 +145,15 @@ def __init__( ) # (N, T, patch_size ** 2 * output_channels) super().__init__( - inputs=[ - image_input, - image_ids, - text_input, - text_ids, - y, - timesteps_input, - guidance_input, - ], + inputs={ + "image": image_input, + "image_ids": image_ids, + "text": text_input, + "text_ids": text_ids, + "y": y, + "timesteps": timesteps_input, + "guidance": guidance_input, + }, outputs=image, **kwargs, ) @@ -193,8 +193,6 @@ def get_config(self): "text_shape": self.text_shape, "image_ids_shape": self.image_ids_shape, "text_ids_shape": self.text_ids_shape, - "timestep_shape": self.timestep_shape, - "guidance_shape": self.guidance_shape, "y_shape": self.y_shape, } )
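
With the final patch applied, `FluxBackbone` is a plain functional model over named inputs. A minimal smoke-test sketch, mirroring the configuration in `flux_backbone_test.py` above (the hyperparameters come from that test, not from any released preset):

```python
from keras import ops

from keras_hub.src.models.flux.flux_model import FluxBackbone

backbone = FluxBackbone(
    input_channels=256,
    hidden_size=1024,
    mlp_ratio=2.0,
    num_heads=8,
    depth=4,
    depth_single_blocks=8,
    axes_dim=[16, 56, 56],  # sums to hidden_size // num_heads = 128
    theta=10_000,
    use_bias=True,
    guidance_embed=True,
    image_shape=(32, 256),
    text_shape=(32, 256),
    image_ids_shape=(32, 3),
    text_ids_shape=(32, 3),
    y_shape=(256,),
)

# Inputs are passed as a dict keyed by the names given to `keras.Input`.
output = backbone(
    {
        "image": ops.ones((1, 32, 256)),
        "image_ids": ops.ones((1, 32, 3)),
        "text": ops.ones((1, 32, 256)),
        "text_ids": ops.ones((1, 32, 3)),
        "y": ops.ones((1, 256)),
        "timesteps": ops.ones((1,)),
        "guidance": ops.ones((1,)),
    }
)
print(output.shape)  # (1, 32, 256), matching the backbone test
```
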