From 1b4ffbce530b534b4205ed9bcf4a5f140b866862 Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Tue, 28 May 2024 19:33:43 -0400 Subject: [PATCH 01/35] snapshot --- library/src/auxiliary/bdsqr_host.hpp | 2077 ++++++++++++++++++++++++++ 1 file changed, 2077 insertions(+) create mode 100644 library/src/auxiliary/bdsqr_host.hpp diff --git a/library/src/auxiliary/bdsqr_host.hpp b/library/src/auxiliary/bdsqr_host.hpp new file mode 100644 index 000000000..d40be74ab --- /dev/null +++ b/library/src/auxiliary/bdsqr_host.hpp @@ -0,0 +1,2077 @@ +#define USE_VT + +/**************************************************************************** + * Derived from the BSD3-licensed + * LAPACK routine (version 3.7.1) -- + * Univ. of Tennessee, Univ. of California Berkeley, + * Univ. of Colorado Denver and NAG Ltd.. + * June 2017 + * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * *************************************************************************/ + +#pragma once + +#include +#include +#include +#include +#include + +#include "hip/hip_runtime.h" +#include "hip/hip_runtime_api.h" + + +ROCSOLVER_BEGIN_NAMESPACE + +#ifndef HIP_CHECK +#define HIP_CHECK(fcn) \ + { \ + hipError_t const istat = (fcn); \ + assert(istat == hipSuccess); \ + } +#endif + + +template +__global__ static void lasr_kernel(char const side, char const pivot, + char const direct, I const m, I const n, + S const *const c_, S const *const s_, + T *const A_, I const lda) { + + const auto nblocks = hipGridDim_x; + const auto nthreads_per_block = hipBlockDim_x; + const auto nthreads = nblocks * nthreads_per_block; + const auto tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + const auto i_inc = nthreads; + const auto ij_nb = nthreads; + const auto ij_start = tid; + + auto max = [](auto x, auto y) { return ((x > y) ? x : y); }; + auto min = [](auto x, auto y) { return ((x < y) ? 
x : y); }; + + auto indx2f = [](auto i, auto j, auto lda) -> int64_t { + assert((1 <= i)); + assert((1 <= lda)); + assert((1 <= j)); + + return ((i - 1) + (j - 1) * int64_t(lda)); + }; + + auto indx1f = [](auto i) -> int64_t { + assert((1 <= i)); + return (i - int64_t(1)); + }; + + auto c = [&](auto i) -> const S & { return (c_[indx1f(i)]); }; + auto s = [&](auto i) -> const S & { return (s_[indx1f(i)]); }; + auto A = [&](auto i, auto j) -> T & { return (A_[indx2f(i, j, lda)]); }; + + const S one = 1; + const S zero = 0; + + // ---------------- + // check arguments + // ---------------- + + const bool is_side_Left = (side == 'L') || (side == 'l'); + const bool is_side_Right = (side == 'R') || (side == 'r'); + + const bool is_pivot_Variable = (pivot == 'V') || (pivot == 'v'); + const bool is_pivot_Bottom = (pivot == 'B') || (pivot == 'b'); + const bool is_pivot_Top = (pivot == 'T') || (pivot == 't'); + + const bool is_direct_Forward = (direct == 'F') || (direct == 'f'); + const bool is_direct_Backward = (direct == 'B') || (direct == 'b'); + + { + const bool isok_side = is_side_Left || is_side_Right; + const bool isok_pivot = + is_pivot_Variable || is_pivot_Bottom || is_pivot_Top; + const bool isok_direct = is_direct_Forward || is_direct_Backward; + + const I info = (!isok_side) ? 1 + : (!isok_pivot) ? 2 + : (!isok_direct) ? 3 + : (m < 0) ? 4 + : (n < 0) ? 5 + : (c_ == nullptr) ? 6 + : (s_ == nullptr) ? 7 + : (A_ == nullptr) ? 8 + : (lda < max(1, m)) ? 9 + : 0; + if (info != 0) + return; + }; + + { + const bool has_work = (m >= 1) && (n >= 1); + if (!has_work) { + return; + }; + }; + + if (is_side_Left && is_pivot_Variable && is_direct_Forward) { + // ----------------------------- + // A := P*A + // Variable pivot, the plane (k,k+1) + // P = P(z-1) * ... * P(2) * P(1) + // ----------------------------- + { + + for (I j = 1; j <= (m - 1); j++) { + const auto ctemp = c(j); + const auto stemp = s(j); + if ((ctemp != one) || (stemp != zero)) { + for (I i = 1 + tid; i <= n; i += i_inc) { + const auto temp = A(j + 1, i); + A(j + 1, i) = ctemp * temp - stemp * A(j, i); + A(j, i) = stemp * temp + ctemp * A(j, i); + } + }; + }; + }; + + return; + }; + + if (is_side_Left && is_pivot_Variable && is_direct_Backward) { + // ----------------------------- + // A := P*A + // Variable pivot, the plane (k,k+1) + // P = P(1)*P(2)*...*P(z-1) + // ----------------------------- + + auto const jend = (m - 1); + auto const jstart = 1; + auto const istart = 1; + auto const iend = n; + + for (I j = jend; j >= jstart; j--) { + const auto ctemp = c(j); + const auto stemp = s(j); + if ((ctemp != one) || (stemp != zero)) { + for (I i = istart + tid; i <= iend; i += i_inc) { + const auto temp = A(j + 1, i); + A(j + 1, i) = ctemp * temp - stemp * A(j, i); + A(j, i) = stemp * temp + ctemp * A(j, i); + }; + }; + }; + + return; + }; + + if (is_side_Left && is_pivot_Top && is_direct_Forward) { + // ----------------------------- + // A := P*A + // Top pivot, the plane (1,k+1) + // P = P(z-1) * ... 
* P(2) * P(1) + // ----------------------------- + { + for (I j = 2; j <= m; j++) { + const auto ctemp = c(j - 1); + const auto stemp = s(j - 1); + for (I i = 1 + tid; i <= n; i += i_inc) { + const auto temp = A(j, i); + A(j, i) = ctemp * temp - stemp * A(1, i); + A(1, i) = stemp * temp + ctemp * A(1, i); + }; + }; + }; + + return; + }; + + if (is_side_Left && is_pivot_Top && is_direct_Backward) { + // ----------------------------- + // A := P*A + // Top pivot, the plane (1,k+1) + // P = P(1)*P(2)*...*P(z-1) + // ----------------------------- + { + + auto const jend = m; + auto const jstart = 2; + auto const istart = 1; + auto const iend = n; + + for (I j = jend; j >= jstart; j--) { + const auto ctemp = c(j - 1); + const auto stemp = s(j - 1); + if ((ctemp != one) || (stemp != zero)) { + for (I i = istart + tid; i <= iend; i += i_inc) { + const auto temp = A(j, i); + + A(j, i) = ctemp * temp - stemp * A(1, i); + A(1, i) = stemp * temp + ctemp * A(1, i); + }; + }; + }; + } + + return; + }; + + if (is_side_Left && is_pivot_Bottom && is_direct_Forward) { + // ----------------------------- + // A := P*A + // Bottom pivot, the plane (k,z) + // P = P(z-1) * ... * P(2) * P(1) + // ----------------------------- + { + + auto const jstart = 1; + auto const jend = (m - 1); + auto const istart = 1; + auto const iend = n; + + for (I j = jstart; j <= jend; j++) { + const auto ctemp = c(j); + const auto stemp = s(j); + if ((ctemp != one) || (stemp != zero)) { + for (I i = istart + tid; i <= iend; i += i_inc) { + const auto temp = A(j, i); + A(j, i) = stemp * A(m, i) + ctemp * temp; + A(m, i) = ctemp * A(m, i) - stemp * temp; + }; + }; + }; + } + + return; + }; + + if (is_side_Left && is_pivot_Bottom && is_direct_Backward) { + // ----------------------------- + // A := P*A + // Bottom pivot, the plane (k,z) + // P = P(1)*P(2)*...*P(z-1) + // ----------------------------- + { + + auto const jend = (m - 1); + auto const jstart = 1; + auto const istart = 1; + auto const iend = n; + + for (I j = jend; j >= jstart; j--) { + const auto ctemp = c(j); + const auto stemp = s(j); + if ((ctemp != one) || (stemp != zero)) { + + for (I i = istart + tid; i <= iend; i += i_inc) { + const auto temp = A(j, i); + A(j, i) = stemp * A(m, i) + ctemp * temp; + A(m, i) = ctemp * A(m, i) - stemp * temp; + }; + }; + }; + } + + return; + }; + + if (is_side_Right && is_pivot_Variable && is_direct_Forward) { + // ----------------------------- + // A := A*P**T + // Variable pivot, the plane (k,k+1) + // P = P(z-1) * ... 
* P(2) * P(1) + // ----------------------------- + + { + + auto const jstart = 1; + auto const jend = (n - 1); + auto const istart = 1; + auto const iend = m; + + for (I j = jstart; j <= jend; j++) { + const auto ctemp = c(j); + const auto stemp = s(j); + if ((ctemp != one) || (stemp != zero)) { + for (I i = istart + tid; i <= iend; i += i_inc) { + const auto temp = A(i, j + 1); + A(i, j + 1) = ctemp * temp - stemp * A(i, j); + A(i, j) = stemp * temp + ctemp * A(i, j); + }; + }; + }; + } + + return; + }; + + if (is_side_Right && is_pivot_Variable && is_direct_Backward) { + // ----------------------------- + // A := A*P**T + // Variable pivot, the plane (k,k+1) + // P = P(1)*P(2)*...*P(z-1) + // ----------------------------- + + { + + auto const jend = (n - 1); + auto const jstart = 1; + auto const istart = 1; + auto const iend = m; + + for (I j = jend; j >= jstart; j--) { + const auto ctemp = c(j); + const auto stemp = s(j); + if ((ctemp != one) || (stemp != zero)) { + + for (I i = istart + tid; i <= iend; i += i_inc) { + const auto temp = A(i, j + 1); + A(i, j + 1) = ctemp * temp - stemp * A(i, j); + A(i, j) = stemp * temp + ctemp * A(i, j); + }; + }; + }; + } + return; + }; + + if (is_side_Right && is_pivot_Top && is_direct_Forward) { + // ----------------------------- + // A := A*P**T + // Top pivot, the plane (1,k+1) + // P = P(z-1) * ... * P(2) * P(1) + // ----------------------------- + + { + + auto const jstart = 2; + auto const jend = n; + auto const istart = 1; + auto const iend = m; + + for (I j = jstart; j <= jend; j++) { + const auto ctemp = c(j - 1); + const auto stemp = s(j - 1); + if ((ctemp != one) || (stemp != zero)) { + for (I i = istart + tid; i <= iend; i += i_inc) { + const auto temp = A(i, j); + + A(i, j) = ctemp * temp - stemp * A(i, 1); + A(i, 1) = stemp * temp + ctemp * A(i, 1); + }; + }; + }; + } + + return; + }; + + if (is_side_Right && is_pivot_Top && is_direct_Backward) { + // ----------------------------- + // A := A*P**T + // Top pivot, the plane (1,k+1) + // P = P(1)*P(2)*...*P(z-1) + // ----------------------------- + + { + + auto const jend = n; + auto const jstart = 2; + auto const istart = 1; + auto const iend = m; + + for (I j = jend; j >= jstart; j--) { + const auto ctemp = c(j - 1); + const auto stemp = s(j - 1); + if ((ctemp != one) || (stemp != zero)) { + for (I i = istart + tid; i <= iend; i += i_inc) { + const auto temp = A(i, j); + + A(i, j) = ctemp * temp - stemp * A(i, 1); + A(i, 1) = stemp * temp + ctemp * A(i, 1); + }; + }; + }; + } + + return; + }; + + if (is_side_Right && is_pivot_Bottom && is_direct_Forward) { + // ----------------------------- + // A := A*P**T + // Bottom pivot, the plane (k,z) + // P = P(z-1) * ... 
* P(2) * P(1) + // ----------------------------- + + { + + auto const jstart = 1; + auto const jend = (n - 1); + auto const istart = 1; + auto const iend = m; + + for (I j = jstart; j <= jend; j++) { + const auto ctemp = c(j); + const auto stemp = s(j); + if ((ctemp != one) || (stemp != zero)) { + for (I i = istart + tid; i <= iend; i += i_inc) { + const auto temp = A(i, j); + + A(i, j) = stemp * A(i, n) + ctemp * temp; + A(i, n) = ctemp * A(i, n) - stemp * temp; + }; + }; + }; + } + + return; + }; + + if (is_side_Right && is_pivot_Bottom && is_direct_Backward) { + // ----------------------------- + // A := A*P**T + // Bottom pivot, the plane (k,z) + // P = P(1)*P(2)*...*P(z-1) + // ----------------------------- + + { + + auto const jend = (n - 1); + auto const jstart = 1; + auto const istart = 1; + auto const iend = m; + + for (I j = jend; j >= jstart; j--) { + const auto ctemp = c(j); + const auto stemp = s(j); + if ((ctemp != one) || (stemp != zero)) { + for (I i = istart + tid; i <= iend; i += i_inc) { + const auto temp = A(i, j); + A(i, j) = stemp * A(i, n) + ctemp * temp; + A(i, n) = ctemp * A(i, n) - stemp * temp; + }; + }; + }; + } + + return; + }; + + return; +} +template +static void lasr_template_gpu(char const side, char const pivot, + char const direct, I const m, I const n, + S const *const c_, S const *const s_, T *const A_, + I const lda, hipStream_t stream = 0) { + + auto const nthreads = 2 * warpSize; + + bool const is_left_side = (side == 'L') || (side == 'l'); + auto const mn = (is_left_side) ? n : m; + + auto const nblocks = (mn - 1) / nthreads + 1; + hipLaunchKernelGGL((lasr_kernel), dim3(nblocks, 1, 1), + dim3(nthreads, 1, 1), 0, stream, side, pivot, direct, m, n, + c_, s_, A_, lda); +} + +template +__global__ static void rot_kernel(I const n, T *const x, I const incx, + T *const y, I const incy, S const c, + S const s) { + if (n <= 0) + return; + + I const i_start = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + I const i_inc = hipBlockDim_x * hipGridDim_x; + + if ((incx == 1) && (incy == 1)) { + // ------------ + // special case + // ------------ + for (I i = i_start; i < n; i += i_inc) { + auto const temp = c * x[i] + s * y[i]; + y[i] = c * y[i] - s * x[i]; + x[i] = temp; + } + } else { + // --------------------------- + // code for unequal increments + // --------------------------- + + for (auto i = i_start; i < n; i += i_inc) { + auto const ix = 0 + i * static_cast(incx); + auto const iy = 0 + i * static_cast(incy); + auto const temp = c * x[ix] + s * y[iy]; + y[iy] = c * y[iy] - s * x[ix]; + x[ix] = temp; + } + } +} + +template +static void rot_template(I const n, T *x, I const incx, T *y, I const incy, + S const c, S const s, hipStream_t stream) { + auto nthreads = warpSize * 2; + auto nblocks = (n - 1) / nthreads + 1; + + hipLaunchKernelGGL((rot_kernel), dim3(nblocks, 1, 1), + dim3(nthreads, 1, 1), 0, stream, n, x, incx, y, incy, c, + s); +} + +template +__global__ static void scal_kernel(I const n, S const da, T *const x, + I const incx) { + + if (n <= 0) + return; + + I const i_start = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + I const i_inc = hipBlockDim_x * hipGridDim_x; + + S const zero = 0; + bool const is_da_zero = (da == zero); + if (incx == 1) { + for (I i = i_start; i < n; i += i_inc) { + x[i] = (is_da_zero) ? 
zero : da * x[i]; + } + } else { + // --------------------------- + // code for non-unit increments + // --------------------------- + + for (I i = i_start; i < n; i += i_inc) { + auto const ix = 0 + i * static_cast(incx); + x[ix] = (is_da_zero) ? zero : da * x[ix]; + } + } +} + +template +static void scal_template(I const n, S const da, T *const x, I const incx, + hipStream_t stream) { + auto nthreads = warpSize * 2; + auto nblocks = (n - 1) / nthreads + 1; + + hipLaunchKernelGGL((scal_kernel), dim3(nblocks, 1, 1), + dim3(nthreads, 1, 1), 0, stream, n, da, x, incx); +} + +template +__global__ static void swap_kernel(I const n, T *const x, I const incx, + T *const y, I const incy) { + if (n <= 0) + return; + + I const i_start = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + I const i_inc = hipBlockDim_x * hipGridDim_x; + + if ((incx == 1) && (incy == 1)) { + // ------------ + // special case + // ------------ + for (I i = i_start; i < n; i += i_inc) { + auto const temp = y[i]; + y[i] = x[i]; + x[i] = temp; + } + } else { + // --------------------------- + // code for unequal increments + // --------------------------- + + for (I i = i_start; i < n; i += i_inc) { + auto const ix = 0 + i * static_cast(incx); + auto const iy = 0 + i * static_cast(incy); + + auto const temp = y[iy]; + y[iy] = x[ix]; + x[ix] = temp; + } + } +} + +template +static void swap_template(I const n, T *x, I const incx, T *y, I const incy, + hipStream_t stream) { + auto nthreads = warpSize * 2; + auto nblocks = (n - 1) / nthreads + 1; + + hipLaunchKernelGGL((swap_kernel), dim3(nblocks, 1, 1), + dim3(nthreads, 1, 1), 0, stream, n, x, incx, y, incy); +} + +extern "C" { + +double dlamch_(char *cmach); +float slamch_(char *cmach); + +void zswap_(int *n, std::complex *zx, int *incx, + std::complex *zy, int *incy); + +void cswap_(int *n, std::complex *zx, int *incx, std::complex *zy, + int *incy); + +void dswap_(int *n, double *zx, int *incx, double *zy, int *incy); + +void sswap_(int *n, float *zx, int *incx, float *zy, int *incy); + +void dlasq1_(int *n, double *D_, double *E_, double *rwork_, int *info_arg); +void slasq1_(int *n, float *D_, float *E_, float *rwork_, int *info_arg); + +void zlasr_(char *side, char *pivot, char *direct, int *m, int *n, double *c, + double *s, std::complex *A, int *lda); +void clasr_(char *side, char *pivot, char *direct, int *m, int *n, float *c, + float *s, std::complex *A, int *lda); +void slasr_(char *side, char *pivot, char *direct, int *m, int *n, float *c, + float *s, float *A, int *lda); +void dlasr_(char *side, char *pivot, char *direct, int *m, int *n, double *c, + double *s, double *A, int *lda); + +void dlasv2_(double *f, double *g, double *h, double *ssmin, double *ssmax, + double *snr, double *csr, double *snl, double *csl); +void slasv2_(float *f, float *g, float *h, float *ssmin, float *ssmax, + float *snr, float *csr, float *snl, float *csl); + +void zdrot_(int *n, std::complex *zx, int *incx, + std::complex *zy, int *incy, double *c, double *s); + +void csrot_(int *n, std::complex *zx, int *incx, std::complex *zy, + int *incy, float *c, float *s); + +void drot_(int *n, double *dx, int *incx, double *dy, int *incy, double *c, + double *s); + +void srot_(int *n, float *dx, int *incx, float *dy, int *incy, float *c, + float *s); + +void zdscal_(int *n, double *da, std::complex *zx, int *incx); +void csscal_(int *n, float *da, std::complex *zx, int *incx); +void dscal_(int *n, double *da, double *zx, int *incx); +void sscal_(int *n, float *da, float *zx, int *incx); + +void 
dlartg_(double *f, double *g, double *c, double *s, double *r); +void slartg_(float *f, float *g, float *c, float *s, float *r); + +void dlas2_(double *f, double *g, double *h, double *ssmin, double *ssmax); +void slas2_(float *f, float *g, float *h, float *ssmin, float *ssmax); +}; + +static void call_lamch(char &cmach_arg, double &eps) { + char cmach = cmach_arg; + eps = dlamch_(&cmach); +} + +static void call_lamch(char &cmach_arg, float &eps) { + char cmach = cmach_arg; + eps = slamch_(&cmach); +} + +static void call_swap(int &n, std::complex &zx, int &incx, + std::complex &zy, int &incy) { + zswap_(&n, &zx, &incx, &zy, &incy); +} + +static void call_swap(int &n, std::complex &zx, int &incx, + std::complex &zy, int &incy) { + cswap_(&n, &zx, &incx, &zy, &incy); +} + +static void call_swap(int &n, float &zx, int &incx, float &zy, int &incy) { + sswap_(&n, &zx, &incx, &zy, &incy); +} + +static void call_swap(int &n, double &zx, int &incx, double &zy, int &incy) { + dswap_(&n, &zx, &incx, &zy, &incy); +} + +static void call_las2(double &f, double &g, double &h, double &ssmin, + double &ssmax) { + dlas2_(&f, &g, &h, &ssmin, &ssmax); +} + +static void call_las2(float &f, float &g, float &h, float &ssmin, + float &ssmax) { + slas2_(&f, &g, &h, &ssmin, &ssmax); +} + +static void call_lartg(double &f, double &g, double &c, double &s, double &r) { + dlartg_(&f, &g, &c, &s, &r); +} + +static void call_lartg(float &f, float &g, float &c, float &s, float &r) { + slartg_(&f, &g, &c, &s, &r); +} + +static void call_scal(int &n, double &da, std::complex &zx, int &incx) { + zdscal_(&n, &da, &zx, &incx); +} + +static void call_scal(int &n, float &da, std::complex &zx, int &incx) { + csscal_(&n, &da, &zx, &incx); +} + +static void call_scal(int &n, double &da, double &zx, int &incx) { + dscal_(&n, &da, &zx, &incx); +} + +static void call_scal(int &n, float &da, float &zx, int &incx) { + sscal_(&n, &da, &zx, &incx); +} + +static void call_rot(int &n, std::complex &zx, int &incx, + std::complex &zy, int &incy, double &c, + double &s) { + zdrot_(&n, &zx, &incx, &zy, &incy, &c, &s); +} + +static void call_rot(int &n, std::complex &zx, int &incx, + std::complex &zy, int &incy, float &c, float &s) { + csrot_(&n, &zx, &incx, &zy, &incy, &c, &s); +} + +static void call_rot(int &n, double &dx, int &incx, double &dy, int &incy, + double &c, double &s) { + drot_(&n, &dx, &incx, &dy, &incy, &c, &s); +} + +static void call_rot(int &n, float &dx, int &incx, float &dy, int &incy, + float &c, float &s) { + srot_(&n, &dx, &incx, &dy, &incy, &c, &s); +} + +static void call_lasv2(double &f, double &g, double &h, double &ssmin, + double &ssmax, double &snr, double &csr, double &snl, + double &csl) { + dlasv2_(&f, &g, &h, &ssmin, &ssmax, &snr, &csr, &snl, &csl); +} + +static void call_lasv2(float &f, float &g, float &h, float &ssmin, float &ssmax, + float &snr, float &csr, float &snl, float &csl) { + slasv2_(&f, &g, &h, &ssmin, &ssmax, &snr, &csr, &snl, &csl); +} + +static void call_lasq1(int &n, double &D_, double &E_, double &rwork_, + int &info_arg) { + dlasq1_(&n, &D_, &E_, &rwork_, &info_arg); +}; + +static void call_lasq1(int &n, float &D_, float &E_, float &rwork_, + int &info_arg) { + slasq1_(&n, &D_, &E_, &rwork_, &info_arg); +}; + +static void call_lasr(char &side, char &pivot, char &direct, int &m, int &n, + double &c, double &s, std::complex &A, int &lda) { + zlasr_(&side, &pivot, &direct, &m, &n, &c, &s, &A, &lda); +}; + +static void call_lasr(char &side, char &pivot, char &direct, int &m, int &n, + float &c, 
float &s, std::complex &A, int &lda) { + clasr_(&side, &pivot, &direct, &m, &n, &c, &s, &A, &lda); +}; + +static void call_lasr(char &side, char &pivot, char &direct, int &m, int &n, + float &c, float &s, float &A, int &lda) { + slasr_(&side, &pivot, &direct, &m, &n, &c, &s, &A, &lda); +}; + +static void call_lasr(char &side, char &pivot, char &direct, int &m, int &n, + double &c, double &s, double &A, int &lda) { + dlasr_(&side, &pivot, &direct, &m, &n, &c, &s, &A, &lda); +}; + +template +static void bdsqr_single_template(char uplo, I n, +#ifdef USE_VT + I ncvt, +#else + I ncv, +#endif + I nru, I ncc, T *d_, + T *e_, +#ifdef USE_VT + T *vt_, I ldvt, +#else + T *v_, I ldv, +#endif + T *u_, I ldu, T *c_, + int ldc, S *work_, I &info, + S *dwork = nullptr, hipStream_t stream = 0) { + + bool const use_gpu = (dwork != nullptr); + + S const zero = 0; + S const one = 1; + S negone = -1; + S const hndrd = 100; + S const hndrth = one / hndrd; + S const ten = 10; + S const eight = 8; + S const meight = -one / eight; + I const maxitr = 6; + I ione = 1; + + I nrv = n; + + bool const lower = (uplo == 'L') || (uplo == 'l'); + bool const upper = (uplo == 'U') || (uplo == 'u'); + /* + * rotate is true if any singular vectors desired, false otherwise + */ +#ifdef USE_VT + bool const rotate = (ncvt > 0) || (nru > 0) || (ncc > 0); +#else + bool const rotate = (nrv > 0) || (nru > 0) || (ncc > 0); +#endif + + I i = 0, idir = 0, isub = 0, iter = 0, iterdivn = 0, j = 0, ll = 0, lll = 0, + m = 0, maxitdivn = 0, nm1 = 0, nm12 = 0, nm13 = 0, oldll = 0, oldm = 0; + + I const nrc = n; // number of rows in C matrix + I const nrvt = n; // number of rows in VT matrix + I const ncu = n; // number of columns in U matrix + + S abse = 0, abss = 0, cosl = 0, cosr = 0, cs = 0, eps = 0, f = 0, g = 0, + h = 0, mu = 0, oldcs = 0, oldsn = 0, r = 0, shift = 0, sigmn = 0, sigmx = 0, + sinl = 0, sinr = 0, sll = 0, smax = 0, smin = 0, sminl = 0, sminoa = 0, + sn = 0, thresh = 0, tol = 0, tolmul = 0, unfl = 0; + + /* .. + * .. external functions .. + logical lsame + double precision dlamch + external lsame, dlamch + * .. + * .. external subroutines .. + external dlartg, dlas2, dlasq1, dlasr, dlasv2, drot, + $ dscal, dswap, xerbla + * .. + * .. intrinsic functions .. + intrinsic abs, dble, max, min, sign, sqrt + */ + + auto call_swap_gpu = [=](I n, T &x, I incx, T &y, I incy) { + swap_template(n, &x, incx, &y, incy, stream); + }; + + auto call_rot_gpu = [=](I n, T &x, I incx, T &y, I incy, S cosl, S sinl) { + rot_template(n, &x, incx, &y, incy, cosl, sinl, stream); + }; + + auto call_scal_gpu = [=](I n, auto da, T &x, I incx) { + scal_template(n, da, &x, incx, stream); + }; + + auto call_lasr_gpu = [=](char const side, char const pivot, char const direct, + I const m, I const n, S &c, S &s, T &A, I const lda, + S *const dwork, hipStream_t stream) { + bool const is_left_side = (side == 'L') || (side == 'l'); + auto const mn = (is_left_side) ? m : n; + auto const mn_m1 = (mn - 1); + S *const dc = dwork; + S *const ds = dwork + mn_m1; + HIP_CHECK(hipMemcpyAsync(dc, &c, sizeof(S) * mn_m1, hipMemcpyHostToDevice, + stream)); + HIP_CHECK(hipMemcpyAsync(ds, &s, sizeof(S) * mn_m1, hipMemcpyHostToDevice, + stream)); + + lasr_template_gpu(side, pivot, direct, m, n, dc, ds, &A, lda, stream); + HIP_CHECK(hipStreamSynchronize(stream)); + }; + + auto abs = [](auto x) { return ((x >= 0) ? 
x : (-x)); }; + + auto indx2f = [](auto i, auto j, auto ld) -> int64_t { + assert((1 <= i) && (i <= ld)); + assert((1 <= j)); + return ((i - 1) + (j - 1) * int64_t(ld)); + }; + + auto d = [=](auto i) -> T & { + assert((1 <= i) && (i <= n)); + return (d_[i - 1]); + }; + + auto e = [=](auto i) -> T & { + assert((1 <= i) && (i <= (n - 1))); + return (e_[i - 1]); + }; + auto work = [=](auto i) -> S & { return (work_[i - 1]); }; + + auto c = [=](auto i, auto j) -> T & { + assert((1 <= i) && (i <= nrc) && (nrc <= ldc)); + assert((1 <= j) && (j <= ncc)); + return (c_[indx2f(i, j, ldc)]); + }; + + auto u = [=](auto i, auto j) -> T & { + assert((1 <= i) && (i <= nru) && (nru <= ldu)); + assert((1 <= j) && (j <= ncu)); + return (u_[indx2f(i, j, ldu)]); + }; + +#ifdef USE_VT + auto vt = [=](auto i, auto j) -> T & { + assert((1 <= i) && (i <= nrvt) && (nrvt <= ldvt)); + assert((1 <= j) && (j <= ncvt)); + return (vt_[indx2f(i, j, ldvt)]); + }; +#else + auto v = [=](auto i, auto j) -> T & { + assert((1 <= i) && (i <= nrv) && (nrv <= ldv)); + assert((1 <= j) && (j <= ncv)); + return (v_[indx2f(i, j, ldv)]); + }; +#endif + + // --------------------------- + // emulate Fortran intrinsics + // --------------------------- + auto sign = [](auto a, auto b) { + auto const abs_a = std::abs(a); + return ((b >= 0) ? abs_a : -abs_a); + }; + + auto dble = [](auto x) { return (static_cast(x)); }; + + auto max = [](auto a, auto b) { return ((a > b) ? a : b); }; + + auto min = [](auto a, auto b) { return ((a < b) ? a : b); }; + + auto sqrt = [](auto x) { return (std::sqrt(x)); }; + + /* .. + * .. executable statements .. + * + * test the input parameters. + * + */ + + info = (!upper) && (!lower) ? -1 + : (n < 0) ? -2 +#ifdef USE_VT + : (ncvt < 0) ? -3 +#else + : (nrv < 0) ? -3 +#endif + : (nru < 0) ? -4 + : (ncc < 0) ? -5 + : (ldu < max(1, nru)) ? 
-11 + : 0; + + if (info != 0) + return; + + if (n == 0) + return; + if (n == 1) + goto L160; + /* + * if no singular vectors desired, use qd algorithm + */ + if (!rotate) { + call_lasq1(n, d(1), e(1), work(1), info); + /* + * if info equals 2, dqds didn't finish, try to finish + */ + if (info != 2) + return; + info = 0; + } + + nm1 = n - 1; + nm12 = nm1 + nm1; + nm13 = nm12 + nm1; + idir = 0; + /* + * get machine constants + * + */ + { + char cmach_eps = 'E'; + char cmach_unfl = 'S'; + call_lamch(cmach_eps, eps); + call_lamch(cmach_unfl, unfl); + } + /* + * if matrix lower bidiagonal, rotate to be upper bidiagonal + * by applying givens rotations on the left + */ + + if (lower) { + // do 10 i = 1, n - 1 + for (i = 1; i <= (n - 1); i++) { + call_lartg(d(i), e(i), cs, sn, r); + d(i) = r; + e(i) = sn * d(i + 1); + d(i + 1) = cs * d(i + 1); + work(i) = cs; + work(nm1 + i) = sn; + } + L10: + + /* + * update singular vectors if desired + */ + + if (nru > 0) { + // call_lasr( 'r', 'v', 'f', nru, n, work( 1 ), work( n ), u, ldu ); + char side = 'R'; + char pivot = 'V'; + char direct = 'F'; + if (use_gpu) { + + call_lasr_gpu(side, pivot, direct, nru, n, work(1), work(n), u(1, 1), + ldu, dwork, stream); + } else { + call_lasr(side, pivot, direct, nru, n, work(1), work(n), u(1, 1), ldu); + } + } + if (ncc > 0) { + // call_lasr( 'l', 'v', 'f', n, ncc, work( 1 ), work( n ), c, ldc ); + char side = 'L'; + char pivot = 'V'; + char direct = 'F'; + if (use_gpu) { + call_lasr_gpu(side, pivot, direct, n, ncc, work(1), work(n), c(1, 1), + ldc, dwork, stream); + } else { + call_lasr(side, pivot, direct, n, ncc, work(1), work(n), c(1, 1), ldc); + } + } + } + /* + * compute singular values to relative accuracy tol + * (by setting tol to be negative, algorithm will compute + * singular values to absolute accuracy abs(tol)*norm(input matrix)) + */ + + tolmul = max(ten, min(hndrd, pow(eps, meight))); + tol = tolmul * eps; + + /* + * compute approximate maximum, minimum singular values + */ + + /* + smax = zero + do 20 i = 1, n + smax = max( smax, abs( d( i ) ) ) + L20: + do 30 i = 1, n - 1 + smax = max( smax, abs( e( i ) ) ) + L30: + */ + smax = zero; + // do 20 i = 1, n + for (i = 1; i <= n; i++) { + smax = max(smax, abs(d(i))); + } +L20: + // do 30 i = 1, n - 1 + for (i = 1; i <= (n - 1); i++) { + smax = max(smax, abs(e(i))); + } +L30: + + sminl = zero; + if (tol >= zero) { + /* + * relative accuracy desired + */ + + sminoa = abs(d(1)); + if (sminoa == zero) + goto L50; + mu = sminoa; + // do 40 i = 2, n + for (i = 2; i <= n; i++) { + mu = abs(d(i)) * (mu / (mu + abs(e(i - 1)))); + sminoa = min(sminoa, mu); + if (sminoa == zero) + goto L50; + } + L40: + L50: + + sminoa = sminoa / sqrt(dble(n)); + thresh = max(tol * sminoa, maxitr * (n * (n * unfl))); + } else { + /* + * absolute accuracy desired + */ + + thresh = max(abs(tol) * smax, maxitr * (n * (n * unfl))); + } + /* + * prepare for main iteration loop for the singular values + * (maxit is the maximum number of passes through the inner + * loop permitted before nonconvergence signalled.) 
+ */ + maxitdivn = maxitr * n; + iterdivn = 0; + iter = -1; + oldll = -1; + oldm = -1; + /* + * m points to last element of unconverged part of matrix + */ + m = n; + /* + * begin main iteration loop + */ +L60: + /* + * check for convergence or exceeding iteration count + */ + if (m <= 1) + goto L160; + + if (iter >= n) { + iter = iter - n; + iterdivn = iterdivn + 1; + if (iterdivn >= maxitdivn) + goto L200; + } + /* + * find diagonal block of matrix to work on + */ + if (tol < zero && abs(d(m)) <= thresh) + d(m) = zero; + + smax = abs(d(m)); + smin = smax; + // do 70 lll = 1, m - 1 + for (lll = 1; lll <= (m - 1); lll++) { + ll = m - lll; + abss = abs(d(ll)); + abse = abs(e(ll)); + if (tol < zero && abss <= thresh) + d(ll) = zero; + if (abse <= thresh) + goto L80; + smin = min(smin, abss); + smax = max(smax, max(abss, abse)); + } +L70: + ll = 0; + goto L90; +L80: + e(ll) = zero; + /* + * matrix splits since e(ll) = 0 + */ + if (ll == m - 1) { + /* + * convergence of bottom singular value, return to top of loop + */ + m = m - 1; + goto L60; + } +L90: + ll = ll + 1; + /* + * e(ll) through e(m-1) are nonzero, e(ll-1) is zero + */ + if (ll == m - 1) { + /* + * 2 by 2 block, handle separately + */ + call_lasv2(d(m - 1), e(m - 1), d(m), sigmn, sigmx, sinr, cosr, sinl, cosl); + d(m - 1) = sigmx; + e(m - 1) = zero; + d(m) = sigmn; + /* + * compute singular vectors, if desired + */ +#ifdef USE_VT + if (ncvt > 0) { + if (use_gpu) { + call_rot_gpu(ncvt, vt(m - 1, 1), ldvt, vt(m, 1), ldvt, cosr, sinr); + } else { + call_rot(ncvt, vt(m - 1, 1), ldvt, vt(m, 1), ldvt, cosr, sinr); + } + } +#else + if (nrv > 0) { + if (use_gpu) { + call_rot_gpu(nrv, v(1,m - 1), 1, v(1,m), 1, cosr, sinr); + } else { + call_rot(nrv, v(1,m - 1), 1, v(1,m), 1, cosr, sinr); + } + } +#endif + if (nru > 0) { + if (use_gpu) { + call_rot_gpu(nru, u(1, m - 1), ione, u(1, m), ione, cosl, sinl); + } else { + call_rot(nru, u(1, m - 1), ione, u(1, m), ione, cosl, sinl); + } + } + if (ncc > 0) { + if (use_gpu) { + call_rot_gpu(ncc, c(m - 1, 1), ldc, c(m, 1), ldc, cosl, sinl); + } else { + call_rot(ncc, c(m - 1, 1), ldc, c(m, 1), ldc, cosl, sinl); + } + } + m = m - 2; + goto L60; + } + /* + * if working on new submatrix, choose shift direction + * (from larger end diagonal element towards smaller) + */ + if (ll > oldm || m < oldll) { + if (abs(d(ll)) >= abs(d(m))) { + /* + * chase bulge from top (big end) to bottom (small end) + */ + idir = 1; + } else { + /* + * chase bulge from bottom (big end) to top (small end) + */ + idir = 2; + } + } + /* + * apply convergence tests + */ + if (idir == 1) { + /* + * run convergence test in forward direction + * first apply standard test to bottom of matrix + */ + if (abs(e(m - 1)) <= abs(tol) * abs(d(m)) || + (tol < zero && abs(e(m - 1)) <= thresh)) { + e(m - 1) = zero; + goto L60; + } + + if (tol >= zero) { + /* + * if relative accuracy desired, + * apply convergence criterion forward + */ + mu = abs(d(ll)); + sminl = mu; + // do 100 lll = ll, m - 1 + for (lll = ll; lll <= (m - 1); lll++) { + if (abs(e(lll)) <= tol * mu) { + e(lll) = zero; + goto L60; + } + mu = abs(d(lll + 1)) * (mu / (mu + abs(e(lll)))); + sminl = min(sminl, mu); + } + // L100: + } + + } else { + /* + * run convergence test in backward direction + * first apply standard test to top of matrix + */ + if (abs(e(ll)) <= abs(tol) * abs(d(ll)) || + (tol < zero && abs(e(ll)) <= thresh)) { + e(ll) = zero; + goto L60; + } + + if (tol >= zero) { + /* + * if relative accuracy desired, + * apply convergence criterion backward + */ + mu = 
abs(d(m)); + sminl = mu; + // do 110 lll = m - 1, ll, -1 + for (lll = (m - 1); lll >= ll; lll--) { + if (abs(e(lll)) <= tol * mu) { + e(lll) = zero; + goto L60; + } + mu = abs(d(lll)) * (mu / (mu + abs(e(lll)))); + sminl = min(sminl, mu); + } + // L110: + } + } + oldll = ll; + oldm = m; + /* + * compute shift. first, test if shifting would ruin relative + * accuracy, and if so set the shift to zero. + */ + if (tol >= zero && n * tol * (sminl / smax) <= max(eps, hndrth * tol)) { + /* + * use a zero shift to avoid loss of relative accuracy + */ + shift = zero; + } else { + /* + * compute the shift from 2-by-2 block at end of matrix + */ + if (idir == 1) { + sll = abs(d(ll)); + call_las2(d(m - 1), e(m - 1), d(m), shift, r); + } else { + sll = abs(d(m)); + call_las2(d(ll), e(ll), d(ll + 1), shift, r); + } + /* + * test if shift negligible, and if so set to zero + */ + if (sll > zero) { + if ((shift / sll) * (shift / sll) < eps) + shift = zero; + } + } + /* + * increment iteration count + */ + iter = iter + m - ll; + /* + * if shift = 0, do simplified qr iteration + */ + if (shift == zero) { + if (idir == 1) { + /* + * chase bulge from top to bottom + * save cosines and sines for later singular vector updates + */ + cs = one; + oldcs = one; + // do 120 i = ll, m - 1 + for (i = ll; i <= (m - 1); i++) { + auto di_cs = d(i) * cs; + call_lartg(di_cs, e(i), cs, sn, r); + if (i > ll) + e(i - 1) = oldsn * r; + auto oldcs_r = oldcs * r; + auto dip1_sn = d(i + 1) * sn; + call_lartg(oldcs_r, dip1_sn, oldcs, oldsn, d(i)); + work(i - ll + 1) = cs; + work(i - ll + 1 + nm1) = sn; + work(i - ll + 1 + nm12) = oldcs; + work(i - ll + 1 + nm13) = oldsn; + } + L120: + h = d(m) * cs; + d(m) = h * oldcs; + e(m - 1) = h * oldsn; + /* + * update singular vectors + */ +#ifdef USE_VT + if (ncvt > 0) { + // call_lasr( 'l', 'v', 'f', m-ll+1, ncvt, work( 1 ), work( n ), vt( + // ll, 1 ), ldvt ) + char side = 'L'; + char pivot = 'V'; + char direct = 'F'; + auto mm = m - ll + 1; + if (use_gpu) { + call_lasr_gpu(side, pivot, direct, mm, ncvt, work(1), work(n), + vt(ll, 1), ldvt, dwork, stream); + } else { + call_lasr(side, pivot, direct, mm, ncvt, work(1), work(n), vt(ll, 1), + ldvt); + } + } +#else + + if (nrv > 0) { + // call_lasr( 'r', 'v', 'f', nrv, m-ll+1, work( 1 ), work( n ), + // v(1,ll ), ldv) + char side = 'R'; + char pivot = 'V'; + char direct = 'F'; + auto mm = m - ll + 1; + if (use_gpu) { + call_lasr_gpu(side, pivot, direct, nrv, mm, work(1), work(n), + v(1,ll), ldv, dwork, stream); + } else { + call_lasr(side, pivot, direct, nrv, mm, work(1), work(n), v(1,ll), + ldv); + } + } +#endif + if (nru > 0) { + // call_lasr( 'r', 'v', 'f', nru, m-ll+1, work( nm12+1 ), work( nm13+1 + // ), u( 1, ll ), ldu ) + char side = 'R'; + char pivot = 'V'; + char direct = 'F'; + auto mm = m - ll + 1; + if (use_gpu) { + call_lasr_gpu(side, pivot, direct, nru, mm, work(nm12 + 1), + work(nm13 + 1), u(1, ll), ldu, dwork, stream); + } else { + call_lasr(side, pivot, direct, nru, mm, work(nm12 + 1), + work(nm13 + 1), u(1, ll), ldu); + } + } + if (ncc > 0) { + // call_lasr( 'l', 'v', 'f', m-ll+1, ncc, work( nm12+1 ), work( nm13+1 + // ), c( ll, 1 ), ldc ) + char side = 'L'; + char pivot = 'V'; + char direct = 'F'; + auto mm = m - ll + 1; + if (use_gpu) { + call_lasr_gpu(side, pivot, direct, mm, ncc, work(nm12 + 1), + work(nm13 + 1), c(ll, 1), ldc, dwork, stream); + } else { + call_lasr(side, pivot, direct, mm, ncc, work(nm12 + 1), + work(nm13 + 1), c(ll, 1), ldc); + } + } + /* + * test convergence + */ + if (abs(e(m - 1)) <= thresh) + e(m 
- 1) = zero; + + } else { + /* + * chase bulge from bottom to top + * save cosines and sines for later singular vector updates + */ + cs = one; + oldcs = one; + // do 130 i = m, ll + 1, -1 + for (i = m; i >= (ll + 1); i--) { + auto di_cs = d(i) * cs; + call_lartg(di_cs, e(i - 1), cs, sn, r); + + if (i < m) + e(i) = oldsn * r; + + auto oldcs_r = oldcs * r; + auto dim1_sn = d(i - 1) * sn; + call_lartg(oldcs_r, dim1_sn, oldcs, oldsn, d(i)); + + work(i - ll) = cs; + work(i - ll + nm1) = -sn; + work(i - ll + nm12) = oldcs; + work(i - ll + nm13) = -oldsn; + } + L130: + h = d(ll) * cs; + d(ll) = h * oldcs; + e(ll) = h * oldsn; + /* + * update singular vectors + */ +#ifdef USE_VT + if (ncvt > 0) { + // call_lasr( 'l', 'v', 'b', m-ll+1, ncvt, work( nm12+1 ), work( + // nm13+1 + // ), vt( ll, 1 ), ldvt ); + char side = 'L'; + char pivot = 'V'; + char direct = 'B'; + auto mm = m - ll + 1; + if (use_gpu) { + call_lasr_gpu(side, pivot, direct, mm, ncvt, work(nm12 + 1), + work(nm13 + 1), vt(ll, 1), ldvt, dwork, stream); + } else { + call_lasr(side, pivot, direct, mm, ncvt, work(nm12 + 1), + work(nm13 + 1), vt(ll, 1), ldvt); + } + } +#else + + if (nrv > 0) { + // call_lasr( 'r', 'v', 'b', nrv, m-ll+1, work( nm12+1 ), work( nm13+1 ), v( 1,ll ), ldv ); + char side = 'R'; + char pivot = 'V'; + char direct = 'B'; + auto mm = m - ll + 1; + if (use_gpu) { + call_lasr_gpu(side, pivot, direct, nrv, mm, work(nm12 + 1), + work(nm13 + 1), v(1,ll), ldv, dwork, stream); + } else { + call_lasr(side, pivot, direct, nrv, mm, work(nm12 + 1), + work(nm13 + 1), v(1,ll), ldv); + } + } + + +#endif + + if (nru > 0) { + // call_lasr( 'r', 'v', 'b', nru, m-ll+1, work( 1 ), work( n ), u( 1, + // ll + // ), ldu ) + char side = 'R'; + char pivot = 'V'; + char direct = 'B'; + auto mm = m - ll + 1; + if (use_gpu) { + call_lasr_gpu(side, pivot, direct, nru, mm, work(1), work(n), + u(1, ll), ldu, dwork, stream); + } else { + call_lasr(side, pivot, direct, nru, mm, work(1), work(n), u(1, ll), + ldu); + } + } + if (ncc > 0) { + // call_lasr( 'l', 'v', 'b', m-ll+1, ncc, work( 1 ), work( n ), c( ll, + // 1 + // ), ldc ) + char side = 'L'; + char pivot = 'V'; + char direct = 'B'; + auto mm = m - ll + 1; + if (use_gpu) { + call_lasr_gpu(side, pivot, direct, mm, ncc, work(1), work(n), + c(ll, 1), ldc, dwork, stream); + } else { + call_lasr(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), + ldc); + } + } + /* + * test convergence + */ + if (abs(e(ll)) <= thresh) + e(ll) = zero; + } + } else { + /* + * use nonzero shift + */ + if (idir == 1) { + /* + * chase bulge from top to bottom + * save cosines and sines for later singular vector updates + */ + f = (abs(d(ll)) - shift) * (sign(one, d(ll)) + shift / d(ll)); + g = e(ll); + // do 140 i = ll, m - 1 + for (i = ll; i <= (m - 1); i++) { + call_lartg(f, g, cosr, sinr, r); + if (i > ll) + e(i - 1) = r; + f = cosr * d(i) + sinr * e(i); + e(i) = cosr * e(i) - sinr * d(i); + g = sinr * d(i + 1); + d(i + 1) = cosr * d(i + 1); + call_lartg(f, g, cosl, sinl, r); + d(i) = r; + f = cosl * e(i) + sinl * d(i + 1); + d(i + 1) = cosl * d(i + 1) - sinl * e(i); + if (i < m - 1) { + g = sinl * e(i + 1); + e(i + 1) = cosl * e(i + 1); + } + work(i - ll + 1) = cosr; + work(i - ll + 1 + nm1) = sinr; + work(i - ll + 1 + nm12) = cosl; + work(i - ll + 1 + nm13) = sinl; + } + L140: + e(m - 1) = f; + /* + * update singular vectors + */ +#ifdef USE_VT + if (ncvt > 0) { + // call_lasr( 'l', 'v', 'f', m-ll+1, ncvt, work( 1 ), work( n ), vt( + // ll, 1 ), ldvt ) + char side = 'L'; + char pivot = 'V'; + char direct = 
'F'; + auto mm = m - ll + 1; + if (use_gpu) { + call_lasr_gpu(side, pivot, direct, mm, ncvt, work(1), work(n), + vt(ll, 1), ldvt, dwork, stream); + } else { + call_lasr(side, pivot, direct, mm, ncvt, work(1), work(n), vt(ll, 1), + ldvt); + } + } +#else + + if (nrv > 0) { + // call_lasr( 'r', 'v', 'f', nrv, m-ll+1, work( 1 ), work( n ), v( 1, ll ), ldv ) + char side = 'R'; + char pivot = 'V'; + char direct = 'F'; + auto mm = m - ll + 1; + if (use_gpu) { + call_lasr_gpu(side, pivot, direct, nrv, mm, work(1), work(n), + v(1,ll), ldv, dwork, stream); + } else { + call_lasr(side, pivot, direct, nrv, mm, work(1), work(n), v(1,ll), + ldv); + } + } +#endif + + if (nru > 0) { + // call_lasr( 'r', 'v', 'f', nru, m-ll+1, work( nm12+1 ), work( nm13+1 + // ), u( 1, ll ), ldu ) + char side = 'R'; + char pivot = 'V'; + char direct = 'F'; + auto mm = m - ll + 1; + if (use_gpu) { + call_lasr_gpu(side, pivot, direct, nru, mm, work(nm12 + 1), + work(nm13 + 1), u(1, ll), ldu, dwork, stream); + } else { + call_lasr(side, pivot, direct, nru, mm, work(nm12 + 1), + work(nm13 + 1), u(1, ll), ldu); + } + } + if (ncc > 0) { + // call_lasr( 'l', 'v', 'f', m-ll+1, ncc, work( nm12+1 ), work( nm13+1 + // ), c( ll, 1 ), ldc ) + char side = 'L'; + char pivot = 'V'; + char direct = 'F'; + auto mm = m - ll + 1; + if (use_gpu) { + call_lasr_gpu(side, pivot, direct, mm, ncc, work(nm12 + 1), + work(nm13 + 1), c(ll, 1), ldc, dwork, stream); + } else { + call_lasr(side, pivot, direct, mm, ncc, work(nm12 + 1), + work(nm13 + 1), c(ll, 1), ldc); + } + } + /* + * test convergence + */ + if (abs(e(m - 1)) <= thresh) + e(m - 1) = zero; + + } else { + /* + * chase bulge from bottom to top + * save cosines and sines for later singular vector updates + */ + f = (abs(d(m)) - shift) * (sign(one, d(m)) + shift / d(m)); + g = e(m - 1); + // do 150 i = m, ll + 1, -1 + for (i = m; i >= (ll + 1); i--) { + call_lartg(f, g, cosr, sinr, r); + if (i < m) + e(i) = r; + f = cosr * d(i) + sinr * e(i - 1); + e(i - 1) = cosr * e(i - 1) - sinr * d(i); + g = sinr * d(i - 1); + d(i - 1) = cosr * d(i - 1); + call_lartg(f, g, cosl, sinl, r); + d(i) = r; + f = cosl * e(i - 1) + sinl * d(i - 1); + d(i - 1) = cosl * d(i - 1) - sinl * e(i - 1); + if (i > ll + 1) { + g = sinl * e(i - 2); + e(i - 2) = cosl * e(i - 2); + } + work(i - ll) = cosr; + work(i - ll + nm1) = -sinr; + work(i - ll + nm12) = cosl; + work(i - ll + nm13) = -sinl; + } + L150: + e(ll) = f; + /* + * test convergence + */ + if (abs(e(ll)) <= thresh) + e(ll) = zero; + /* + * update singular vectors if desired + */ +#ifdef USE_VT + if (ncvt > 0) { + // call_lasr( 'l', 'v', 'b', m-ll+1, ncvt, work( nm12+1 ), work( + // nm13+1 + // ), vt( ll, 1 ), ldvt ) + char side = 'L'; + char pivot = 'V'; + char direct = 'B'; + auto mm = m - ll + 1; + if (use_gpu) { + call_lasr_gpu(side, pivot, direct, mm, ncvt, work(nm12 + 1), + work(nm13 + 1), vt(ll, 1), ldvt, dwork, stream); + } else { + call_lasr(side, pivot, direct, mm, ncvt, work(nm12 + 1), + work(nm13 + 1), vt(ll, 1), ldvt); + } + } +#else + + if (nrv > 0) { + // call_lasr( 'r', 'v', 'b', nrv, m-ll+1, work( nm12+1 ), work( nm13+1 ), v( 1,ll ), ldv ) + char side = 'L'; + char pivot = 'V'; + char direct = 'B'; + auto mm = m - ll + 1; + if (use_gpu) { + call_lasr_gpu(side, pivot, direct, nrv, mm, work(nm12 + 1), + work(nm13 + 1), v(1,ll), ldv, dwork, stream); + } else { + call_lasr(side, pivot, direct, nrv, mm, work(nm12 + 1), + work(nm13 + 1), v(1,ll), ldv); + } + } +#endif + if (nru > 0) { + // call_lasr( 'r', 'v', 'b', nru, m-ll+1, work( 1 ), work( n ), u( 
1, + // ll + // ), ldu ) + char side = 'R'; + char pivot = 'V'; + char direct = 'B'; + auto mm = m - ll + 1; + if (use_gpu) { + call_lasr_gpu(side, pivot, direct, nru, mm, work(1), work(n), + u(1, ll), ldu, dwork, stream); + } else { + call_lasr(side, pivot, direct, nru, mm, work(1), work(n), u(1, ll), + ldu); + } + } + if (ncc > 0) { + // call_lasr( 'l', 'v', 'b', m-ll+1, ncc, work( 1 ), work( n ), c( ll, + // 1 + // ), ldc ) + char side = 'L'; + char pivot = 'V'; + char direct = 'B'; + auto mm = m - ll + 1; + if (use_gpu) { + call_lasr_gpu(side, pivot, direct, mm, ncc, work(1), work(n), + c(ll, 1), ldc, dwork, stream); + } else { + call_lasr(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), + ldc); + } + } + } + } + /* + * qr iteration finished, go back and check convergence + */ + goto L60; + +/* + * all singular values converged, so make them positive + */ +L160: + // do 170 i = 1, n + for (i = 1; i <= n; i++) { + if (d(i) < zero) { + d(i) = -d(i); + /* + * change sign of singular vectors, if desired + */ +#ifdef USE_VT + if (ncvt > 0) { + if (use_gpu) { + call_scal_gpu(ncvt, negone, vt(i, 1), ldvt); + } else { + call_scal(ncvt, negone, vt(i, 1), ldvt); + } + } +#else + + if (nrv > 0) { + if (use_gpu) { + call_scal_gpu(nrv, negone, v(1,i), 1); + } else { + call_scal(nrv, negone, v(1,i), 1); + } + } +#endif + } + } +L170: + + bool const need_sort = false; + if (need_sort) { + /* + * sort the singular values into decreasing order (insertion sort on + * singular values, but only one transposition per singular vector) + */ + // do 190 i = 1, n - 1 + for (i = 1; i <= (n - 1); i++) { + /* + * scan for smallest d(i) + */ + isub = 1; + smin = d(1); + // do 180 j = 2, n + 1 - i + for (j = 2; j <= (n + 1 - i); j++) { + if (d(j) <= smin) { + isub = j; + smin = d(j); + } + } + L180: + if (isub != n + 1 - i) { + /* + * swap singular values and vectors + */ + d(isub) = d(n + 1 - i); + d(n + 1 - i) = smin; +#ifdef USE_VT + if (ncvt > 0) { + if (use_gpu) { + call_swap_gpu(ncvt, vt(isub, 1), ldvt, vt(n + 1 - i, 1), ldvt); + } else { + call_swap(ncvt, vt(isub, 1), ldvt, vt(n + 1 - i, 1), ldvt); + } + } +#else + + if (nrv > 0) { + if (use_gpu) { + call_swap_gpu(nrv, v(1,isub), 1, v(1,n + 1 - i), 1); + } else { + call_swap(nrv, v(1,isub), 1, v(1,n + 1 - i), 1); + } + } +#endif + if (nru > 0) { + if (use_gpu) { + call_swap_gpu(nru, u(1, isub), ione, u(1, n + 1 - i), ione); + } else { + call_swap(nru, u(1, isub), ione, u(1, n + 1 - i), ione); + } + } + if (ncc > 0) { + if (use_gpu) { + call_swap_gpu(ncc, c(isub, 1), ldc, c(n + 1 - i, 1), ldc); + } else { + call_swap(ncc, c(isub, 1), ldc, c(n + 1 - i, 1), ldc); + } + } + } + } + } // end if (need_sort) + +L190: + goto L220; +/* + * maximum number of iterations exceeded, failure to converge + */ +L200: + info = 0; + // do 210 i = 1, n - 1 + for (i = 1; i <= (n - 1); i++) { + if (e(i) != zero) + info = info + 1; + } +L210: +L220: + return; + /* + * end of dbdsqr + */ +} + + + + +template +rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, + const rocblas_fill uplo_in, + const rocblas_int n, + const rocblas_int nv, + const rocblas_int nu, + const rocblas_int nc, + S* D, + const rocblas_stride strideD, + S* E, + const rocblas_stride strideE, + W1 V, + const rocblas_int shiftV, + const rocblas_int ldv, + const rocblas_stride strideV, + W2 U, + const rocblas_int shiftU, + const rocblas_int ldu, + const rocblas_stride strideU, + W3 C, + const rocblas_int shiftC, + const rocblas_int ldc, + const rocblas_stride strideC, + rocblas_int* 
info_array, + const rocblas_int batch_count, + rocblas_int* splits_map, + S* work) +{ + + // ------------------------- + // copy D into hD, E into hE + // ------------------------- + +hipStream_t stream; +rocblas_get_stream(handle, &stream ); + + + +S * hD = nullptr; +S * hE = nullptr; +HIP_CHECK( hipHostMalloc( &hD, sizeof(S) * n * batch_count )); +HIP_CHECK( hipHostMalloc( &hE, sizeof(S) * (n-1) * batch_count )); + +bool const use_single_copy_for_D = (strideD == n); +if (use_single_copy_for_D) { + void * const dst = (void *) &(hD[0]); + void * const src = (void *) D; + size_t const sizeBytes = sizeof(S) * n * batch_count; + hipMemcpyKind const kind = hipMemcpyDeviceToHost; + + HIP_CHECK( hipMemcpyAsync( dst, src, sizeBytes, kind, stream ) ); +} +else { + for(rocblas_int bid = 0; bid < batch_count; bid++) { + void * const dst = (void *) &(hD[ bid * n ]); + void * const src = (void *) D+ bid * strideD; + size_t const sizeBytes = sizeof(S) * n; + hipMemcpyKind const kind = hipMemcpyDeviceToHost; + HIP_CHECK( hipMemcpyAsync( dst, src, sizeBytes, kind, stream ) ); + } +} + +bool const use_single_copy_for_E = (strideE == (n-1)); +if (use_single_copy_for_E) { + void * const dst = (void *) &(hE[0]); + void * const src = (void *) E; + size_t const sizeBytes = sizeof(S) * (n-1) * batch_count; + hipMemcpyKind const kind = hipMemcpyDeviceToHost; + + HIP_CHECK( hipMemcpyAsync( dst, src, sizeBytes, kind, stream ) ); +} +else { + for(rocblas_int bid = 0; bid < batch_count; bid++) { + void * const dst = (void *) &(hE[ bid * (n-1) ]); + void * const src = (void *) E+ bid * strideE; + size_t const sizeBytes = sizeof(S) * (n-1); + hipMemcpyKind const kind = hipMemcpyDeviceToHost; + HIP_CHECK( hipMemcpyAsync( dst, src, sizeBytes, kind, stream ) ); + } + +} + + + + + for(rocblas_int bid = 0; bid < batch_count; bid++) { + std::vector hwork( 4 * n ); + + char uplo = (uplo_in == rocblas_fill_lower) ? 'L' : 'U'; + T *d_ = &(hD[ bid * n ] ); + T *e_ = &(hE[ bid * (n-1) ]); + + T *v_ = (nv > 0) ? load_ptr_batch(V, bid, shiftV, strideV) : nullptr; + T *u_ = (nu > 0) ? load_ptr_batch(U, bid, shiftU, strideU) : nullptr; + T *c_ = (nc > 0) ? 
load_ptr_batch(C, bid, shiftC, strideC) : nullptr; + S * work_ = &(hwork[0]); + S * dwork = work; + I info = 0; + + I ncv = nv; + I nru = nu; + I ncc = nc; + +#ifdef USE_VT + T *vt_ = v_; + I ldvt = ldv; + + I nrv = n; + I ncvt = nrv; + bdsqr_single_template( uplo, n, ncvt, nru, ncc, d_, e_, + vt_,ldvt, + u_,ldu, c_,ldc, work_, info, dwork, stream ); +#else + + bdsqr_single_template( uplo, n, ncv, nru, ncc, d_, e_, + v_,ldv, + u_,ldu, c_,ldc, work_, info, dwork, stream ); + +#endif + + + + info_array[bid] = info; + } // end for bid + + +if (use_single_copy_for_D) { + void * const src = (void *) &(hD[0]); + void * const dst = (void *) D; + size_t const sizeBytes = sizeof(S) * n * batch_count; + hipMemcpyKind const kind = hipMemcpyHostToDevice; + + HIP_CHECK( hipMemcpyAsync( dst, src, sizeBytes, kind, stream ) ); +} +else { + for(rocblas_int bid = 0; bid < batch_count; bid++) { + void * const src = (void *) &(hD[ bid * n ]); + void * const dst = (void *) D+ bid * strideD; + size_t const sizeBytes = sizeof(S) * n; + hipMemcpyKind const kind = hipMemcpyHostToDevice; + HIP_CHECK( hipMemcpyAsync( dst, src, sizeBytes, kind, stream ) ); + } +} + + +if (use_single_copy_for_E) { + void * const src = (void *) &(hE[0]); + void * const dst = (void *) E; + size_t const sizeBytes = sizeof(S) * (n-1) * batch_count; + hipMemcpyKind const kind = hipMemcpyHostToDevice; + + HIP_CHECK( hipMemcpyAsync( dst, src, sizeBytes, kind, stream ) ); +} +else { + for(rocblas_int bid = 0; bid < batch_count; bid++) { + void * const src = (void *) &(hE[ bid * (n-1) ]); + void * const dst = (void *) E+ bid * strideE; + size_t const sizeBytes = sizeof(S) * (n-1); + hipMemcpyKind const kind = hipMemcpyHostToDevice; + HIP_CHECK( hipMemcpyAsync( dst, src, sizeBytes, kind, stream ) ); + } + +} + +HIP_CHECK( hipHostFree( hD ) ); hD = nullptr; +HIP_CHECK( hipHostFree( hE ) ); hE = nullptr; + +return( rocblas_status_success ); + +} + + +ROCSOLVER_END_NAMESPACE From 3e40d99eb2d10d6c97c3169bbff8c1542de06d04 Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Wed, 29 May 2024 12:29:03 -0400 Subject: [PATCH 02/35] snapshot compile ok --- library/src/auxiliary/bdsqr_host.hpp | 3622 ++++++++++-------- library/src/auxiliary/rocauxiliary_bdsqr.hpp | 38 +- 2 files changed, 2064 insertions(+), 1596 deletions(-) diff --git a/library/src/auxiliary/bdsqr_host.hpp b/library/src/auxiliary/bdsqr_host.hpp index d40be74ab..5d3b0f6c9 100644 --- a/library/src/auxiliary/bdsqr_host.hpp +++ b/library/src/auxiliary/bdsqr_host.hpp @@ -43,823 +43,1110 @@ #include "hip/hip_runtime.h" #include "hip/hip_runtime_api.h" - ROCSOLVER_BEGIN_NAMESPACE #ifndef HIP_CHECK -#define HIP_CHECK(fcn) \ - { \ - hipError_t const istat = (fcn); \ - assert(istat == hipSuccess); \ - } +#define HIP_CHECK(fcn) \ + { \ + hipError_t const istat = (fcn); \ + assert(istat == hipSuccess); \ + } #endif - template -__global__ static void lasr_kernel(char const side, char const pivot, - char const direct, I const m, I const n, - S const *const c_, S const *const s_, - T *const A_, I const lda) { - - const auto nblocks = hipGridDim_x; - const auto nthreads_per_block = hipBlockDim_x; - const auto nthreads = nblocks * nthreads_per_block; - const auto tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; - const auto i_inc = nthreads; - const auto ij_nb = nthreads; - const auto ij_start = tid; - - auto max = [](auto x, auto y) { return ((x > y) ? x : y); }; - auto min = [](auto x, auto y) { return ((x < y) ? 
x : y); }; - - auto indx2f = [](auto i, auto j, auto lda) -> int64_t { - assert((1 <= i)); - assert((1 <= lda)); - assert((1 <= j)); - - return ((i - 1) + (j - 1) * int64_t(lda)); - }; - - auto indx1f = [](auto i) -> int64_t { - assert((1 <= i)); - return (i - int64_t(1)); - }; - - auto c = [&](auto i) -> const S & { return (c_[indx1f(i)]); }; - auto s = [&](auto i) -> const S & { return (s_[indx1f(i)]); }; - auto A = [&](auto i, auto j) -> T & { return (A_[indx2f(i, j, lda)]); }; - - const S one = 1; - const S zero = 0; - - // ---------------- - // check arguments - // ---------------- - - const bool is_side_Left = (side == 'L') || (side == 'l'); - const bool is_side_Right = (side == 'R') || (side == 'r'); - - const bool is_pivot_Variable = (pivot == 'V') || (pivot == 'v'); - const bool is_pivot_Bottom = (pivot == 'B') || (pivot == 'b'); - const bool is_pivot_Top = (pivot == 'T') || (pivot == 't'); - - const bool is_direct_Forward = (direct == 'F') || (direct == 'f'); - const bool is_direct_Backward = (direct == 'B') || (direct == 'b'); - - { - const bool isok_side = is_side_Left || is_side_Right; - const bool isok_pivot = - is_pivot_Variable || is_pivot_Bottom || is_pivot_Top; - const bool isok_direct = is_direct_Forward || is_direct_Backward; - - const I info = (!isok_side) ? 1 - : (!isok_pivot) ? 2 - : (!isok_direct) ? 3 - : (m < 0) ? 4 - : (n < 0) ? 5 - : (c_ == nullptr) ? 6 - : (s_ == nullptr) ? 7 - : (A_ == nullptr) ? 8 - : (lda < max(1, m)) ? 9 - : 0; - if (info != 0) - return; - }; - - { - const bool has_work = (m >= 1) && (n >= 1); - if (!has_work) { - return; +__global__ static void lasr_kernel(char const side, + char const pivot, + char const direct, + I const m, + I const n, + S const* const c_, + S const* const s_, + T* const A_, + I const lda) +{ + const auto nblocks = hipGridDim_x; + const auto nthreads_per_block = hipBlockDim_x; + const auto nthreads = nblocks * nthreads_per_block; + const auto tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + const auto i_inc = nthreads; + const auto ij_nb = nthreads; + const auto ij_start = tid; + + auto max = [](auto x, auto y) { return ((x > y) ? x : y); }; + auto min = [](auto x, auto y) { return ((x < y) ? x : y); }; + + auto indx2f = [](auto i, auto j, auto lda) -> int64_t { + assert((1 <= i)); + assert((1 <= lda)); + assert((1 <= j)); + + return ((i - 1) + (j - 1) * int64_t(lda)); }; - }; - - if (is_side_Left && is_pivot_Variable && is_direct_Forward) { - // ----------------------------- - // A := P*A - // Variable pivot, the plane (k,k+1) - // P = P(z-1) * ... 
* P(2) * P(1) - // ----------------------------- - { - for (I j = 1; j <= (m - 1); j++) { - const auto ctemp = c(j); - const auto stemp = s(j); - if ((ctemp != one) || (stemp != zero)) { - for (I i = 1 + tid; i <= n; i += i_inc) { - const auto temp = A(j + 1, i); - A(j + 1, i) = ctemp * temp - stemp * A(j, i); - A(j, i) = stemp * temp + ctemp * A(j, i); - } - }; - }; + auto indx1f = [](auto i) -> int64_t { + assert((1 <= i)); + return (i - int64_t(1)); }; - return; - }; - - if (is_side_Left && is_pivot_Variable && is_direct_Backward) { - // ----------------------------- - // A := P*A - // Variable pivot, the plane (k,k+1) - // P = P(1)*P(2)*...*P(z-1) - // ----------------------------- - - auto const jend = (m - 1); - auto const jstart = 1; - auto const istart = 1; - auto const iend = n; - - for (I j = jend; j >= jstart; j--) { - const auto ctemp = c(j); - const auto stemp = s(j); - if ((ctemp != one) || (stemp != zero)) { - for (I i = istart + tid; i <= iend; i += i_inc) { - const auto temp = A(j + 1, i); - A(j + 1, i) = ctemp * temp - stemp * A(j, i); - A(j, i) = stemp * temp + ctemp * A(j, i); - }; - }; - }; + auto c = [&](auto i) -> const S& { return (c_[indx1f(i)]); }; + auto s = [&](auto i) -> const S& { return (s_[indx1f(i)]); }; + auto A = [&](auto i, auto j) -> T& { return (A_[indx2f(i, j, lda)]); }; - return; - }; - - if (is_side_Left && is_pivot_Top && is_direct_Forward) { - // ----------------------------- - // A := P*A - // Top pivot, the plane (1,k+1) - // P = P(z-1) * ... * P(2) * P(1) - // ----------------------------- - { - for (I j = 2; j <= m; j++) { - const auto ctemp = c(j - 1); - const auto stemp = s(j - 1); - for (I i = 1 + tid; i <= n; i += i_inc) { - const auto temp = A(j, i); - A(j, i) = ctemp * temp - stemp * A(1, i); - A(1, i) = stemp * temp + ctemp * A(1, i); - }; - }; - }; + const S one = 1; + const S zero = 0; - return; - }; - - if (is_side_Left && is_pivot_Top && is_direct_Backward) { - // ----------------------------- - // A := P*A - // Top pivot, the plane (1,k+1) - // P = P(1)*P(2)*...*P(z-1) - // ----------------------------- - { + // ---------------- + // check arguments + // ---------------- - auto const jend = m; - auto const jstart = 2; - auto const istart = 1; - auto const iend = n; + const bool is_side_Left = (side == 'L') || (side == 'l'); + const bool is_side_Right = (side == 'R') || (side == 'r'); - for (I j = jend; j >= jstart; j--) { - const auto ctemp = c(j - 1); - const auto stemp = s(j - 1); - if ((ctemp != one) || (stemp != zero)) { - for (I i = istart + tid; i <= iend; i += i_inc) { - const auto temp = A(j, i); + const bool is_pivot_Variable = (pivot == 'V') || (pivot == 'v'); + const bool is_pivot_Bottom = (pivot == 'B') || (pivot == 'b'); + const bool is_pivot_Top = (pivot == 'T') || (pivot == 't'); - A(j, i) = ctemp * temp - stemp * A(1, i); - A(1, i) = stemp * temp + ctemp * A(1, i); - }; - }; - }; - } + const bool is_direct_Forward = (direct == 'F') || (direct == 'f'); + const bool is_direct_Backward = (direct == 'B') || (direct == 'b'); - return; - }; - - if (is_side_Left && is_pivot_Bottom && is_direct_Forward) { - // ----------------------------- - // A := P*A - // Bottom pivot, the plane (k,z) - // P = P(z-1) * ... * P(2) * P(1) - // ----------------------------- { + const bool isok_side = is_side_Left || is_side_Right; + const bool isok_pivot = is_pivot_Variable || is_pivot_Bottom || is_pivot_Top; + const bool isok_direct = is_direct_Forward || is_direct_Backward; + + const I info = (!isok_side) ? 1 + : (!isok_pivot) ? 
2 + : (!isok_direct) ? 3 + : (m < 0) ? 4 + : (n < 0) ? 5 + : (c_ == nullptr) ? 6 + : (s_ == nullptr) ? 7 + : (A_ == nullptr) ? 8 + : (lda < max(1, m)) ? 9 + : 0; + if(info != 0) + return; + }; - auto const jstart = 1; - auto const jend = (m - 1); - auto const istart = 1; - auto const iend = n; - - for (I j = jstart; j <= jend; j++) { - const auto ctemp = c(j); - const auto stemp = s(j); - if ((ctemp != one) || (stemp != zero)) { - for (I i = istart + tid; i <= iend; i += i_inc) { - const auto temp = A(j, i); - A(j, i) = stemp * A(m, i) + ctemp * temp; - A(m, i) = ctemp * A(m, i) - stemp * temp; - }; + { + const bool has_work = (m >= 1) && (n >= 1); + if(!has_work) + { + return; }; - }; - } + }; - return; - }; - - if (is_side_Left && is_pivot_Bottom && is_direct_Backward) { - // ----------------------------- - // A := P*A - // Bottom pivot, the plane (k,z) - // P = P(1)*P(2)*...*P(z-1) - // ----------------------------- + if(is_side_Left && is_pivot_Variable && is_direct_Forward) { - - auto const jend = (m - 1); - auto const jstart = 1; - auto const istart = 1; - auto const iend = n; - - for (I j = jend; j >= jstart; j--) { - const auto ctemp = c(j); - const auto stemp = s(j); - if ((ctemp != one) || (stemp != zero)) { - - for (I i = istart + tid; i <= iend; i += i_inc) { - const auto temp = A(j, i); - A(j, i) = stemp * A(m, i) + ctemp * temp; - A(m, i) = ctemp * A(m, i) - stemp * temp; - }; + // ----------------------------- + // A := P*A + // Variable pivot, the plane (k,k+1) + // P = P(z-1) * ... * P(2) * P(1) + // ----------------------------- + { + for(I j = 1; j <= (m - 1); j++) + { + const auto ctemp = c(j); + const auto stemp = s(j); + if((ctemp != one) || (stemp != zero)) + { + for(I i = 1 + tid; i <= n; i += i_inc) + { + const auto temp = A(j + 1, i); + A(j + 1, i) = ctemp * temp - stemp * A(j, i); + A(j, i) = stemp * temp + ctemp * A(j, i); + } + }; + }; }; - }; - } - - return; - }; - if (is_side_Right && is_pivot_Variable && is_direct_Forward) { - // ----------------------------- - // A := A*P**T - // Variable pivot, the plane (k,k+1) - // P = P(z-1) * ... 
* P(2) * P(1) - // ----------------------------- + return; + }; + if(is_side_Left && is_pivot_Variable && is_direct_Backward) { - - auto const jstart = 1; - auto const jend = (n - 1); - auto const istart = 1; - auto const iend = m; - - for (I j = jstart; j <= jend; j++) { - const auto ctemp = c(j); - const auto stemp = s(j); - if ((ctemp != one) || (stemp != zero)) { - for (I i = istart + tid; i <= iend; i += i_inc) { - const auto temp = A(i, j + 1); - A(i, j + 1) = ctemp * temp - stemp * A(i, j); - A(i, j) = stemp * temp + ctemp * A(i, j); - }; + // ----------------------------- + // A := P*A + // Variable pivot, the plane (k,k+1) + // P = P(1)*P(2)*...*P(z-1) + // ----------------------------- + + auto const jend = (m - 1); + auto const jstart = 1; + auto const istart = 1; + auto const iend = n; + + for(I j = jend; j >= jstart; j--) + { + const auto ctemp = c(j); + const auto stemp = s(j); + if((ctemp != one) || (stemp != zero)) + { + for(I i = istart + tid; i <= iend; i += i_inc) + { + const auto temp = A(j + 1, i); + A(j + 1, i) = ctemp * temp - stemp * A(j, i); + A(j, i) = stemp * temp + ctemp * A(j, i); + }; + }; }; - }; - } - - return; - }; - if (is_side_Right && is_pivot_Variable && is_direct_Backward) { - // ----------------------------- - // A := A*P**T - // Variable pivot, the plane (k,k+1) - // P = P(1)*P(2)*...*P(z-1) - // ----------------------------- + return; + }; + if(is_side_Left && is_pivot_Top && is_direct_Forward) { - - auto const jend = (n - 1); - auto const jstart = 1; - auto const istart = 1; - auto const iend = m; - - for (I j = jend; j >= jstart; j--) { - const auto ctemp = c(j); - const auto stemp = s(j); - if ((ctemp != one) || (stemp != zero)) { - - for (I i = istart + tid; i <= iend; i += i_inc) { - const auto temp = A(i, j + 1); - A(i, j + 1) = ctemp * temp - stemp * A(i, j); - A(i, j) = stemp * temp + ctemp * A(i, j); - }; + // ----------------------------- + // A := P*A + // Top pivot, the plane (1,k+1) + // P = P(z-1) * ... * P(2) * P(1) + // ----------------------------- + { + for(I j = 2; j <= m; j++) + { + const auto ctemp = c(j - 1); + const auto stemp = s(j - 1); + for(I i = 1 + tid; i <= n; i += i_inc) + { + const auto temp = A(j, i); + A(j, i) = ctemp * temp - stemp * A(1, i); + A(1, i) = stemp * temp + ctemp * A(1, i); + }; + }; }; - }; - } - return; - }; - if (is_side_Right && is_pivot_Top && is_direct_Forward) { - // ----------------------------- - // A := A*P**T - // Top pivot, the plane (1,k+1) - // P = P(z-1) * ... 
* P(2) * P(1) - // ----------------------------- + return; + }; + if(is_side_Left && is_pivot_Top && is_direct_Backward) { + // ----------------------------- + // A := P*A + // Top pivot, the plane (1,k+1) + // P = P(1)*P(2)*...*P(z-1) + // ----------------------------- + { + auto const jend = m; + auto const jstart = 2; + auto const istart = 1; + auto const iend = n; + + for(I j = jend; j >= jstart; j--) + { + const auto ctemp = c(j - 1); + const auto stemp = s(j - 1); + if((ctemp != one) || (stemp != zero)) + { + for(I i = istart + tid; i <= iend; i += i_inc) + { + const auto temp = A(j, i); + + A(j, i) = ctemp * temp - stemp * A(1, i); + A(1, i) = stemp * temp + ctemp * A(1, i); + }; + }; + }; + } - auto const jstart = 2; - auto const jend = n; - auto const istart = 1; - auto const iend = m; - - for (I j = jstart; j <= jend; j++) { - const auto ctemp = c(j - 1); - const auto stemp = s(j - 1); - if ((ctemp != one) || (stemp != zero)) { - for (I i = istart + tid; i <= iend; i += i_inc) { - const auto temp = A(i, j); - - A(i, j) = ctemp * temp - stemp * A(i, 1); - A(i, 1) = stemp * temp + ctemp * A(i, 1); - }; - }; - }; - } + return; + }; - return; - }; + if(is_side_Left && is_pivot_Bottom && is_direct_Forward) + { + // ----------------------------- + // A := P*A + // Bottom pivot, the plane (k,z) + // P = P(z-1) * ... * P(2) * P(1) + // ----------------------------- + { + auto const jstart = 1; + auto const jend = (m - 1); + auto const istart = 1; + auto const iend = n; + + for(I j = jstart; j <= jend; j++) + { + const auto ctemp = c(j); + const auto stemp = s(j); + if((ctemp != one) || (stemp != zero)) + { + for(I i = istart + tid; i <= iend; i += i_inc) + { + const auto temp = A(j, i); + A(j, i) = stemp * A(m, i) + ctemp * temp; + A(m, i) = ctemp * A(m, i) - stemp * temp; + }; + }; + }; + } - if (is_side_Right && is_pivot_Top && is_direct_Backward) { - // ----------------------------- - // A := A*P**T - // Top pivot, the plane (1,k+1) - // P = P(1)*P(2)*...*P(z-1) - // ----------------------------- + return; + }; + if(is_side_Left && is_pivot_Bottom && is_direct_Backward) { + // ----------------------------- + // A := P*A + // Bottom pivot, the plane (k,z) + // P = P(1)*P(2)*...*P(z-1) + // ----------------------------- + { + auto const jend = (m - 1); + auto const jstart = 1; + auto const istart = 1; + auto const iend = n; + + for(I j = jend; j >= jstart; j--) + { + const auto ctemp = c(j); + const auto stemp = s(j); + if((ctemp != one) || (stemp != zero)) + { + for(I i = istart + tid; i <= iend; i += i_inc) + { + const auto temp = A(j, i); + A(j, i) = stemp * A(m, i) + ctemp * temp; + A(m, i) = ctemp * A(m, i) - stemp * temp; + }; + }; + }; + } - auto const jend = n; - auto const jstart = 2; - auto const istart = 1; - auto const iend = m; - - for (I j = jend; j >= jstart; j--) { - const auto ctemp = c(j - 1); - const auto stemp = s(j - 1); - if ((ctemp != one) || (stemp != zero)) { - for (I i = istart + tid; i <= iend; i += i_inc) { - const auto temp = A(i, j); + return; + }; - A(i, j) = ctemp * temp - stemp * A(i, 1); - A(i, 1) = stemp * temp + ctemp * A(i, 1); - }; - }; - }; - } + if(is_side_Right && is_pivot_Variable && is_direct_Forward) + { + // ----------------------------- + // A := A*P**T + // Variable pivot, the plane (k,k+1) + // P = P(z-1) * ... 
* P(2) * P(1) + // ----------------------------- + + { + auto const jstart = 1; + auto const jend = (n - 1); + auto const istart = 1; + auto const iend = m; + + for(I j = jstart; j <= jend; j++) + { + const auto ctemp = c(j); + const auto stemp = s(j); + if((ctemp != one) || (stemp != zero)) + { + for(I i = istart + tid; i <= iend; i += i_inc) + { + const auto temp = A(i, j + 1); + A(i, j + 1) = ctemp * temp - stemp * A(i, j); + A(i, j) = stemp * temp + ctemp * A(i, j); + }; + }; + }; + } - return; - }; + return; + }; - if (is_side_Right && is_pivot_Bottom && is_direct_Forward) { - // ----------------------------- - // A := A*P**T - // Bottom pivot, the plane (k,z) - // P = P(z-1) * ... * P(2) * P(1) - // ----------------------------- + if(is_side_Right && is_pivot_Variable && is_direct_Backward) + { + // ----------------------------- + // A := A*P**T + // Variable pivot, the plane (k,k+1) + // P = P(1)*P(2)*...*P(z-1) + // ----------------------------- + + { + auto const jend = (n - 1); + auto const jstart = 1; + auto const istart = 1; + auto const iend = m; + + for(I j = jend; j >= jstart; j--) + { + const auto ctemp = c(j); + const auto stemp = s(j); + if((ctemp != one) || (stemp != zero)) + { + for(I i = istart + tid; i <= iend; i += i_inc) + { + const auto temp = A(i, j + 1); + A(i, j + 1) = ctemp * temp - stemp * A(i, j); + A(i, j) = stemp * temp + ctemp * A(i, j); + }; + }; + }; + } + return; + }; + if(is_side_Right && is_pivot_Top && is_direct_Forward) { + // ----------------------------- + // A := A*P**T + // Top pivot, the plane (1,k+1) + // P = P(z-1) * ... * P(2) * P(1) + // ----------------------------- + + { + auto const jstart = 2; + auto const jend = n; + auto const istart = 1; + auto const iend = m; + + for(I j = jstart; j <= jend; j++) + { + const auto ctemp = c(j - 1); + const auto stemp = s(j - 1); + if((ctemp != one) || (stemp != zero)) + { + for(I i = istart + tid; i <= iend; i += i_inc) + { + const auto temp = A(i, j); + + A(i, j) = ctemp * temp - stemp * A(i, 1); + A(i, 1) = stemp * temp + ctemp * A(i, 1); + }; + }; + }; + } - auto const jstart = 1; - auto const jend = (n - 1); - auto const istart = 1; - auto const iend = m; + return; + }; - for (I j = jstart; j <= jend; j++) { - const auto ctemp = c(j); - const auto stemp = s(j); - if ((ctemp != one) || (stemp != zero)) { - for (I i = istart + tid; i <= iend; i += i_inc) { - const auto temp = A(i, j); + if(is_side_Right && is_pivot_Top && is_direct_Backward) + { + // ----------------------------- + // A := A*P**T + // Top pivot, the plane (1,k+1) + // P = P(1)*P(2)*...*P(z-1) + // ----------------------------- + + { + auto const jend = n; + auto const jstart = 2; + auto const istart = 1; + auto const iend = m; + + for(I j = jend; j >= jstart; j--) + { + const auto ctemp = c(j - 1); + const auto stemp = s(j - 1); + if((ctemp != one) || (stemp != zero)) + { + for(I i = istart + tid; i <= iend; i += i_inc) + { + const auto temp = A(i, j); + + A(i, j) = ctemp * temp - stemp * A(i, 1); + A(i, 1) = stemp * temp + ctemp * A(i, 1); + }; + }; + }; + } - A(i, j) = stemp * A(i, n) + ctemp * temp; - A(i, n) = ctemp * A(i, n) - stemp * temp; - }; - }; - }; - } + return; + }; - return; - }; + if(is_side_Right && is_pivot_Bottom && is_direct_Forward) + { + // ----------------------------- + // A := A*P**T + // Bottom pivot, the plane (k,z) + // P = P(z-1) * ... 
* P(2) * P(1) + // ----------------------------- + + { + auto const jstart = 1; + auto const jend = (n - 1); + auto const istart = 1; + auto const iend = m; + + for(I j = jstart; j <= jend; j++) + { + const auto ctemp = c(j); + const auto stemp = s(j); + if((ctemp != one) || (stemp != zero)) + { + for(I i = istart + tid; i <= iend; i += i_inc) + { + const auto temp = A(i, j); + + A(i, j) = stemp * A(i, n) + ctemp * temp; + A(i, n) = ctemp * A(i, n) - stemp * temp; + }; + }; + }; + } - if (is_side_Right && is_pivot_Bottom && is_direct_Backward) { - // ----------------------------- - // A := A*P**T - // Bottom pivot, the plane (k,z) - // P = P(1)*P(2)*...*P(z-1) - // ----------------------------- + return; + }; + if(is_side_Right && is_pivot_Bottom && is_direct_Backward) { + // ----------------------------- + // A := A*P**T + // Bottom pivot, the plane (k,z) + // P = P(1)*P(2)*...*P(z-1) + // ----------------------------- + + { + auto const jend = (n - 1); + auto const jstart = 1; + auto const istart = 1; + auto const iend = m; + + for(I j = jend; j >= jstart; j--) + { + const auto ctemp = c(j); + const auto stemp = s(j); + if((ctemp != one) || (stemp != zero)) + { + for(I i = istart + tid; i <= iend; i += i_inc) + { + const auto temp = A(i, j); + A(i, j) = stemp * A(i, n) + ctemp * temp; + A(i, n) = ctemp * A(i, n) - stemp * temp; + }; + }; + }; + } - auto const jend = (n - 1); - auto const jstart = 1; - auto const istart = 1; - auto const iend = m; - - for (I j = jend; j >= jstart; j--) { - const auto ctemp = c(j); - const auto stemp = s(j); - if ((ctemp != one) || (stemp != zero)) { - for (I i = istart + tid; i <= iend; i += i_inc) { - const auto temp = A(i, j); - A(i, j) = stemp * A(i, n) + ctemp * temp; - A(i, n) = ctemp * A(i, n) - stemp * temp; - }; - }; - }; - } + return; + }; return; - }; - - return; } template -static void lasr_template_gpu(char const side, char const pivot, - char const direct, I const m, I const n, - S const *const c_, S const *const s_, T *const A_, - I const lda, hipStream_t stream = 0) { - - auto const nthreads = 2 * warpSize; +static void lasr_template_gpu(char const side, + char const pivot, + char const direct, + I const m, + I const n, + S const* const c_, + S const* const s_, + T* const A_, + I const lda, + hipStream_t stream = 0) +{ + auto const nthreads = 2 * warpSize; - bool const is_left_side = (side == 'L') || (side == 'l'); - auto const mn = (is_left_side) ? n : m; + bool const is_left_side = (side == 'L') || (side == 'l'); + auto const mn = (is_left_side) ? 
n : m; - auto const nblocks = (mn - 1) / nthreads + 1; - hipLaunchKernelGGL((lasr_kernel), dim3(nblocks, 1, 1), - dim3(nthreads, 1, 1), 0, stream, side, pivot, direct, m, n, - c_, s_, A_, lda); + auto const nblocks = (mn - 1) / nthreads + 1; + hipLaunchKernelGGL((lasr_kernel), dim3(nblocks, 1, 1), dim3(nthreads, 1, 1), 0, stream, + side, pivot, direct, m, n, c_, s_, A_, lda); } template -__global__ static void rot_kernel(I const n, T *const x, I const incx, - T *const y, I const incy, S const c, - S const s) { - if (n <= 0) - return; +__global__ static void + rot_kernel(I const n, T* const x, I const incx, T* const y, I const incy, S const c, S const s) +{ + if(n <= 0) + return; - I const i_start = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; - I const i_inc = hipBlockDim_x * hipGridDim_x; - - if ((incx == 1) && (incy == 1)) { - // ------------ - // special case - // ------------ - for (I i = i_start; i < n; i += i_inc) { - auto const temp = c * x[i] + s * y[i]; - y[i] = c * y[i] - s * x[i]; - x[i] = temp; - } - } else { - // --------------------------- - // code for unequal increments - // --------------------------- + I const i_start = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + I const i_inc = hipBlockDim_x * hipGridDim_x; - for (auto i = i_start; i < n; i += i_inc) { - auto const ix = 0 + i * static_cast(incx); - auto const iy = 0 + i * static_cast(incy); - auto const temp = c * x[ix] + s * y[iy]; - y[iy] = c * y[iy] - s * x[ix]; - x[ix] = temp; + if((incx == 1) && (incy == 1)) + { + // ------------ + // special case + // ------------ + for(I i = i_start; i < n; i += i_inc) + { + auto const temp = c * x[i] + s * y[i]; + y[i] = c * y[i] - s * x[i]; + x[i] = temp; + } + } + else + { + // --------------------------- + // code for unequal increments + // --------------------------- + + for(auto i = i_start; i < n; i += i_inc) + { + auto const ix = 0 + i * static_cast(incx); + auto const iy = 0 + i * static_cast(incy); + auto const temp = c * x[ix] + s * y[iy]; + y[iy] = c * y[iy] - s * x[ix]; + x[ix] = temp; + } } - } } template -static void rot_template(I const n, T *x, I const incx, T *y, I const incy, - S const c, S const s, hipStream_t stream) { - auto nthreads = warpSize * 2; - auto nblocks = (n - 1) / nthreads + 1; - - hipLaunchKernelGGL((rot_kernel), dim3(nblocks, 1, 1), - dim3(nthreads, 1, 1), 0, stream, n, x, incx, y, incy, c, - s); +static void + rot_template(I const n, T* x, I const incx, T* y, I const incy, S const c, S const s, hipStream_t stream) +{ + auto nthreads = warpSize * 2; + auto nblocks = (n - 1) / nthreads + 1; + + hipLaunchKernelGGL((rot_kernel), dim3(nblocks, 1, 1), dim3(nthreads, 1, 1), 0, stream, + n, x, incx, y, incy, c, s); } template -__global__ static void scal_kernel(I const n, S const da, T *const x, - I const incx) { - - if (n <= 0) - return; +__global__ static void scal_kernel(I const n, S const da, T* const x, I const incx) +{ + if(n <= 0) + return; - I const i_start = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; - I const i_inc = hipBlockDim_x * hipGridDim_x; + I const i_start = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + I const i_inc = hipBlockDim_x * hipGridDim_x; - S const zero = 0; - bool const is_da_zero = (da == zero); - if (incx == 1) { - for (I i = i_start; i < n; i += i_inc) { - x[i] = (is_da_zero) ? zero : da * x[i]; + S const zero = 0; + bool const is_da_zero = (da == zero); + if(incx == 1) + { + for(I i = i_start; i < n; i += i_inc) + { + x[i] = (is_da_zero) ? 
zero : da * x[i]; + } } - } else { - // --------------------------- - // code for non-unit increments - // --------------------------- - - for (I i = i_start; i < n; i += i_inc) { - auto const ix = 0 + i * static_cast(incx); - x[ix] = (is_da_zero) ? zero : da * x[ix]; + else + { + // --------------------------- + // code for non-unit increments + // --------------------------- + + for(I i = i_start; i < n; i += i_inc) + { + auto const ix = 0 + i * static_cast(incx); + x[ix] = (is_da_zero) ? zero : da * x[ix]; + } } - } } template -static void scal_template(I const n, S const da, T *const x, I const incx, - hipStream_t stream) { - auto nthreads = warpSize * 2; - auto nblocks = (n - 1) / nthreads + 1; +static void scal_template(I const n, S const da, T* const x, I const incx, hipStream_t stream) +{ + auto nthreads = warpSize * 2; + auto nblocks = (n - 1) / nthreads + 1; - hipLaunchKernelGGL((scal_kernel), dim3(nblocks, 1, 1), - dim3(nthreads, 1, 1), 0, stream, n, da, x, incx); + hipLaunchKernelGGL((scal_kernel), dim3(nblocks, 1, 1), dim3(nthreads, 1, 1), 0, stream, + n, da, x, incx); } template -__global__ static void swap_kernel(I const n, T *const x, I const incx, - T *const y, I const incy) { - if (n <= 0) - return; - - I const i_start = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; - I const i_inc = hipBlockDim_x * hipGridDim_x; - - if ((incx == 1) && (incy == 1)) { - // ------------ - // special case - // ------------ - for (I i = i_start; i < n; i += i_inc) { - auto const temp = y[i]; - y[i] = x[i]; - x[i] = temp; - } - } else { - // --------------------------- - // code for unequal increments - // --------------------------- +__global__ static void swap_kernel(I const n, T* const x, I const incx, T* const y, I const incy) +{ + if(n <= 0) + return; - for (I i = i_start; i < n; i += i_inc) { - auto const ix = 0 + i * static_cast(incx); - auto const iy = 0 + i * static_cast(incy); + I const i_start = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + I const i_inc = hipBlockDim_x * hipGridDim_x; - auto const temp = y[iy]; - y[iy] = x[ix]; - x[ix] = temp; + if((incx == 1) && (incy == 1)) + { + // ------------ + // special case + // ------------ + for(I i = i_start; i < n; i += i_inc) + { + auto const temp = y[i]; + y[i] = x[i]; + x[i] = temp; + } + } + else + { + // --------------------------- + // code for unequal increments + // --------------------------- + + for(I i = i_start; i < n; i += i_inc) + { + auto const ix = 0 + i * static_cast(incx); + auto const iy = 0 + i * static_cast(incy); + + auto const temp = y[iy]; + y[iy] = x[ix]; + x[ix] = temp; + } } - } } template -static void swap_template(I const n, T *x, I const incx, T *y, I const incy, - hipStream_t stream) { - auto nthreads = warpSize * 2; - auto nblocks = (n - 1) / nthreads + 1; +static void swap_template(I const n, T* x, I const incx, T* y, I const incy, hipStream_t stream) +{ + auto nthreads = warpSize * 2; + auto nblocks = (n - 1) / nthreads + 1; - hipLaunchKernelGGL((swap_kernel), dim3(nblocks, 1, 1), - dim3(nthreads, 1, 1), 0, stream, n, x, incx, y, incy); + hipLaunchKernelGGL((swap_kernel), dim3(nblocks, 1, 1), dim3(nthreads, 1, 1), 0, stream, + n, x, incx, y, incy); } extern "C" { -double dlamch_(char *cmach); -float slamch_(char *cmach); - -void zswap_(int *n, std::complex *zx, int *incx, - std::complex *zy, int *incy); - -void cswap_(int *n, std::complex *zx, int *incx, std::complex *zy, - int *incy); +double dlamch_(char* cmach); +float slamch_(char* cmach); + +void zswap_(int* n, std::complex* zx, int* incx, 
std::complex* zy, int* incy); + +void cswap_(int* n, std::complex* zx, int* incx, std::complex* zy, int* incy); + +void dswap_(int* n, double* zx, int* incx, double* zy, int* incy); + +void sswap_(int* n, float* zx, int* incx, float* zy, int* incy); + +void dlasq1_(int* n, double* D_, double* E_, double* rwork_, int* info_arg); +void slasq1_(int* n, float* D_, float* E_, float* rwork_, int* info_arg); + +void zlasr_(char* side, + char* pivot, + char* direct, + int* m, + int* n, + double* c, + double* s, + std::complex* A, + int* lda); +void clasr_(char* side, + char* pivot, + char* direct, + int* m, + int* n, + float* c, + float* s, + std::complex* A, + int* lda); +void slasr_(char* side, char* pivot, char* direct, int* m, int* n, float* c, float* s, float* A, int* lda); +void dlasr_(char* side, char* pivot, char* direct, int* m, int* n, double* c, double* s, double* A, int* lda); + +void dlasv2_(double* f, + double* g, + double* h, + double* ssmin, + double* ssmax, + double* snr, + double* csr, + double* snl, + double* csl); +void slasv2_(float* f, + float* g, + float* h, + float* ssmin, + float* ssmax, + float* snr, + float* csr, + float* snl, + float* csl); + +void zdrot_(int* n, + std::complex* zx, + int* incx, + std::complex* zy, + int* incy, + double* c, + double* s); + +void csrot_(int* n, + std::complex* zx, + int* incx, + std::complex* zy, + int* incy, + float* c, + float* s); + +void drot_(int* n, double* dx, int* incx, double* dy, int* incy, double* c, double* s); + +void srot_(int* n, float* dx, int* incx, float* dy, int* incy, float* c, float* s); + +void zdscal_(int* n, double* da, std::complex* zx, int* incx); +void csscal_(int* n, float* da, std::complex* zx, int* incx); +void zscal_(int* n, std::complex* za, std::complex* zx, int* incx); +void cscal_(int* n, std::complex* za, std::complex* zx, int* incx); +void dscal_(int* n, double* da, double* zx, int* incx); +void sscal_(int* n, float* da, float* zx, int* incx); + +void dlartg_(double* f, double* g, double* c, double* s, double* r); +void slartg_(float* f, float* g, float* c, float* s, float* r); +void zlartg_(std::complex* f, + std::complex* g, + double* c, + std::complex* s, + std::complex* r); +void clartg_(std::complex* f, + std::complex* g, + float* c, + std::complex* s, + std::complex* r); + +void dlas2_(double* f, double* g, double* h, double* ssmin, double* ssmax); +void slas2_(float* f, float* g, float* h, float* ssmin, float* ssmax); +}; -void dswap_(int *n, double *zx, int *incx, double *zy, int *incy); +static void call_lamch(char& cmach_arg, double& eps) +{ + char cmach = cmach_arg; + eps = dlamch_(&cmach); +} -void sswap_(int *n, float *zx, int *incx, float *zy, int *incy); +static void call_lamch(char& cmach_arg, float& eps) +{ + char cmach = cmach_arg; + eps = slamch_(&cmach); +} -void dlasq1_(int *n, double *D_, double *E_, double *rwork_, int *info_arg); -void slasq1_(int *n, float *D_, float *E_, float *rwork_, int *info_arg); +static void call_swap(int& n, + rocblas_complex_num& zx, + int& incx, + rocblas_complex_num& zy, + int& incy) +{ + cswap_(&n, (std::complex*)&zx, &incx, (std::complex*)&zy, &incy); +} -void zlasr_(char *side, char *pivot, char *direct, int *m, int *n, double *c, - double *s, std::complex *A, int *lda); -void clasr_(char *side, char *pivot, char *direct, int *m, int *n, float *c, - float *s, std::complex *A, int *lda); -void slasr_(char *side, char *pivot, char *direct, int *m, int *n, float *c, - float *s, float *A, int *lda); -void dlasr_(char *side, char *pivot, char *direct, 
int *m, int *n, double *c, - double *s, double *A, int *lda); +static void call_swap(int& n, std::complex& zx, int& incx, std::complex& zy, int& incy) +{ + cswap_(&n, &zx, &incx, &zy, &incy); +} -void dlasv2_(double *f, double *g, double *h, double *ssmin, double *ssmax, - double *snr, double *csr, double *snl, double *csl); -void slasv2_(float *f, float *g, float *h, float *ssmin, float *ssmax, - float *snr, float *csr, float *snl, float *csl); +static void call_swap(int& n, + rocblas_complex_num& zx, + int& incx, + rocblas_complex_num& zy, + int& incy) +{ + zswap_(&n, (std::complex*)&zx, &incx, (std::complex*)&zy, &incy); +} -void zdrot_(int *n, std::complex *zx, int *incx, - std::complex *zy, int *incy, double *c, double *s); +static void call_swap(int& n, std::complex& zx, int& incx, std::complex& zy, int& incy) +{ + zswap_(&n, &zx, &incx, &zy, &incy); +} -void csrot_(int *n, std::complex *zx, int *incx, std::complex *zy, - int *incy, float *c, float *s); +static void call_swap(int& n, float& zx, int& incx, float& zy, int& incy) +{ + sswap_(&n, &zx, &incx, &zy, &incy); +} -void drot_(int *n, double *dx, int *incx, double *dy, int *incy, double *c, - double *s); +static void call_swap(int& n, double& zx, int& incx, double& zy, int& incy) +{ + dswap_(&n, &zx, &incx, &zy, &incy); +} -void srot_(int *n, float *dx, int *incx, float *dy, int *incy, float *c, - float *s); +static void call_las2(double& f, double& g, double& h, double& ssmin, double& ssmax) +{ + dlas2_(&f, &g, &h, &ssmin, &ssmax); +} -void zdscal_(int *n, double *da, std::complex *zx, int *incx); -void csscal_(int *n, float *da, std::complex *zx, int *incx); -void dscal_(int *n, double *da, double *zx, int *incx); -void sscal_(int *n, float *da, float *zx, int *incx); +static void call_las2(float& f, float& g, float& h, float& ssmin, float& ssmax) +{ + slas2_(&f, &g, &h, &ssmin, &ssmax); +} -void dlartg_(double *f, double *g, double *c, double *s, double *r); -void slartg_(float *f, float *g, float *c, float *s, float *r); +static void call_lartg(double& f, double& g, double& c, double& s, double& r) +{ + dlartg_(&f, &g, &c, &s, &r); +} -void dlas2_(double *f, double *g, double *h, double *ssmin, double *ssmax); -void slas2_(float *f, float *g, float *h, float *ssmin, float *ssmax); -}; +static void call_lartg(float& f, float& g, float& c, float& s, float& r) +{ + slartg_(&f, &g, &c, &s, &r); +} -static void call_lamch(char &cmach_arg, double &eps) { - char cmach = cmach_arg; - eps = dlamch_(&cmach); +static void call_lartg(std::complex& f, + std::complex& g, + float& c, + std::complex& s, + std::complex& r) +{ + clartg_(&f, &g, &c, &s, &r); } -static void call_lamch(char &cmach_arg, float &eps) { - char cmach = cmach_arg; - eps = slamch_(&cmach); +static void call_lartg(std::complex& f, + std::complex& g, + double& c, + std::complex& s, + std::complex& r) +{ + zlartg_(&f, &g, &c, &s, &r); } -static void call_swap(int &n, std::complex &zx, int &incx, - std::complex &zy, int &incy) { - zswap_(&n, &zx, &incx, &zy, &incy); +static void call_scal(int& n, rocblas_complex_num& da, rocblas_complex_num& zx, int& incx) +{ + cscal_(&n, (std::complex*)&da, (std::complex*)&zx, &incx); } -static void call_swap(int &n, std::complex &zx, int &incx, - std::complex &zy, int &incy) { - cswap_(&n, &zx, &incx, &zy, &incy); +static void call_scal(int& n, std::complex& da, std::complex& zx, int& incx) +{ + cscal_(&n, &da, &zx, &incx); } -static void call_swap(int &n, float &zx, int &incx, float &zy, int &incy) { - sswap_(&n, &zx, &incx, &zy, 
&incy); +static void + call_scal(int& n, rocblas_complex_num& da, rocblas_complex_num& zx, int& incx) +{ + zscal_(&n, (std::complex*)&da, (std::complex*)&zx, &incx); } -static void call_swap(int &n, double &zx, int &incx, double &zy, int &incy) { - dswap_(&n, &zx, &incx, &zy, &incy); +static void call_scal(int& n, std::complex& da, std::complex& zx, int& incx) +{ + zscal_(&n, &da, &zx, &incx); } -static void call_las2(double &f, double &g, double &h, double &ssmin, - double &ssmax) { - dlas2_(&f, &g, &h, &ssmin, &ssmax); +static void call_scal(int& n, double& da, rocblas_complex_num& zx, int& incx) +{ + zdscal_(&n, &da, (std::complex*)&zx, &incx); } -static void call_las2(float &f, float &g, float &h, float &ssmin, - float &ssmax) { - slas2_(&f, &g, &h, &ssmin, &ssmax); +static void call_scal(int& n, double& da, std::complex& zx, int& incx) +{ + zdscal_(&n, &da, &zx, &incx); } -static void call_lartg(double &f, double &g, double &c, double &s, double &r) { - dlartg_(&f, &g, &c, &s, &r); +static void call_scal(int& n, float& da, rocblas_complex_num& zx, int& incx) +{ + csscal_(&n, &da, (std::complex*)&zx, &incx); } -static void call_lartg(float &f, float &g, float &c, float &s, float &r) { - slartg_(&f, &g, &c, &s, &r); +static void call_scal(int& n, float& da, std::complex& zx, int& incx) +{ + csscal_(&n, &da, &zx, &incx); } -static void call_scal(int &n, double &da, std::complex &zx, int &incx) { - zdscal_(&n, &da, &zx, &incx); +static void call_scal(int& n, double& da, double& zx, int& incx) +{ + dscal_(&n, &da, &zx, &incx); } -static void call_scal(int &n, float &da, std::complex &zx, int &incx) { - csscal_(&n, &da, &zx, &incx); +static void call_scal(int& n, float& da, float& zx, int& incx) +{ + sscal_(&n, &da, &zx, &incx); } -static void call_scal(int &n, double &da, double &zx, int &incx) { - dscal_(&n, &da, &zx, &incx); +static void call_rot(int& n, + std::complex& zx, + int& incx, + std::complex& zy, + int& incy, + float& c, + float& s) +{ + csrot_(&n, &zx, &incx, &zy, &incy, &c, &s); } -static void call_scal(int &n, float &da, float &zx, int &incx) { - sscal_(&n, &da, &zx, &incx); +static void call_rot(int& n, + rocblas_complex_num& zx, + int& incx, + rocblas_complex_num& zy, + int& incy, + float& c, + float& s) +{ + csrot_(&n, (std::complex*)&zx, &incx, (std::complex*)&zy, &incy, &c, &s); } -static void call_rot(int &n, std::complex &zx, int &incx, - std::complex &zy, int &incy, double &c, - double &s) { - zdrot_(&n, &zx, &incx, &zy, &incy, &c, &s); +static void call_rot(int& n, + std::complex& zx, + int& incx, + std::complex& zy, + int& incy, + double& c, + double& s) +{ + zdrot_(&n, &zx, &incx, &zy, &incy, &c, &s); } -static void call_rot(int &n, std::complex &zx, int &incx, - std::complex &zy, int &incy, float &c, float &s) { - csrot_(&n, &zx, &incx, &zy, &incy, &c, &s); +static void call_rot(int& n, + rocblas_complex_num& zx, + int& incx, + rocblas_complex_num& zy, + int& incy, + double& c, + double& s) +{ + zdrot_(&n, (std::complex*)&zx, &incx, (std::complex*)&zy, &incy, &c, &s); } -static void call_rot(int &n, double &dx, int &incx, double &dy, int &incy, - double &c, double &s) { - drot_(&n, &dx, &incx, &dy, &incy, &c, &s); +static void call_rot(int& n, double& dx, int& incx, double& dy, int& incy, double& c, double& s) +{ + drot_(&n, &dx, &incx, &dy, &incy, &c, &s); } -static void call_rot(int &n, float &dx, int &incx, float &dy, int &incy, - float &c, float &s) { - srot_(&n, &dx, &incx, &dy, &incy, &c, &s); +static void call_rot(int& n, float& dx, int& incx, float& dy, 
int& incy, float& c, float& s) +{ + srot_(&n, &dx, &incx, &dy, &incy, &c, &s); } -static void call_lasv2(double &f, double &g, double &h, double &ssmin, - double &ssmax, double &snr, double &csr, double &snl, - double &csl) { - dlasv2_(&f, &g, &h, &ssmin, &ssmax, &snr, &csr, &snl, &csl); +static void call_lasv2(double& f, + double& g, + double& h, + double& ssmin, + double& ssmax, + double& snr, + double& csr, + double& snl, + double& csl) +{ + dlasv2_(&f, &g, &h, &ssmin, &ssmax, &snr, &csr, &snl, &csl); } -static void call_lasv2(float &f, float &g, float &h, float &ssmin, float &ssmax, - float &snr, float &csr, float &snl, float &csl) { - slasv2_(&f, &g, &h, &ssmin, &ssmax, &snr, &csr, &snl, &csl); +static void call_lasv2(float& f, + float& g, + float& h, + float& ssmin, + float& ssmax, + float& snr, + float& csr, + float& snl, + float& csl) +{ + slasv2_(&f, &g, &h, &ssmin, &ssmax, &snr, &csr, &snl, &csl); } -static void call_lasq1(int &n, double &D_, double &E_, double &rwork_, - int &info_arg) { - dlasq1_(&n, &D_, &E_, &rwork_, &info_arg); +static void call_lasq1(int& n, double& D_, double& E_, double& rwork_, int& info_arg) +{ + dlasq1_(&n, &D_, &E_, &rwork_, &info_arg); +}; + +static void call_lasq1(int& n, float& D_, float& E_, float& rwork_, int& info_arg) +{ + slasq1_(&n, &D_, &E_, &rwork_, &info_arg); +}; + +static void call_lasr(char& side, + char& pivot, + char& direct, + int& m, + int& n, + float& c, + float& s, + rocblas_complex_num& A, + int& lda) +{ + clasr_(&side, &pivot, &direct, &m, &n, &c, &s, (std::complex*)&A, &lda); }; -static void call_lasq1(int &n, float &D_, float &E_, float &rwork_, - int &info_arg) { - slasq1_(&n, &D_, &E_, &rwork_, &info_arg); +static void call_lasr(char& side, + char& pivot, + char& direct, + int& m, + int& n, + double& c, + double& s, + rocblas_complex_num& A, + int& lda) +{ + zlasr_(&side, &pivot, &direct, &m, &n, &c, &s, (std::complex*)&A, &lda); }; -static void call_lasr(char &side, char &pivot, char &direct, int &m, int &n, - double &c, double &s, std::complex &A, int &lda) { - zlasr_(&side, &pivot, &direct, &m, &n, &c, &s, &A, &lda); +static void call_lasr(char& side, + char& pivot, + char& direct, + int& m, + int& n, + double& c, + double& s, + std::complex& A, + int& lda) +{ + zlasr_(&side, &pivot, &direct, &m, &n, &c, &s, &A, &lda); }; -static void call_lasr(char &side, char &pivot, char &direct, int &m, int &n, - float &c, float &s, std::complex &A, int &lda) { - clasr_(&side, &pivot, &direct, &m, &n, &c, &s, &A, &lda); +static void call_lasr(char& side, + char& pivot, + char& direct, + int& m, + int& n, + float& c, + float& s, + std::complex& A, + int& lda) +{ + clasr_(&side, &pivot, &direct, &m, &n, &c, &s, &A, &lda); }; -static void call_lasr(char &side, char &pivot, char &direct, int &m, int &n, - float &c, float &s, float &A, int &lda) { - slasr_(&side, &pivot, &direct, &m, &n, &c, &s, &A, &lda); +static void + call_lasr(char& side, char& pivot, char& direct, int& m, int& n, float& c, float& s, float& A, int& lda) +{ + slasr_(&side, &pivot, &direct, &m, &n, &c, &s, &A, &lda); }; -static void call_lasr(char &side, char &pivot, char &direct, int &m, int &n, - double &c, double &s, double &A, int &lda) { - dlasr_(&side, &pivot, &direct, &m, &n, &c, &s, &A, &lda); +static void call_lasr(char& side, + char& pivot, + char& direct, + int& m, + int& n, + double& c, + double& s, + double& A, + int& lda) +{ + dlasr_(&side, &pivot, &direct, &m, &n, &c, &s, &A, &lda); }; template -static void bdsqr_single_template(char uplo, I n, +static 
void bdsqr_single_template(char uplo, + I n, #ifdef USE_VT - I ncvt, + I ncvt, #else - I ncv, + I ncv, #endif - I nru, I ncc, T *d_, - T *e_, + I nru, + I ncc, + S* d_, + S* e_, #ifdef USE_VT - T *vt_, I ldvt, + T* vt_, + I ldvt, #else - T *v_, I ldv, + T* v_, + I ldv, #endif - T *u_, I ldu, T *c_, - int ldc, S *work_, I &info, - S *dwork = nullptr, hipStream_t stream = 0) { - - bool const use_gpu = (dwork != nullptr); - - S const zero = 0; - S const one = 1; - S negone = -1; - S const hndrd = 100; - S const hndrth = one / hndrd; - S const ten = 10; - S const eight = 8; - S const meight = -one / eight; - I const maxitr = 6; - I ione = 1; - - I nrv = n; - - bool const lower = (uplo == 'L') || (uplo == 'l'); - bool const upper = (uplo == 'U') || (uplo == 'u'); - /* + T* u_, + I ldu, + T* c_, + int ldc, + S* work_, + I& info, + S* dwork = nullptr, + hipStream_t stream = 0) +{ + bool const use_gpu = (dwork != nullptr); + bool constexpr need_sort = false; + + S const zero = 0; + S const one = 1; + S negone = -1; + S const hndrd = 100; + S const hndrth = one / hndrd; + S const ten = 10; + S const eight = 8; + S const meight = -one / eight; + I const maxitr = 6; + I ione = 1; + + I nrv = n; + + bool const lower = (uplo == 'L') || (uplo == 'l'); + bool const upper = (uplo == 'U') || (uplo == 'u'); + /* * rotate is true if any singular vectors desired, false otherwise */ #ifdef USE_VT - bool const rotate = (ncvt > 0) || (nru > 0) || (ncc > 0); + bool const rotate = (ncvt > 0) || (nru > 0) || (ncc > 0); #else - bool const rotate = (nrv > 0) || (nru > 0) || (ncc > 0); + bool const rotate = (nrv > 0) || (nru > 0) || (ncc > 0); #endif - I i = 0, idir = 0, isub = 0, iter = 0, iterdivn = 0, j = 0, ll = 0, lll = 0, - m = 0, maxitdivn = 0, nm1 = 0, nm12 = 0, nm13 = 0, oldll = 0, oldm = 0; + I i = 0, idir = 0, isub = 0, iter = 0, iterdivn = 0, j = 0, ll = 0, lll = 0, m = 0, + maxitdivn = 0, nm1 = 0, nm12 = 0, nm13 = 0, oldll = 0, oldm = 0; - I const nrc = n; // number of rows in C matrix - I const nrvt = n; // number of rows in VT matrix - I const ncu = n; // number of columns in U matrix + I const nrc = n; // number of rows in C matrix + I const nrvt = n; // number of rows in VT matrix + I const ncu = n; // number of columns in U matrix - S abse = 0, abss = 0, cosl = 0, cosr = 0, cs = 0, eps = 0, f = 0, g = 0, - h = 0, mu = 0, oldcs = 0, oldsn = 0, r = 0, shift = 0, sigmn = 0, sigmx = 0, - sinl = 0, sinr = 0, sll = 0, smax = 0, smin = 0, sminl = 0, sminoa = 0, - sn = 0, thresh = 0, tol = 0, tolmul = 0, unfl = 0; + S abse = 0, abss = 0, cosl = 0, cosr = 0, cs = 0, eps = 0, f = 0, g = 0, h = 0, mu = 0, + oldcs = 0, oldsn = 0, r = 0, shift = 0, sigmn = 0, sigmx = 0, sinl = 0, sinr = 0, sll = 0, + smax = 0, smin = 0, sminl = 0, sminoa = 0, sn = 0, thresh = 0, tol = 0, tolmul = 0, unfl = 0; - /* .. + /* .. * .. external functions .. 
logical lsame double precision dlamch @@ -873,210 +1160,217 @@ static void bdsqr_single_template(char uplo, I n, intrinsic abs, dble, max, min, sign, sqrt */ - auto call_swap_gpu = [=](I n, T &x, I incx, T &y, I incy) { - swap_template(n, &x, incx, &y, incy, stream); - }; + auto call_swap_gpu = [=](I n, T& x, I incx, T& y, I incy) { + swap_template(n, &x, incx, &y, incy, stream); + }; + + auto call_rot_gpu = [=](I n, T& x, I incx, T& y, I incy, S cosl, S sinl) { + rot_template(n, &x, incx, &y, incy, cosl, sinl, stream); + }; - auto call_rot_gpu = [=](I n, T &x, I incx, T &y, I incy, S cosl, S sinl) { - rot_template(n, &x, incx, &y, incy, cosl, sinl, stream); - }; + auto call_scal_gpu + = [=](I n, auto da, T& x, I incx) { scal_template(n, da, &x, incx, stream); }; + + auto call_lasr_gpu + = [=](char const side, char const pivot, char const direct, I const m, I const n, S& c, + S& s, T& A, I const lda, S* const dwork, hipStream_t stream) { + bool const is_left_side = (side == 'L') || (side == 'l'); + auto const mn = (is_left_side) ? m : n; + auto const mn_m1 = (mn - 1); + S* const dc = dwork; + S* const ds = dwork + mn_m1; + HIP_CHECK(hipMemcpyAsync(dc, &c, sizeof(S) * mn_m1, hipMemcpyHostToDevice, stream)); + HIP_CHECK(hipMemcpyAsync(ds, &s, sizeof(S) * mn_m1, hipMemcpyHostToDevice, stream)); + + lasr_template_gpu(side, pivot, direct, m, n, dc, ds, &A, lda, stream); + HIP_CHECK(hipStreamSynchronize(stream)); + }; - auto call_scal_gpu = [=](I n, auto da, T &x, I incx) { - scal_template(n, da, &x, incx, stream); - }; + auto abs = [](auto x) { return ((x >= 0) ? x : (-x)); }; - auto call_lasr_gpu = [=](char const side, char const pivot, char const direct, - I const m, I const n, S &c, S &s, T &A, I const lda, - S *const dwork, hipStream_t stream) { - bool const is_left_side = (side == 'L') || (side == 'l'); - auto const mn = (is_left_side) ? m : n; - auto const mn_m1 = (mn - 1); - S *const dc = dwork; - S *const ds = dwork + mn_m1; - HIP_CHECK(hipMemcpyAsync(dc, &c, sizeof(S) * mn_m1, hipMemcpyHostToDevice, - stream)); - HIP_CHECK(hipMemcpyAsync(ds, &s, sizeof(S) * mn_m1, hipMemcpyHostToDevice, - stream)); - - lasr_template_gpu(side, pivot, direct, m, n, dc, ds, &A, lda, stream); - HIP_CHECK(hipStreamSynchronize(stream)); - }; - - auto abs = [](auto x) { return ((x >= 0) ? 
x : (-x)); }; - - auto indx2f = [](auto i, auto j, auto ld) -> int64_t { - assert((1 <= i) && (i <= ld)); - assert((1 <= j)); - return ((i - 1) + (j - 1) * int64_t(ld)); - }; - - auto d = [=](auto i) -> T & { - assert((1 <= i) && (i <= n)); - return (d_[i - 1]); - }; - - auto e = [=](auto i) -> T & { - assert((1 <= i) && (i <= (n - 1))); - return (e_[i - 1]); - }; - auto work = [=](auto i) -> S & { return (work_[i - 1]); }; - - auto c = [=](auto i, auto j) -> T & { - assert((1 <= i) && (i <= nrc) && (nrc <= ldc)); - assert((1 <= j) && (j <= ncc)); - return (c_[indx2f(i, j, ldc)]); - }; - - auto u = [=](auto i, auto j) -> T & { - assert((1 <= i) && (i <= nru) && (nru <= ldu)); - assert((1 <= j) && (j <= ncu)); - return (u_[indx2f(i, j, ldu)]); - }; + auto indx2f = [](auto i, auto j, auto ld) -> int64_t { + assert((1 <= i) && (i <= ld)); + assert((1 <= j)); + return ((i - 1) + (j - 1) * int64_t(ld)); + }; + + auto d = [=](auto i) -> S& { + assert((1 <= i) && (i <= n)); + return (d_[i - 1]); + }; + + auto e = [=](auto i) -> S& { + assert((1 <= i) && (i <= (n - 1))); + return (e_[i - 1]); + }; + auto work = [=](auto i) -> S& { return (work_[i - 1]); }; + + auto c = [=](auto i, auto j) -> T& { + assert((1 <= i) && (i <= nrc) && (nrc <= ldc)); + assert((1 <= j) && (j <= ncc)); + return (c_[indx2f(i, j, ldc)]); + }; + + auto u = [=](auto i, auto j) -> T& { + assert((1 <= i) && (i <= nru) && (nru <= ldu)); + assert((1 <= j) && (j <= ncu)); + return (u_[indx2f(i, j, ldu)]); + }; #ifdef USE_VT - auto vt = [=](auto i, auto j) -> T & { - assert((1 <= i) && (i <= nrvt) && (nrvt <= ldvt)); - assert((1 <= j) && (j <= ncvt)); - return (vt_[indx2f(i, j, ldvt)]); - }; + auto vt = [=](auto i, auto j) -> T& { + assert((1 <= i) && (i <= nrvt) && (nrvt <= ldvt)); + assert((1 <= j) && (j <= ncvt)); + return (vt_[indx2f(i, j, ldvt)]); + }; #else - auto v = [=](auto i, auto j) -> T & { - assert((1 <= i) && (i <= nrv) && (nrv <= ldv)); - assert((1 <= j) && (j <= ncv)); - return (v_[indx2f(i, j, ldv)]); - }; + auto v = [=](auto i, auto j) -> T& { + assert((1 <= i) && (i <= nrv) && (nrv <= ldv)); + assert((1 <= j) && (j <= ncv)); + return (v_[indx2f(i, j, ldv)]); + }; #endif - // --------------------------- - // emulate Fortran intrinsics - // --------------------------- - auto sign = [](auto a, auto b) { - auto const abs_a = std::abs(a); - return ((b >= 0) ? abs_a : -abs_a); - }; + // --------------------------- + // emulate Fortran intrinsics + // --------------------------- + auto sign = [](auto a, auto b) { + auto const abs_a = std::abs(a); + return ((b >= 0) ? abs_a : -abs_a); + }; - auto dble = [](auto x) { return (static_cast(x)); }; + auto dble = [](auto x) { return (static_cast(x)); }; - auto max = [](auto a, auto b) { return ((a > b) ? a : b); }; + auto max = [](auto a, auto b) { return ((a > b) ? a : b); }; - auto min = [](auto a, auto b) { return ((a < b) ? a : b); }; + auto min = [](auto a, auto b) { return ((a < b) ? a : b); }; - auto sqrt = [](auto x) { return (std::sqrt(x)); }; + auto sqrt = [](auto x) { return (std::sqrt(x)); }; - /* .. + /* .. * .. executable statements .. * * test the input parameters. * */ - info = (!upper) && (!lower) ? -1 - : (n < 0) ? -2 + info = (!upper) && (!lower) ? -1 + : (n < 0) ? -2 #ifdef USE_VT - : (ncvt < 0) ? -3 + : (ncvt < 0) ? -3 #else - : (nrv < 0) ? -3 + : (nrv < 0) ? -3 #endif - : (nru < 0) ? -4 - : (ncc < 0) ? -5 - : (ldu < max(1, nru)) ? -11 - : 0; - - if (info != 0) - return; - - if (n == 0) - return; - if (n == 1) - goto L160; - /* + : (nru < 0) ? 
-4 + : (ncc < 0) ? -5 + : (ldu < max(1, nru)) ? -11 + : 0; + + if(info != 0) + return; + + if(n == 0) + return; + if(n == 1) + goto L160; + /* * if no singular vectors desired, use qd algorithm */ - if (!rotate) { - call_lasq1(n, d(1), e(1), work(1), info); - /* + if(!rotate) + { + call_lasq1(n, d(1), e(1), work(1), info); + /* * if info equals 2, dqds didn't finish, try to finish */ - if (info != 2) - return; - info = 0; - } + if(info != 2) + return; + info = 0; + } - nm1 = n - 1; - nm12 = nm1 + nm1; - nm13 = nm12 + nm1; - idir = 0; - /* + nm1 = n - 1; + nm12 = nm1 + nm1; + nm13 = nm12 + nm1; + idir = 0; + /* * get machine constants * */ - { - char cmach_eps = 'E'; - char cmach_unfl = 'S'; - call_lamch(cmach_eps, eps); - call_lamch(cmach_unfl, unfl); - } - /* + { + char cmach_eps = 'E'; + char cmach_unfl = 'S'; + call_lamch(cmach_eps, eps); + call_lamch(cmach_unfl, unfl); + } + /* * if matrix lower bidiagonal, rotate to be upper bidiagonal * by applying givens rotations on the left */ - if (lower) { - // do 10 i = 1, n - 1 - for (i = 1; i <= (n - 1); i++) { - call_lartg(d(i), e(i), cs, sn, r); - d(i) = r; - e(i) = sn * d(i + 1); - d(i + 1) = cs * d(i + 1); - work(i) = cs; - work(nm1 + i) = sn; - } - L10: + if(lower) + { + // do 10 i = 1, n - 1 + for(i = 1; i <= (n - 1); i++) + { + call_lartg(d(i), e(i), cs, sn, r); + d(i) = r; + e(i) = sn * d(i + 1); + d(i + 1) = cs * d(i + 1); + work(i) = cs; + work(nm1 + i) = sn; + } + L10: - /* + /* * update singular vectors if desired */ - if (nru > 0) { - // call_lasr( 'r', 'v', 'f', nru, n, work( 1 ), work( n ), u, ldu ); - char side = 'R'; - char pivot = 'V'; - char direct = 'F'; - if (use_gpu) { - - call_lasr_gpu(side, pivot, direct, nru, n, work(1), work(n), u(1, 1), - ldu, dwork, stream); - } else { - call_lasr(side, pivot, direct, nru, n, work(1), work(n), u(1, 1), ldu); - } - } - if (ncc > 0) { - // call_lasr( 'l', 'v', 'f', n, ncc, work( 1 ), work( n ), c, ldc ); - char side = 'L'; - char pivot = 'V'; - char direct = 'F'; - if (use_gpu) { - call_lasr_gpu(side, pivot, direct, n, ncc, work(1), work(n), c(1, 1), - ldc, dwork, stream); - } else { - call_lasr(side, pivot, direct, n, ncc, work(1), work(n), c(1, 1), ldc); - } + if(nru > 0) + { + // call_lasr( 'r', 'v', 'f', nru, n, work( 1 ), work( n ), u, ldu ); + char side = 'R'; + char pivot = 'V'; + char direct = 'F'; + if(use_gpu) + { + call_lasr_gpu(side, pivot, direct, nru, n, work(1), work(n), u(1, 1), ldu, dwork, + stream); + } + else + { + call_lasr(side, pivot, direct, nru, n, work(1), work(n), u(1, 1), ldu); + } + } + if(ncc > 0) + { + // call_lasr( 'l', 'v', 'f', n, ncc, work( 1 ), work( n ), c, ldc ); + char side = 'L'; + char pivot = 'V'; + char direct = 'F'; + if(use_gpu) + { + call_lasr_gpu(side, pivot, direct, n, ncc, work(1), work(n), c(1, 1), ldc, dwork, + stream); + } + else + { + call_lasr(side, pivot, direct, n, ncc, work(1), work(n), c(1, 1), ldc); + } + } } - } - /* + /* * compute singular values to relative accuracy tol * (by setting tol to be negative, algorithm will compute * singular values to absolute accuracy abs(tol)*norm(input matrix)) */ - tolmul = max(ten, min(hndrd, pow(eps, meight))); - tol = tolmul * eps; + tolmul = max(ten, min(hndrd, pow(eps, meight))); + tol = tolmul * eps; - /* + /* * compute approximate maximum, minimum singular values */ - /* + /* smax = zero do 20 i = 1, n smax = max( smax, abs( d( i ) ) ) @@ -1085,993 +1379,1151 @@ static void bdsqr_single_template(char uplo, I n, smax = max( smax, abs( e( i ) ) ) L30: */ - smax = zero; - // do 20 i = 1, 
n - for (i = 1; i <= n; i++) { - smax = max(smax, abs(d(i))); - } + smax = zero; + // do 20 i = 1, n + for(i = 1; i <= n; i++) + { + smax = max(smax, abs(d(i))); + } L20: - // do 30 i = 1, n - 1 - for (i = 1; i <= (n - 1); i++) { - smax = max(smax, abs(e(i))); - } + // do 30 i = 1, n - 1 + for(i = 1; i <= (n - 1); i++) + { + smax = max(smax, abs(e(i))); + } L30: - sminl = zero; - if (tol >= zero) { - /* + sminl = zero; + if(tol >= zero) + { + /* * relative accuracy desired */ - sminoa = abs(d(1)); - if (sminoa == zero) - goto L50; - mu = sminoa; - // do 40 i = 2, n - for (i = 2; i <= n; i++) { - mu = abs(d(i)) * (mu / (mu + abs(e(i - 1)))); - sminoa = min(sminoa, mu); - if (sminoa == zero) - goto L50; - } - L40: - L50: + sminoa = abs(d(1)); + if(sminoa == zero) + goto L50; + mu = sminoa; + // do 40 i = 2, n + for(i = 2; i <= n; i++) + { + mu = abs(d(i)) * (mu / (mu + abs(e(i - 1)))); + sminoa = min(sminoa, mu); + if(sminoa == zero) + goto L50; + } + L40: + L50: - sminoa = sminoa / sqrt(dble(n)); - thresh = max(tol * sminoa, maxitr * (n * (n * unfl))); - } else { - /* + sminoa = sminoa / sqrt(dble(n)); + thresh = max(tol * sminoa, maxitr * (n * (n * unfl))); + } + else + { + /* * absolute accuracy desired */ - thresh = max(abs(tol) * smax, maxitr * (n * (n * unfl))); - } - /* + thresh = max(abs(tol) * smax, maxitr * (n * (n * unfl))); + } + /* * prepare for main iteration loop for the singular values * (maxit is the maximum number of passes through the inner * loop permitted before nonconvergence signalled.) */ - maxitdivn = maxitr * n; - iterdivn = 0; - iter = -1; - oldll = -1; - oldm = -1; - /* + maxitdivn = maxitr * n; + iterdivn = 0; + iter = -1; + oldll = -1; + oldm = -1; + /* * m points to last element of unconverged part of matrix */ - m = n; - /* + m = n; + /* * begin main iteration loop */ L60: - /* + /* * check for convergence or exceeding iteration count */ - if (m <= 1) - goto L160; - - if (iter >= n) { - iter = iter - n; - iterdivn = iterdivn + 1; - if (iterdivn >= maxitdivn) - goto L200; - } - /* + if(m <= 1) + goto L160; + + if(iter >= n) + { + iter = iter - n; + iterdivn = iterdivn + 1; + if(iterdivn >= maxitdivn) + goto L200; + } + /* * find diagonal block of matrix to work on */ - if (tol < zero && abs(d(m)) <= thresh) - d(m) = zero; - - smax = abs(d(m)); - smin = smax; - // do 70 lll = 1, m - 1 - for (lll = 1; lll <= (m - 1); lll++) { - ll = m - lll; - abss = abs(d(ll)); - abse = abs(e(ll)); - if (tol < zero && abss <= thresh) - d(ll) = zero; - if (abse <= thresh) - goto L80; - smin = min(smin, abss); - smax = max(smax, max(abss, abse)); - } + if(tol < zero && abs(d(m)) <= thresh) + d(m) = zero; + + smax = abs(d(m)); + smin = smax; + // do 70 lll = 1, m - 1 + for(lll = 1; lll <= (m - 1); lll++) + { + ll = m - lll; + abss = abs(d(ll)); + abse = abs(e(ll)); + if(tol < zero && abss <= thresh) + d(ll) = zero; + if(abse <= thresh) + goto L80; + smin = min(smin, abss); + smax = max(smax, max(abss, abse)); + } L70: - ll = 0; - goto L90; + ll = 0; + goto L90; L80: - e(ll) = zero; - /* + e(ll) = zero; + /* * matrix splits since e(ll) = 0 */ - if (ll == m - 1) { - /* + if(ll == m - 1) + { + /* * convergence of bottom singular value, return to top of loop */ - m = m - 1; - goto L60; - } + m = m - 1; + goto L60; + } L90: - ll = ll + 1; - /* + ll = ll + 1; + /* * e(ll) through e(m-1) are nonzero, e(ll-1) is zero */ - if (ll == m - 1) { - /* + if(ll == m - 1) + { + /* * 2 by 2 block, handle separately */ - call_lasv2(d(m - 1), e(m - 1), d(m), sigmn, sigmx, sinr, cosr, sinl, cosl); - 
d(m - 1) = sigmx; - e(m - 1) = zero; - d(m) = sigmn; - /* + call_lasv2(d(m - 1), e(m - 1), d(m), sigmn, sigmx, sinr, cosr, sinl, cosl); + d(m - 1) = sigmx; + e(m - 1) = zero; + d(m) = sigmn; + /* * compute singular vectors, if desired */ #ifdef USE_VT - if (ncvt > 0) { - if (use_gpu) { - call_rot_gpu(ncvt, vt(m - 1, 1), ldvt, vt(m, 1), ldvt, cosr, sinr); - } else { - call_rot(ncvt, vt(m - 1, 1), ldvt, vt(m, 1), ldvt, cosr, sinr); - } - } + if(ncvt > 0) + { + if(use_gpu) + { + call_rot_gpu(ncvt, vt(m - 1, 1), ldvt, vt(m, 1), ldvt, cosr, sinr); + } + else + { + call_rot(ncvt, vt(m - 1, 1), ldvt, vt(m, 1), ldvt, cosr, sinr); + } + } #else - if (nrv > 0) { - if (use_gpu) { - call_rot_gpu(nrv, v(1,m - 1), 1, v(1,m), 1, cosr, sinr); - } else { - call_rot(nrv, v(1,m - 1), 1, v(1,m), 1, cosr, sinr); - } - } + if(nrv > 0) + { + if(use_gpu) + { + call_rot_gpu(nrv, v(1, m - 1), 1, v(1, m), 1, cosr, sinr); + } + else + { + call_rot(nrv, v(1, m - 1), 1, v(1, m), 1, cosr, sinr); + } + } #endif - if (nru > 0) { - if (use_gpu) { - call_rot_gpu(nru, u(1, m - 1), ione, u(1, m), ione, cosl, sinl); - } else { - call_rot(nru, u(1, m - 1), ione, u(1, m), ione, cosl, sinl); - } - } - if (ncc > 0) { - if (use_gpu) { - call_rot_gpu(ncc, c(m - 1, 1), ldc, c(m, 1), ldc, cosl, sinl); - } else { - call_rot(ncc, c(m - 1, 1), ldc, c(m, 1), ldc, cosl, sinl); - } + if(nru > 0) + { + if(use_gpu) + { + call_rot_gpu(nru, u(1, m - 1), ione, u(1, m), ione, cosl, sinl); + } + else + { + call_rot(nru, u(1, m - 1), ione, u(1, m), ione, cosl, sinl); + } + } + if(ncc > 0) + { + if(use_gpu) + { + call_rot_gpu(ncc, c(m - 1, 1), ldc, c(m, 1), ldc, cosl, sinl); + } + else + { + call_rot(ncc, c(m - 1, 1), ldc, c(m, 1), ldc, cosl, sinl); + } + } + m = m - 2; + goto L60; } - m = m - 2; - goto L60; - } - /* + /* * if working on new submatrix, choose shift direction * (from larger end diagonal element towards smaller) */ - if (ll > oldm || m < oldll) { - if (abs(d(ll)) >= abs(d(m))) { - /* + if(ll > oldm || m < oldll) + { + if(abs(d(ll)) >= abs(d(m))) + { + /* * chase bulge from top (big end) to bottom (small end) */ - idir = 1; - } else { - /* + idir = 1; + } + else + { + /* * chase bulge from bottom (big end) to top (small end) */ - idir = 2; + idir = 2; + } } - } - /* + /* * apply convergence tests */ - if (idir == 1) { - /* + if(idir == 1) + { + /* * run convergence test in forward direction * first apply standard test to bottom of matrix */ - if (abs(e(m - 1)) <= abs(tol) * abs(d(m)) || - (tol < zero && abs(e(m - 1)) <= thresh)) { - e(m - 1) = zero; - goto L60; - } + if(abs(e(m - 1)) <= abs(tol) * abs(d(m)) || (tol < zero && abs(e(m - 1)) <= thresh)) + { + e(m - 1) = zero; + goto L60; + } - if (tol >= zero) { - /* + if(tol >= zero) + { + /* * if relative accuracy desired, * apply convergence criterion forward */ - mu = abs(d(ll)); - sminl = mu; - // do 100 lll = ll, m - 1 - for (lll = ll; lll <= (m - 1); lll++) { - if (abs(e(lll)) <= tol * mu) { - e(lll) = zero; - goto L60; + mu = abs(d(ll)); + sminl = mu; + // do 100 lll = ll, m - 1 + for(lll = ll; lll <= (m - 1); lll++) + { + if(abs(e(lll)) <= tol * mu) + { + e(lll) = zero; + goto L60; + } + mu = abs(d(lll + 1)) * (mu / (mu + abs(e(lll)))); + sminl = min(sminl, mu); + } + // L100: } - mu = abs(d(lll + 1)) * (mu / (mu + abs(e(lll)))); - sminl = min(sminl, mu); - } - // L100: } - - } else { - /* + else + { + /* * run convergence test in backward direction * first apply standard test to top of matrix */ - if (abs(e(ll)) <= abs(tol) * abs(d(ll)) || - (tol < zero && abs(e(ll)) <= thresh)) 
{ - e(ll) = zero; - goto L60; - } + if(abs(e(ll)) <= abs(tol) * abs(d(ll)) || (tol < zero && abs(e(ll)) <= thresh)) + { + e(ll) = zero; + goto L60; + } - if (tol >= zero) { - /* + if(tol >= zero) + { + /* * if relative accuracy desired, * apply convergence criterion backward */ - mu = abs(d(m)); - sminl = mu; - // do 110 lll = m - 1, ll, -1 - for (lll = (m - 1); lll >= ll; lll--) { - if (abs(e(lll)) <= tol * mu) { - e(lll) = zero; - goto L60; + mu = abs(d(m)); + sminl = mu; + // do 110 lll = m - 1, ll, -1 + for(lll = (m - 1); lll >= ll; lll--) + { + if(abs(e(lll)) <= tol * mu) + { + e(lll) = zero; + goto L60; + } + mu = abs(d(lll)) * (mu / (mu + abs(e(lll)))); + sminl = min(sminl, mu); + } + // L110: } - mu = abs(d(lll)) * (mu / (mu + abs(e(lll)))); - sminl = min(sminl, mu); - } - // L110: } - } - oldll = ll; - oldm = m; - /* + oldll = ll; + oldm = m; + /* * compute shift. first, test if shifting would ruin relative * accuracy, and if so set the shift to zero. */ - if (tol >= zero && n * tol * (sminl / smax) <= max(eps, hndrth * tol)) { - /* + if(tol >= zero && n * tol * (sminl / smax) <= max(eps, hndrth * tol)) + { + /* * use a zero shift to avoid loss of relative accuracy */ - shift = zero; - } else { - /* + shift = zero; + } + else + { + /* * compute the shift from 2-by-2 block at end of matrix */ - if (idir == 1) { - sll = abs(d(ll)); - call_las2(d(m - 1), e(m - 1), d(m), shift, r); - } else { - sll = abs(d(m)); - call_las2(d(ll), e(ll), d(ll + 1), shift, r); - } - /* + if(idir == 1) + { + sll = abs(d(ll)); + call_las2(d(m - 1), e(m - 1), d(m), shift, r); + } + else + { + sll = abs(d(m)); + call_las2(d(ll), e(ll), d(ll + 1), shift, r); + } + /* * test if shift negligible, and if so set to zero */ - if (sll > zero) { - if ((shift / sll) * (shift / sll) < eps) - shift = zero; + if(sll > zero) + { + if((shift / sll) * (shift / sll) < eps) + shift = zero; + } } - } - /* + /* * increment iteration count */ - iter = iter + m - ll; - /* + iter = iter + m - ll; + /* * if shift = 0, do simplified qr iteration */ - if (shift == zero) { - if (idir == 1) { - /* + if(shift == zero) + { + if(idir == 1) + { + /* * chase bulge from top to bottom * save cosines and sines for later singular vector updates */ - cs = one; - oldcs = one; - // do 120 i = ll, m - 1 - for (i = ll; i <= (m - 1); i++) { - auto di_cs = d(i) * cs; - call_lartg(di_cs, e(i), cs, sn, r); - if (i > ll) - e(i - 1) = oldsn * r; - auto oldcs_r = oldcs * r; - auto dip1_sn = d(i + 1) * sn; - call_lartg(oldcs_r, dip1_sn, oldcs, oldsn, d(i)); - work(i - ll + 1) = cs; - work(i - ll + 1 + nm1) = sn; - work(i - ll + 1 + nm12) = oldcs; - work(i - ll + 1 + nm13) = oldsn; - } - L120: - h = d(m) * cs; - d(m) = h * oldcs; - e(m - 1) = h * oldsn; - /* + cs = one; + oldcs = one; + // do 120 i = ll, m - 1 + for(i = ll; i <= (m - 1); i++) + { + auto di_cs = d(i) * cs; + call_lartg(di_cs, e(i), cs, sn, r); + if(i > ll) + e(i - 1) = oldsn * r; + auto oldcs_r = oldcs * r; + auto dip1_sn = d(i + 1) * sn; + call_lartg(oldcs_r, dip1_sn, oldcs, oldsn, d(i)); + work(i - ll + 1) = cs; + work(i - ll + 1 + nm1) = sn; + work(i - ll + 1 + nm12) = oldcs; + work(i - ll + 1 + nm13) = oldsn; + } + L120: + h = d(m) * cs; + d(m) = h * oldcs; + e(m - 1) = h * oldsn; + /* * update singular vectors */ #ifdef USE_VT - if (ncvt > 0) { - // call_lasr( 'l', 'v', 'f', m-ll+1, ncvt, work( 1 ), work( n ), vt( - // ll, 1 ), ldvt ) - char side = 'L'; - char pivot = 'V'; - char direct = 'F'; - auto mm = m - ll + 1; - if (use_gpu) { - call_lasr_gpu(side, pivot, direct, mm, ncvt, 
work(1), work(n), - vt(ll, 1), ldvt, dwork, stream); - } else { - call_lasr(side, pivot, direct, mm, ncvt, work(1), work(n), vt(ll, 1), - ldvt); - } - } + if(ncvt > 0) + { + // call_lasr( 'l', 'v', 'f', m-ll+1, ncvt, work( 1 ), work( n ), vt( + // ll, 1 ), ldvt ) + char side = 'L'; + char pivot = 'V'; + char direct = 'F'; + auto mm = m - ll + 1; + if(use_gpu) + { + call_lasr_gpu(side, pivot, direct, mm, ncvt, work(1), work(n), vt(ll, 1), ldvt, + dwork, stream); + } + else + { + call_lasr(side, pivot, direct, mm, ncvt, work(1), work(n), vt(ll, 1), ldvt); + } + } #else - if (nrv > 0) { - // call_lasr( 'r', 'v', 'f', nrv, m-ll+1, work( 1 ), work( n ), - // v(1,ll ), ldv) - char side = 'R'; - char pivot = 'V'; - char direct = 'F'; - auto mm = m - ll + 1; - if (use_gpu) { - call_lasr_gpu(side, pivot, direct, nrv, mm, work(1), work(n), - v(1,ll), ldv, dwork, stream); - } else { - call_lasr(side, pivot, direct, nrv, mm, work(1), work(n), v(1,ll), - ldv); - } - } + if(nrv > 0) + { + // call_lasr( 'r', 'v', 'f', nrv, m-ll+1, work( 1 ), work( n ), + // v(1,ll ), ldv) + char side = 'R'; + char pivot = 'V'; + char direct = 'F'; + auto mm = m - ll + 1; + if(use_gpu) + { + call_lasr_gpu(side, pivot, direct, nrv, mm, work(1), work(n), v(1, ll), ldv, + dwork, stream); + } + else + { + call_lasr(side, pivot, direct, nrv, mm, work(1), work(n), v(1, ll), ldv); + } + } #endif - if (nru > 0) { - // call_lasr( 'r', 'v', 'f', nru, m-ll+1, work( nm12+1 ), work( nm13+1 - // ), u( 1, ll ), ldu ) - char side = 'R'; - char pivot = 'V'; - char direct = 'F'; - auto mm = m - ll + 1; - if (use_gpu) { - call_lasr_gpu(side, pivot, direct, nru, mm, work(nm12 + 1), - work(nm13 + 1), u(1, ll), ldu, dwork, stream); - } else { - call_lasr(side, pivot, direct, nru, mm, work(nm12 + 1), - work(nm13 + 1), u(1, ll), ldu); - } - } - if (ncc > 0) { - // call_lasr( 'l', 'v', 'f', m-ll+1, ncc, work( nm12+1 ), work( nm13+1 - // ), c( ll, 1 ), ldc ) - char side = 'L'; - char pivot = 'V'; - char direct = 'F'; - auto mm = m - ll + 1; - if (use_gpu) { - call_lasr_gpu(side, pivot, direct, mm, ncc, work(nm12 + 1), - work(nm13 + 1), c(ll, 1), ldc, dwork, stream); - } else { - call_lasr(side, pivot, direct, mm, ncc, work(nm12 + 1), - work(nm13 + 1), c(ll, 1), ldc); - } - } - /* + if(nru > 0) + { + // call_lasr( 'r', 'v', 'f', nru, m-ll+1, work( nm12+1 ), work( nm13+1 + // ), u( 1, ll ), ldu ) + char side = 'R'; + char pivot = 'V'; + char direct = 'F'; + auto mm = m - ll + 1; + if(use_gpu) + { + call_lasr_gpu(side, pivot, direct, nru, mm, work(nm12 + 1), work(nm13 + 1), + u(1, ll), ldu, dwork, stream); + } + else + { + call_lasr(side, pivot, direct, nru, mm, work(nm12 + 1), work(nm13 + 1), + u(1, ll), ldu); + } + } + if(ncc > 0) + { + // call_lasr( 'l', 'v', 'f', m-ll+1, ncc, work( nm12+1 ), work( nm13+1 + // ), c( ll, 1 ), ldc ) + char side = 'L'; + char pivot = 'V'; + char direct = 'F'; + auto mm = m - ll + 1; + if(use_gpu) + { + call_lasr_gpu(side, pivot, direct, mm, ncc, work(nm12 + 1), work(nm13 + 1), + c(ll, 1), ldc, dwork, stream); + } + else + { + call_lasr(side, pivot, direct, mm, ncc, work(nm12 + 1), work(nm13 + 1), + c(ll, 1), ldc); + } + } + /* * test convergence */ - if (abs(e(m - 1)) <= thresh) - e(m - 1) = zero; - - } else { - /* + if(abs(e(m - 1)) <= thresh) + e(m - 1) = zero; + } + else + { + /* * chase bulge from bottom to top * save cosines and sines for later singular vector updates */ - cs = one; - oldcs = one; - // do 130 i = m, ll + 1, -1 - for (i = m; i >= (ll + 1); i--) { - auto di_cs = d(i) * cs; - call_lartg(di_cs, e(i 
- 1), cs, sn, r); - - if (i < m) - e(i) = oldsn * r; - - auto oldcs_r = oldcs * r; - auto dim1_sn = d(i - 1) * sn; - call_lartg(oldcs_r, dim1_sn, oldcs, oldsn, d(i)); - - work(i - ll) = cs; - work(i - ll + nm1) = -sn; - work(i - ll + nm12) = oldcs; - work(i - ll + nm13) = -oldsn; - } - L130: - h = d(ll) * cs; - d(ll) = h * oldcs; - e(ll) = h * oldsn; - /* + cs = one; + oldcs = one; + // do 130 i = m, ll + 1, -1 + for(i = m; i >= (ll + 1); i--) + { + auto di_cs = d(i) * cs; + call_lartg(di_cs, e(i - 1), cs, sn, r); + + if(i < m) + e(i) = oldsn * r; + + auto oldcs_r = oldcs * r; + auto dim1_sn = d(i - 1) * sn; + call_lartg(oldcs_r, dim1_sn, oldcs, oldsn, d(i)); + + work(i - ll) = cs; + work(i - ll + nm1) = -sn; + work(i - ll + nm12) = oldcs; + work(i - ll + nm13) = -oldsn; + } + L130: + h = d(ll) * cs; + d(ll) = h * oldcs; + e(ll) = h * oldsn; + /* * update singular vectors */ #ifdef USE_VT - if (ncvt > 0) { - // call_lasr( 'l', 'v', 'b', m-ll+1, ncvt, work( nm12+1 ), work( - // nm13+1 - // ), vt( ll, 1 ), ldvt ); - char side = 'L'; - char pivot = 'V'; - char direct = 'B'; - auto mm = m - ll + 1; - if (use_gpu) { - call_lasr_gpu(side, pivot, direct, mm, ncvt, work(nm12 + 1), - work(nm13 + 1), vt(ll, 1), ldvt, dwork, stream); - } else { - call_lasr(side, pivot, direct, mm, ncvt, work(nm12 + 1), - work(nm13 + 1), vt(ll, 1), ldvt); - } - } + if(ncvt > 0) + { + // call_lasr( 'l', 'v', 'b', m-ll+1, ncvt, work( nm12+1 ), work( + // nm13+1 + // ), vt( ll, 1 ), ldvt ); + char side = 'L'; + char pivot = 'V'; + char direct = 'B'; + auto mm = m - ll + 1; + if(use_gpu) + { + call_lasr_gpu(side, pivot, direct, mm, ncvt, work(nm12 + 1), work(nm13 + 1), + vt(ll, 1), ldvt, dwork, stream); + } + else + { + call_lasr(side, pivot, direct, mm, ncvt, work(nm12 + 1), work(nm13 + 1), + vt(ll, 1), ldvt); + } + } #else - if (nrv > 0) { - // call_lasr( 'r', 'v', 'b', nrv, m-ll+1, work( nm12+1 ), work( nm13+1 ), v( 1,ll ), ldv ); - char side = 'R'; - char pivot = 'V'; - char direct = 'B'; - auto mm = m - ll + 1; - if (use_gpu) { - call_lasr_gpu(side, pivot, direct, nrv, mm, work(nm12 + 1), - work(nm13 + 1), v(1,ll), ldv, dwork, stream); - } else { - call_lasr(side, pivot, direct, nrv, mm, work(nm12 + 1), - work(nm13 + 1), v(1,ll), ldv); - } - } - + if(nrv > 0) + { + // call_lasr( 'r', 'v', 'b', nrv, m-ll+1, work( nm12+1 ), work( nm13+1 ), v( 1,ll ), ldv ); + char side = 'R'; + char pivot = 'V'; + char direct = 'B'; + auto mm = m - ll + 1; + if(use_gpu) + { + call_lasr_gpu(side, pivot, direct, nrv, mm, work(nm12 + 1), work(nm13 + 1), + v(1, ll), ldv, dwork, stream); + } + else + { + call_lasr(side, pivot, direct, nrv, mm, work(nm12 + 1), work(nm13 + 1), + v(1, ll), ldv); + } + } #endif - if (nru > 0) { - // call_lasr( 'r', 'v', 'b', nru, m-ll+1, work( 1 ), work( n ), u( 1, - // ll - // ), ldu ) - char side = 'R'; - char pivot = 'V'; - char direct = 'B'; - auto mm = m - ll + 1; - if (use_gpu) { - call_lasr_gpu(side, pivot, direct, nru, mm, work(1), work(n), - u(1, ll), ldu, dwork, stream); - } else { - call_lasr(side, pivot, direct, nru, mm, work(1), work(n), u(1, ll), - ldu); - } - } - if (ncc > 0) { - // call_lasr( 'l', 'v', 'b', m-ll+1, ncc, work( 1 ), work( n ), c( ll, - // 1 - // ), ldc ) - char side = 'L'; - char pivot = 'V'; - char direct = 'B'; - auto mm = m - ll + 1; - if (use_gpu) { - call_lasr_gpu(side, pivot, direct, mm, ncc, work(1), work(n), - c(ll, 1), ldc, dwork, stream); - } else { - call_lasr(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), - ldc); - } - } - /* + if(nru > 0) + { + // 
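// For reference: call_lartg(f, g, cs, sn, r) generates the plane rotation used throughout the
// bulge chase above, i.e. values satisfying  cs*f + sn*g = r  and  -sn*f + cs*g = 0.
// A minimal sketch (assuming <cmath>; without the scaling and sign conventions of the real
// LAPACK xLARTG, and the helper name is illustrative):
template <typename S>
static void lartg_sketch(S f, S g, S& cs, S& sn, S& r)
{
    if(g == S(0))
    {
        cs = S(1);
        sn = S(0);
        r = f;
    }
    else
    {
        r = std::hypot(f, g);
        cs = f / r;
        sn = g / r;
    }
}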
call_lasr( 'r', 'v', 'b', nru, m-ll+1, work( 1 ), work( n ), u( 1, + // ll + // ), ldu ) + char side = 'R'; + char pivot = 'V'; + char direct = 'B'; + auto mm = m - ll + 1; + if(use_gpu) + { + call_lasr_gpu(side, pivot, direct, nru, mm, work(1), work(n), u(1, ll), ldu, + dwork, stream); + } + else + { + call_lasr(side, pivot, direct, nru, mm, work(1), work(n), u(1, ll), ldu); + } + } + if(ncc > 0) + { + // call_lasr( 'l', 'v', 'b', m-ll+1, ncc, work( 1 ), work( n ), c( ll, + // 1 + // ), ldc ) + char side = 'L'; + char pivot = 'V'; + char direct = 'B'; + auto mm = m - ll + 1; + if(use_gpu) + { + call_lasr_gpu(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), ldc, + dwork, stream); + } + else + { + call_lasr(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), ldc); + } + } + /* * test convergence */ - if (abs(e(ll)) <= thresh) - e(ll) = zero; + if(abs(e(ll)) <= thresh) + e(ll) = zero; + } } - } else { - /* + else + { + /* * use nonzero shift */ - if (idir == 1) { - /* + if(idir == 1) + { + /* * chase bulge from top to bottom * save cosines and sines for later singular vector updates */ - f = (abs(d(ll)) - shift) * (sign(one, d(ll)) + shift / d(ll)); - g = e(ll); - // do 140 i = ll, m - 1 - for (i = ll; i <= (m - 1); i++) { - call_lartg(f, g, cosr, sinr, r); - if (i > ll) - e(i - 1) = r; - f = cosr * d(i) + sinr * e(i); - e(i) = cosr * e(i) - sinr * d(i); - g = sinr * d(i + 1); - d(i + 1) = cosr * d(i + 1); - call_lartg(f, g, cosl, sinl, r); - d(i) = r; - f = cosl * e(i) + sinl * d(i + 1); - d(i + 1) = cosl * d(i + 1) - sinl * e(i); - if (i < m - 1) { - g = sinl * e(i + 1); - e(i + 1) = cosl * e(i + 1); - } - work(i - ll + 1) = cosr; - work(i - ll + 1 + nm1) = sinr; - work(i - ll + 1 + nm12) = cosl; - work(i - ll + 1 + nm13) = sinl; - } - L140: - e(m - 1) = f; - /* + f = (abs(d(ll)) - shift) * (sign(one, d(ll)) + shift / d(ll)); + g = e(ll); + // do 140 i = ll, m - 1 + for(i = ll; i <= (m - 1); i++) + { + call_lartg(f, g, cosr, sinr, r); + if(i > ll) + e(i - 1) = r; + f = cosr * d(i) + sinr * e(i); + e(i) = cosr * e(i) - sinr * d(i); + g = sinr * d(i + 1); + d(i + 1) = cosr * d(i + 1); + call_lartg(f, g, cosl, sinl, r); + d(i) = r; + f = cosl * e(i) + sinl * d(i + 1); + d(i + 1) = cosl * d(i + 1) - sinl * e(i); + if(i < m - 1) + { + g = sinl * e(i + 1); + e(i + 1) = cosl * e(i + 1); + } + work(i - ll + 1) = cosr; + work(i - ll + 1 + nm1) = sinr; + work(i - ll + 1 + nm12) = cosl; + work(i - ll + 1 + nm13) = sinl; + } + L140: + e(m - 1) = f; + /* * update singular vectors */ #ifdef USE_VT - if (ncvt > 0) { - // call_lasr( 'l', 'v', 'f', m-ll+1, ncvt, work( 1 ), work( n ), vt( - // ll, 1 ), ldvt ) - char side = 'L'; - char pivot = 'V'; - char direct = 'F'; - auto mm = m - ll + 1; - if (use_gpu) { - call_lasr_gpu(side, pivot, direct, mm, ncvt, work(1), work(n), - vt(ll, 1), ldvt, dwork, stream); - } else { - call_lasr(side, pivot, direct, mm, ncvt, work(1), work(n), vt(ll, 1), - ldvt); - } - } + if(ncvt > 0) + { + // call_lasr( 'l', 'v', 'f', m-ll+1, ncvt, work( 1 ), work( n ), vt( + // ll, 1 ), ldvt ) + char side = 'L'; + char pivot = 'V'; + char direct = 'F'; + auto mm = m - ll + 1; + if(use_gpu) + { + call_lasr_gpu(side, pivot, direct, mm, ncvt, work(1), work(n), vt(ll, 1), ldvt, + dwork, stream); + } + else + { + call_lasr(side, pivot, direct, mm, ncvt, work(1), work(n), vt(ll, 1), ldvt); + } + } #else - if (nrv > 0) { - // call_lasr( 'r', 'v', 'f', nrv, m-ll+1, work( 1 ), work( n ), v( 1, ll ), ldv ) - char side = 'R'; - char pivot = 'V'; - char direct = 'F'; - auto mm = m - 
ll + 1; - if (use_gpu) { - call_lasr_gpu(side, pivot, direct, nrv, mm, work(1), work(n), - v(1,ll), ldv, dwork, stream); - } else { - call_lasr(side, pivot, direct, nrv, mm, work(1), work(n), v(1,ll), - ldv); - } - } + if(nrv > 0) + { + // call_lasr( 'r', 'v', 'f', nrv, m-ll+1, work( 1 ), work( n ), v( 1, ll ), ldv ) + char side = 'R'; + char pivot = 'V'; + char direct = 'F'; + auto mm = m - ll + 1; + if(use_gpu) + { + call_lasr_gpu(side, pivot, direct, nrv, mm, work(1), work(n), v(1, ll), ldv, + dwork, stream); + } + else + { + call_lasr(side, pivot, direct, nrv, mm, work(1), work(n), v(1, ll), ldv); + } + } #endif - if (nru > 0) { - // call_lasr( 'r', 'v', 'f', nru, m-ll+1, work( nm12+1 ), work( nm13+1 - // ), u( 1, ll ), ldu ) - char side = 'R'; - char pivot = 'V'; - char direct = 'F'; - auto mm = m - ll + 1; - if (use_gpu) { - call_lasr_gpu(side, pivot, direct, nru, mm, work(nm12 + 1), - work(nm13 + 1), u(1, ll), ldu, dwork, stream); - } else { - call_lasr(side, pivot, direct, nru, mm, work(nm12 + 1), - work(nm13 + 1), u(1, ll), ldu); - } - } - if (ncc > 0) { - // call_lasr( 'l', 'v', 'f', m-ll+1, ncc, work( nm12+1 ), work( nm13+1 - // ), c( ll, 1 ), ldc ) - char side = 'L'; - char pivot = 'V'; - char direct = 'F'; - auto mm = m - ll + 1; - if (use_gpu) { - call_lasr_gpu(side, pivot, direct, mm, ncc, work(nm12 + 1), - work(nm13 + 1), c(ll, 1), ldc, dwork, stream); - } else { - call_lasr(side, pivot, direct, mm, ncc, work(nm12 + 1), - work(nm13 + 1), c(ll, 1), ldc); - } - } - /* + if(nru > 0) + { + // call_lasr( 'r', 'v', 'f', nru, m-ll+1, work( nm12+1 ), work( nm13+1 + // ), u( 1, ll ), ldu ) + char side = 'R'; + char pivot = 'V'; + char direct = 'F'; + auto mm = m - ll + 1; + if(use_gpu) + { + call_lasr_gpu(side, pivot, direct, nru, mm, work(nm12 + 1), work(nm13 + 1), + u(1, ll), ldu, dwork, stream); + } + else + { + call_lasr(side, pivot, direct, nru, mm, work(nm12 + 1), work(nm13 + 1), + u(1, ll), ldu); + } + } + if(ncc > 0) + { + // call_lasr( 'l', 'v', 'f', m-ll+1, ncc, work( nm12+1 ), work( nm13+1 + // ), c( ll, 1 ), ldc ) + char side = 'L'; + char pivot = 'V'; + char direct = 'F'; + auto mm = m - ll + 1; + if(use_gpu) + { + call_lasr_gpu(side, pivot, direct, mm, ncc, work(nm12 + 1), work(nm13 + 1), + c(ll, 1), ldc, dwork, stream); + } + else + { + call_lasr(side, pivot, direct, mm, ncc, work(nm12 + 1), work(nm13 + 1), + c(ll, 1), ldc); + } + } + /* * test convergence */ - if (abs(e(m - 1)) <= thresh) - e(m - 1) = zero; - - } else { - /* + if(abs(e(m - 1)) <= thresh) + e(m - 1) = zero; + } + else + { + /* * chase bulge from bottom to top * save cosines and sines for later singular vector updates */ - f = (abs(d(m)) - shift) * (sign(one, d(m)) + shift / d(m)); - g = e(m - 1); - // do 150 i = m, ll + 1, -1 - for (i = m; i >= (ll + 1); i--) { - call_lartg(f, g, cosr, sinr, r); - if (i < m) - e(i) = r; - f = cosr * d(i) + sinr * e(i - 1); - e(i - 1) = cosr * e(i - 1) - sinr * d(i); - g = sinr * d(i - 1); - d(i - 1) = cosr * d(i - 1); - call_lartg(f, g, cosl, sinl, r); - d(i) = r; - f = cosl * e(i - 1) + sinl * d(i - 1); - d(i - 1) = cosl * d(i - 1) - sinl * e(i - 1); - if (i > ll + 1) { - g = sinl * e(i - 2); - e(i - 2) = cosl * e(i - 2); - } - work(i - ll) = cosr; - work(i - ll + nm1) = -sinr; - work(i - ll + nm12) = cosl; - work(i - ll + nm13) = -sinl; - } - L150: - e(ll) = f; - /* + f = (abs(d(m)) - shift) * (sign(one, d(m)) + shift / d(m)); + g = e(m - 1); + // do 150 i = m, ll + 1, -1 + for(i = m; i >= (ll + 1); i--) + { + call_lartg(f, g, cosr, sinr, r); + if(i < m) + e(i) = r; 
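// For reference: the lead-off value used just above,
//   f = (|d| - shift) * (sign(1, d) + shift / d),
// is algebraically identical to (d*d - shift*shift) / d; the factored form is used so that d
// is never squared.  A direct, overflow-prone equivalent (illustrative helper, assumes d != 0):
template <typename S>
static S shifted_leadoff_naive(S d, S shift)
{
    return (d * d - shift * shift) / d;
}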
+ f = cosr * d(i) + sinr * e(i - 1); + e(i - 1) = cosr * e(i - 1) - sinr * d(i); + g = sinr * d(i - 1); + d(i - 1) = cosr * d(i - 1); + call_lartg(f, g, cosl, sinl, r); + d(i) = r; + f = cosl * e(i - 1) + sinl * d(i - 1); + d(i - 1) = cosl * d(i - 1) - sinl * e(i - 1); + if(i > ll + 1) + { + g = sinl * e(i - 2); + e(i - 2) = cosl * e(i - 2); + } + work(i - ll) = cosr; + work(i - ll + nm1) = -sinr; + work(i - ll + nm12) = cosl; + work(i - ll + nm13) = -sinl; + } + L150: + e(ll) = f; + /* * test convergence */ - if (abs(e(ll)) <= thresh) - e(ll) = zero; - /* + if(abs(e(ll)) <= thresh) + e(ll) = zero; + /* * update singular vectors if desired */ #ifdef USE_VT - if (ncvt > 0) { - // call_lasr( 'l', 'v', 'b', m-ll+1, ncvt, work( nm12+1 ), work( - // nm13+1 - // ), vt( ll, 1 ), ldvt ) - char side = 'L'; - char pivot = 'V'; - char direct = 'B'; - auto mm = m - ll + 1; - if (use_gpu) { - call_lasr_gpu(side, pivot, direct, mm, ncvt, work(nm12 + 1), - work(nm13 + 1), vt(ll, 1), ldvt, dwork, stream); - } else { - call_lasr(side, pivot, direct, mm, ncvt, work(nm12 + 1), - work(nm13 + 1), vt(ll, 1), ldvt); - } - } + if(ncvt > 0) + { + // call_lasr( 'l', 'v', 'b', m-ll+1, ncvt, work( nm12+1 ), work( + // nm13+1 + // ), vt( ll, 1 ), ldvt ) + char side = 'L'; + char pivot = 'V'; + char direct = 'B'; + auto mm = m - ll + 1; + if(use_gpu) + { + call_lasr_gpu(side, pivot, direct, mm, ncvt, work(nm12 + 1), work(nm13 + 1), + vt(ll, 1), ldvt, dwork, stream); + } + else + { + call_lasr(side, pivot, direct, mm, ncvt, work(nm12 + 1), work(nm13 + 1), + vt(ll, 1), ldvt); + } + } #else - if (nrv > 0) { - // call_lasr( 'r', 'v', 'b', nrv, m-ll+1, work( nm12+1 ), work( nm13+1 ), v( 1,ll ), ldv ) - char side = 'L'; - char pivot = 'V'; - char direct = 'B'; - auto mm = m - ll + 1; - if (use_gpu) { - call_lasr_gpu(side, pivot, direct, nrv, mm, work(nm12 + 1), - work(nm13 + 1), v(1,ll), ldv, dwork, stream); - } else { - call_lasr(side, pivot, direct, nrv, mm, work(nm12 + 1), - work(nm13 + 1), v(1,ll), ldv); - } - } + if(nrv > 0) + { + // call_lasr( 'r', 'v', 'b', nrv, m-ll+1, work( nm12+1 ), work( nm13+1 ), v( 1,ll ), ldv ) + char side = 'L'; + char pivot = 'V'; + char direct = 'B'; + auto mm = m - ll + 1; + if(use_gpu) + { + call_lasr_gpu(side, pivot, direct, nrv, mm, work(nm12 + 1), work(nm13 + 1), + v(1, ll), ldv, dwork, stream); + } + else + { + call_lasr(side, pivot, direct, nrv, mm, work(nm12 + 1), work(nm13 + 1), + v(1, ll), ldv); + } + } #endif - if (nru > 0) { - // call_lasr( 'r', 'v', 'b', nru, m-ll+1, work( 1 ), work( n ), u( 1, - // ll - // ), ldu ) - char side = 'R'; - char pivot = 'V'; - char direct = 'B'; - auto mm = m - ll + 1; - if (use_gpu) { - call_lasr_gpu(side, pivot, direct, nru, mm, work(1), work(n), - u(1, ll), ldu, dwork, stream); - } else { - call_lasr(side, pivot, direct, nru, mm, work(1), work(n), u(1, ll), - ldu); - } - } - if (ncc > 0) { - // call_lasr( 'l', 'v', 'b', m-ll+1, ncc, work( 1 ), work( n ), c( ll, - // 1 - // ), ldc ) - char side = 'L'; - char pivot = 'V'; - char direct = 'B'; - auto mm = m - ll + 1; - if (use_gpu) { - call_lasr_gpu(side, pivot, direct, mm, ncc, work(1), work(n), - c(ll, 1), ldc, dwork, stream); - } else { - call_lasr(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), - ldc); + if(nru > 0) + { + // call_lasr( 'r', 'v', 'b', nru, m-ll+1, work( 1 ), work( n ), u( 1, + // ll + // ), ldu ) + char side = 'R'; + char pivot = 'V'; + char direct = 'B'; + auto mm = m - ll + 1; + if(use_gpu) + { + call_lasr_gpu(side, pivot, direct, nru, mm, work(1), work(n), u(1, ll), 
ldu, + dwork, stream); + } + else + { + call_lasr(side, pivot, direct, nru, mm, work(1), work(n), u(1, ll), ldu); + } + } + if(ncc > 0) + { + // call_lasr( 'l', 'v', 'b', m-ll+1, ncc, work( 1 ), work( n ), c( ll, + // 1 + // ), ldc ) + char side = 'L'; + char pivot = 'V'; + char direct = 'B'; + auto mm = m - ll + 1; + if(use_gpu) + { + call_lasr_gpu(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), ldc, + dwork, stream); + } + else + { + call_lasr(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), ldc); + } + } } - } } - } - /* + /* * qr iteration finished, go back and check convergence */ - goto L60; + goto L60; /* * all singular values converged, so make them positive */ L160: - // do 170 i = 1, n - for (i = 1; i <= n; i++) { - if (d(i) < zero) { - d(i) = -d(i); - /* + // do 170 i = 1, n + for(i = 1; i <= n; i++) + { + if(d(i) < zero) + { + d(i) = -d(i); + /* * change sign of singular vectors, if desired */ #ifdef USE_VT - if (ncvt > 0) { - if (use_gpu) { - call_scal_gpu(ncvt, negone, vt(i, 1), ldvt); - } else { - call_scal(ncvt, negone, vt(i, 1), ldvt); - } - } + if(ncvt > 0) + { + if(use_gpu) + { + call_scal_gpu(ncvt, negone, vt(i, 1), ldvt); + } + else + { + call_scal(ncvt, negone, vt(i, 1), ldvt); + } + } #else - if (nrv > 0) { - if (use_gpu) { - call_scal_gpu(nrv, negone, v(1,i), 1); - } else { - call_scal(nrv, negone, v(1,i), 1); - } - } + if(nrv > 0) + { + if(use_gpu) + { + call_scal_gpu(nrv, negone, v(1, i), 1); + } + else + { + call_scal(nrv, negone, v(1, i), 1); + } + } #endif + } } - } L170: - bool const need_sort = false; - if (need_sort) { - /* + if(need_sort) + { + /* * sort the singular values into decreasing order (insertion sort on * singular values, but only one transposition per singular vector) */ - // do 190 i = 1, n - 1 - for (i = 1; i <= (n - 1); i++) { - /* + // do 190 i = 1, n - 1 + for(i = 1; i <= (n - 1); i++) + { + /* * scan for smallest d(i) */ - isub = 1; - smin = d(1); - // do 180 j = 2, n + 1 - i - for (j = 2; j <= (n + 1 - i); j++) { - if (d(j) <= smin) { - isub = j; - smin = d(j); - } - } - L180: - if (isub != n + 1 - i) { - /* + isub = 1; + smin = d(1); + // do 180 j = 2, n + 1 - i + for(j = 2; j <= (n + 1 - i); j++) + { + if(d(j) <= smin) + { + isub = j; + smin = d(j); + } + } + L180: + if(isub != n + 1 - i) + { + /* * swap singular values and vectors */ - d(isub) = d(n + 1 - i); - d(n + 1 - i) = smin; + d(isub) = d(n + 1 - i); + d(n + 1 - i) = smin; #ifdef USE_VT - if (ncvt > 0) { - if (use_gpu) { - call_swap_gpu(ncvt, vt(isub, 1), ldvt, vt(n + 1 - i, 1), ldvt); - } else { - call_swap(ncvt, vt(isub, 1), ldvt, vt(n + 1 - i, 1), ldvt); - } - } + if(ncvt > 0) + { + if(use_gpu) + { + call_swap_gpu(ncvt, vt(isub, 1), ldvt, vt(n + 1 - i, 1), ldvt); + } + else + { + call_swap(ncvt, vt(isub, 1), ldvt, vt(n + 1 - i, 1), ldvt); + } + } #else - if (nrv > 0) { - if (use_gpu) { - call_swap_gpu(nrv, v(1,isub), 1, v(1,n + 1 - i), 1); - } else { - call_swap(nrv, v(1,isub), 1, v(1,n + 1 - i), 1); - } - } + if(nrv > 0) + { + if(use_gpu) + { + call_swap_gpu(nrv, v(1, isub), ione, v(1, n + 1 - i), ione); + } + else + { + call_swap(nrv, v(1, isub), ione, v(1, n + 1 - i), ione); + } + } #endif - if (nru > 0) { - if (use_gpu) { - call_swap_gpu(nru, u(1, isub), ione, u(1, n + 1 - i), ione); - } else { - call_swap(nru, u(1, isub), ione, u(1, n + 1 - i), ione); - } - } - if (ncc > 0) { - if (use_gpu) { - call_swap_gpu(ncc, c(isub, 1), ldc, c(n + 1 - i, 1), ldc); - } else { - call_swap(ncc, c(isub, 1), ldc, c(n + 1 - i, 1), ldc); + if(nru > 0) + { + if(use_gpu) + 
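// For reference: in the clean-up pass above a negative d(i) is repaired by negating d(i) and
// only the i-th row of V**T, because  u_i * sigma_i * v_i^T == u_i * (-sigma_i) * (-v_i)^T,
// so U and C need no change.  A plain-loop sketch of that row negation for a column-major
// matrix with leading dimension ldvt (illustrative helper, not the scal call used here):
template <typename T, typename I>
static void negate_vt_row(I ncvt, T* vt_row_i, I ldvt)
{
    // element j of the row sits at offset j*ldvt from the first entry of the row
    for(I j = 0; j < ncvt; j++)
        vt_row_i[size_t(j) * ldvt] = -vt_row_i[size_t(j) * ldvt];
}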
{ + call_swap_gpu(nru, u(1, isub), ione, u(1, n + 1 - i), ione); + } + else + { + call_swap(nru, u(1, isub), ione, u(1, n + 1 - i), ione); + } + } + if(ncc > 0) + { + if(use_gpu) + { + call_swap_gpu(ncc, c(isub, 1), ldc, c(n + 1 - i, 1), ldc); + } + else + { + call_swap(ncc, c(isub, 1), ldc, c(n + 1 - i, 1), ldc); + } + } + } } - } - } - } - } // end if (need_sort) + } // end if (need_sort) L190: - goto L220; + goto L220; /* * maximum number of iterations exceeded, failure to converge */ L200: - info = 0; - // do 210 i = 1, n - 1 - for (i = 1; i <= (n - 1); i++) { - if (e(i) != zero) - info = info + 1; - } + info = 0; + // do 210 i = 1, n - 1 + for(i = 1; i <= (n - 1); i++) + { + if(e(i) != zero) + info = info + 1; + } L210: L220: - return; - /* + return; + /* * end of dbdsqr */ } - - - -template +template rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, - const rocblas_fill uplo_in, - const rocblas_int n, - const rocblas_int nv, - const rocblas_int nu, - const rocblas_int nc, - S* D, - const rocblas_stride strideD, - S* E, - const rocblas_stride strideE, - W1 V, - const rocblas_int shiftV, - const rocblas_int ldv, - const rocblas_stride strideV, - W2 U, - const rocblas_int shiftU, - const rocblas_int ldu, - const rocblas_stride strideU, - W3 C, - const rocblas_int shiftC, - const rocblas_int ldc, - const rocblas_stride strideC, - rocblas_int* info_array, - const rocblas_int batch_count, - rocblas_int* splits_map, - S* work) + const rocblas_fill uplo_in, + const I n, + const I nv, + const I nu, + const I nc, + S* D, + const rocblas_stride strideD, + S* E, + const rocblas_stride strideE, + W1 V, + const I shiftV, + const I ldv, + const rocblas_stride strideV, + W2 U, + const I shiftU, + const I ldu, + const rocblas_stride strideU, + W3 C, + const I shiftC, + const I ldc, + const rocblas_stride strideC, + I* info_array, + const I batch_count, + I* splits_map, + S* work) { + // ------------------------- + // copy D into hD, E into hE + // ------------------------- - // ------------------------- - // copy D into hD, E into hE - // ------------------------- - -hipStream_t stream; -rocblas_get_stream(handle, &stream ); - - + hipStream_t stream; + rocblas_get_stream(handle, &stream); -S * hD = nullptr; -S * hE = nullptr; -HIP_CHECK( hipHostMalloc( &hD, sizeof(S) * n * batch_count )); -HIP_CHECK( hipHostMalloc( &hE, sizeof(S) * (n-1) * batch_count )); + S* hD = nullptr; + S* hE = nullptr; + HIP_CHECK(hipHostMalloc(&hD, sizeof(S) * n * batch_count)); -bool const use_single_copy_for_D = (strideD == n); -if (use_single_copy_for_D) { - void * const dst = (void *) &(hD[0]); - void * const src = (void *) D; - size_t const sizeBytes = sizeof(S) * n * batch_count; - hipMemcpyKind const kind = hipMemcpyDeviceToHost; + // ------------------------------------------------------------------ + // Need to double checking whether array E is length n or length (n-1) + // ------------------------------------------------------------------ + HIP_CHECK(hipHostMalloc(&hE, sizeof(S) * (n - 1) * batch_count)); - HIP_CHECK( hipMemcpyAsync( dst, src, sizeBytes, kind, stream ) ); -} -else { - for(rocblas_int bid = 0; bid < batch_count; bid++) { - void * const dst = (void *) &(hD[ bid * n ]); - void * const src = (void *) D+ bid * strideD; - size_t const sizeBytes = sizeof(S) * n; - hipMemcpyKind const kind = hipMemcpyDeviceToHost; - HIP_CHECK( hipMemcpyAsync( dst, src, sizeBytes, kind, stream ) ); - } -} - -bool const use_single_copy_for_E = (strideE == (n-1)); -if (use_single_copy_for_E) { - void * 
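// For reference: the staging of the D and E arrays in this routine follows a common batched-copy
// pattern -- when the batch stride equals the per-matrix length the whole batch is one contiguous
// block and a single transfer suffices, otherwise each batch member is copied separately.
// A generic sketch of that decision (the helper and its names are illustrative):
template <typename S, typename I>
static hipError_t copy_batch_to_host(S* hdst, const S* dsrc, I len,
                                     rocblas_stride stride, I batch_count,
                                     hipStream_t stream)
{
    if(stride == len)
        return hipMemcpyAsync(hdst, dsrc, sizeof(S) * size_t(len) * batch_count,
                              hipMemcpyDeviceToHost, stream);

    for(I bid = 0; bid < batch_count; bid++)
    {
        hipError_t const istat
            = hipMemcpyAsync(hdst + size_t(bid) * len, dsrc + bid * stride,
                             sizeof(S) * len, hipMemcpyDeviceToHost, stream);
        if(istat != hipSuccess)
            return istat;
    }
    return hipSuccess;
}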
const dst = (void *) &(hE[0]); - void * const src = (void *) E; - size_t const sizeBytes = sizeof(S) * (n-1) * batch_count; - hipMemcpyKind const kind = hipMemcpyDeviceToHost; + // ------------------------------------------------- + // transfer arrays D(:) and E(:) from Device to Host + // ------------------------------------------------- - HIP_CHECK( hipMemcpyAsync( dst, src, sizeBytes, kind, stream ) ); -} -else { - for(rocblas_int bid = 0; bid < batch_count; bid++) { - void * const dst = (void *) &(hE[ bid * (n-1) ]); - void * const src = (void *) E+ bid * strideE; - size_t const sizeBytes = sizeof(S) * (n-1); - hipMemcpyKind const kind = hipMemcpyDeviceToHost; - HIP_CHECK( hipMemcpyAsync( dst, src, sizeBytes, kind, stream ) ); - } - -} + bool const use_single_copy_for_D = (strideD == n); + if(use_single_copy_for_D) + { + void* const dst = (void*)&(hD[0]); + void* const src = (void*)&(D[0]); + size_t const sizeBytes = sizeof(S) * n * batch_count; + hipMemcpyKind const kind = hipMemcpyDeviceToHost; + HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); + } + else + { + for(I bid = 0; bid < batch_count; bid++) + { + void* const dst = (void*)&(hD[bid * n]); + void* const src = (void*)&(D[bid * strideD]); + size_t const sizeBytes = sizeof(S) * n; + hipMemcpyKind const kind = hipMemcpyDeviceToHost; + HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); + } + } + bool const use_single_copy_for_E = (strideE == (n - 1)); + if(use_single_copy_for_E) + { + void* const dst = (void*)&(hE[0]); + void* const src = (void*)&(E[0]); + size_t const sizeBytes = sizeof(S) * (n - 1) * batch_count; + hipMemcpyKind const kind = hipMemcpyDeviceToHost; + HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); + } + else + { + for(I bid = 0; bid < batch_count; bid++) + { + void* const dst = (void*)&(hE[bid * (n - 1)]); + void* const src = (void*)&(E[bid * strideE]); + size_t const sizeBytes = sizeof(S) * (n - 1); + hipMemcpyKind const kind = hipMemcpyDeviceToHost; + HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); + } + } - for(rocblas_int bid = 0; bid < batch_count; bid++) { - std::vector hwork( 4 * n ); + for(I bid = 0; bid < batch_count; bid++) + { + std::vector hwork(4 * n); - char uplo = (uplo_in == rocblas_fill_lower) ? 'L' : 'U'; - T *d_ = &(hD[ bid * n ] ); - T *e_ = &(hE[ bid * (n-1) ]); + char uplo = (uplo_in == rocblas_fill_lower) ? 'L' : 'U'; + S* d_ = &(hD[bid * n]); + S* e_ = &(hE[bid * (n - 1)]); - T *v_ = (nv > 0) ? load_ptr_batch(V, bid, shiftV, strideV) : nullptr; - T *u_ = (nu > 0) ? load_ptr_batch(U, bid, shiftU, strideU) : nullptr; - T *c_ = (nc > 0) ? load_ptr_batch(C, bid, shiftC, strideC) : nullptr; - S * work_ = &(hwork[0]); - S * dwork = work; - I info = 0; + T* v_ = (nv > 0) ? load_ptr_batch(V, bid, shiftV, strideV) : nullptr; + T* u_ = (nu > 0) ? load_ptr_batch(U, bid, shiftU, strideU) : nullptr; + T* c_ = (nc > 0) ? 
load_ptr_batch(C, bid, shiftC, strideC) : nullptr; + S* work_ = &(hwork[0]); + S* dwork = work; + I info = 0; - I ncv = nv; - I nru = nu; - I ncc = nc; + I nru = nu; + I ncc = nc; #ifdef USE_VT - T *vt_ = v_; - I ldvt = ldv; - - I nrv = n; - I ncvt = nrv; - bdsqr_single_template( uplo, n, ncvt, nru, ncc, d_, e_, - vt_,ldvt, - u_,ldu, c_,ldc, work_, info, dwork, stream ); + // ------------------------------------------------------- + // NOTE: lapack dbdsqr() accepts "VT" and "ldvt" for transpose of V + // as input variable + // However, rocsolver bdsqrt accepts variable called "V" and "ldv" + // but is actually holding "VT" + // ------------------------------------------------------- + T* vt_ = v_; + I ldvt = ldv; + + I nrv = n; + I ncvt = nv; + bdsqr_single_template(uplo, n, ncvt, nru, ncc, d_, e_, vt_, ldvt, u_, ldu, c_, ldc, + work_, info, dwork, stream); #else + I ncv = nv; - bdsqr_single_template( uplo, n, ncv, nru, ncc, d_, e_, - v_,ldv, - u_,ldu, c_,ldc, work_, info, dwork, stream ); + bdsqr_single_template(uplo, n, ncv, nru, ncc, d_, e_, v_, ldv, u_, ldu, c_, ldc, + work_, info, dwork, stream); #endif - - - info_array[bid] = info; - } // end for bid + info_array[bid] = info; + } // end for bid + // ------------------------------------------------- + // transfer arrays D(:) and E(:) from host to device + // ------------------------------------------------- -if (use_single_copy_for_D) { - void * const src = (void *) &(hD[0]); - void * const dst = (void *) D; - size_t const sizeBytes = sizeof(S) * n * batch_count; - hipMemcpyKind const kind = hipMemcpyHostToDevice; - - HIP_CHECK( hipMemcpyAsync( dst, src, sizeBytes, kind, stream ) ); -} -else { - for(rocblas_int bid = 0; bid < batch_count; bid++) { - void * const src = (void *) &(hD[ bid * n ]); - void * const dst = (void *) D+ bid * strideD; - size_t const sizeBytes = sizeof(S) * n; - hipMemcpyKind const kind = hipMemcpyHostToDevice; - HIP_CHECK( hipMemcpyAsync( dst, src, sizeBytes, kind, stream ) ); - } -} - - -if (use_single_copy_for_E) { - void * const src = (void *) &(hE[0]); - void * const dst = (void *) E; - size_t const sizeBytes = sizeof(S) * (n-1) * batch_count; - hipMemcpyKind const kind = hipMemcpyHostToDevice; + if(use_single_copy_for_D) + { + void* const src = (void*)&(hD[0]); + void* const dst = (void*)D; + size_t const sizeBytes = sizeof(S) * n * batch_count; + hipMemcpyKind const kind = hipMemcpyHostToDevice; - HIP_CHECK( hipMemcpyAsync( dst, src, sizeBytes, kind, stream ) ); -} -else { - for(rocblas_int bid = 0; bid < batch_count; bid++) { - void * const src = (void *) &(hE[ bid * (n-1) ]); - void * const dst = (void *) E+ bid * strideE; - size_t const sizeBytes = sizeof(S) * (n-1); - hipMemcpyKind const kind = hipMemcpyHostToDevice; - HIP_CHECK( hipMemcpyAsync( dst, src, sizeBytes, kind, stream ) ); - } + HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); + } + else + { + for(I bid = 0; bid < batch_count; bid++) + { + void* const src = (void*)&(hD[bid * n]); + void* const dst = (void*)(D + bid * strideD); + size_t const sizeBytes = sizeof(S) * n; + hipMemcpyKind const kind = hipMemcpyHostToDevice; + HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); + } + } -} + if(use_single_copy_for_E) + { + void* const src = (void*)&(hE[0]); + void* const dst = (void*)E; + size_t const sizeBytes = sizeof(S) * (n - 1) * batch_count; + hipMemcpyKind const kind = hipMemcpyHostToDevice; -HIP_CHECK( hipHostFree( hD ) ); hD = nullptr; -HIP_CHECK( hipHostFree( hE ) ); hE = nullptr; + HIP_CHECK(hipMemcpyAsync(dst, 
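// For reference: v_ can be handed to the LAPACK-style routine as vt_ above because this argument
// is assumed to already store V**T, whose rows are the right singular vectors; the row updates
// applied to vt_ touch exactly the data of the corresponding columns of V, so no transpose or
// copy is needed.  Illustrative row-combining helper for a column-major matrix A with leading
// dimension ld (0-based rows i and j; not a routine from this file):
template <typename T, typename I>
static void rotate_rows(I ncols, T* A, I ld, I i, I j, T c, T s)
{
    for(I col = 0; col < ncols; col++)
    {
        T const ai = A[i + size_t(col) * ld];
        T const aj = A[j + size_t(col) * ld];
        A[i + size_t(col) * ld] = c * ai + s * aj; // row i <- c*row_i + s*row_j
        A[j + size_t(col) * ld] = c * aj - s * ai; // row j <- c*row_j - s*row_i
    }
}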
src, sizeBytes, kind, stream)); + } + else + { + for(I bid = 0; bid < batch_count; bid++) + { + void* const src = (void*)&(hE[bid * (n - 1)]); + void* const dst = (void*)(E + bid * strideE); + size_t const sizeBytes = sizeof(S) * (n - 1); + hipMemcpyKind const kind = hipMemcpyHostToDevice; + HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); + } + } -return( rocblas_status_success ); + HIP_CHECK(hipHostFree(hD)); + hD = nullptr; + HIP_CHECK(hipHostFree(hE)); + hE = nullptr; + return (rocblas_status_success); } - ROCSOLVER_END_NAMESPACE diff --git a/library/src/auxiliary/rocauxiliary_bdsqr.hpp b/library/src/auxiliary/rocauxiliary_bdsqr.hpp index 340dfa32a..deb38adb9 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr.hpp @@ -38,6 +38,8 @@ #include +#include "bdsqr_host.hpp" + ROCSOLVER_BEGIN_NAMESPACE /************** Kernels and device functions *******************/ @@ -1079,21 +1081,35 @@ rocblas_status rocsolver_bdsqr_template(rocblas_handle handle, ROCSOLVER_LAUNCH_KERNEL((bdsqr_init), grid1, threads1, 0, stream, n, D, strideD, E, strideE, info, maxiter, sfm, tol, splits_map, work, incW, strideW); + bool const use_bdsqr_host = true; + rocblas_status istat = rocblas_status_success; if(n > 1) { - // rotate to upper bidiagonal if necessary - if(uplo == rocblas_fill_lower) + if(use_bdsqr_host) { - ROCSOLVER_LAUNCH_KERNEL((bdsqr_lower2upper), grid1, threads2, 0, stream, n, nu, nc, - D, strideD, E, strideE, U, shiftU, ldu, strideU, C, shiftC, ldc, - strideC, info, work, strideW); + rocblas_fill const uplo_in = uplo; + auto const info_array = info; + istat = rocsolver_bdsqr_host_batch_template( + handle, uplo_in, n, nv, nu, nc, D, strideD, E, strideE, V, shiftV, ldv, strideV, U, + shiftU, ldu, strideU, C, shiftC, ldc, strideC, info_array, batch_count, splits_map, + work); } + else + { + // rotate to upper bidiagonal if necessary + if(uplo == rocblas_fill_lower) + { + ROCSOLVER_LAUNCH_KERNEL((bdsqr_lower2upper), grid1, threads2, 0, stream, n, nu, + nc, D, strideD, E, strideE, U, shiftU, ldu, strideU, C, + shiftC, ldc, strideC, info, work, strideW); + } - // main computation of SVD - ROCSOLVER_LAUNCH_KERNEL((bdsqr_kernel), grid2, threads3, 0, stream, n, nv, nu, nc, D, - strideD, E, strideE, V, shiftV, ldv, strideV, U, shiftU, ldu, - strideU, C, shiftC, ldc, strideC, info, maxiter, eps, sfm, tol, - minshift, splits_map, work, incW, strideW); + // main computation of SVD + ROCSOLVER_LAUNCH_KERNEL((bdsqr_kernel), grid2, threads3, 0, stream, n, nv, nu, nc, D, + strideD, E, strideE, V, shiftV, ldv, strideV, U, shiftU, ldu, + strideU, C, shiftC, ldc, strideC, info, maxiter, eps, sfm, tol, + minshift, splits_map, work, incW, strideW); + } } // sort the singular values and vectors @@ -1101,7 +1117,7 @@ rocblas_status rocsolver_bdsqr_template(rocblas_handle handle, E, strideE, V, shiftV, ldv, strideV, U, shiftU, ldu, strideU, C, shiftC, ldc, strideC, info, splits_map); - return rocblas_status_success; + return istat; } ROCSOLVER_END_NAMESPACE From 55d0861126d7bb08f122c79854905c7ce17f4e7d Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Thu, 30 May 2024 15:04:58 -0400 Subject: [PATCH 03/35] debug snapshot --- library/src/auxiliary/bdsqr_host.hpp | 75 ++++++++++++++++---- library/src/auxiliary/rocauxiliary_bdsqr.hpp | 4 +- 2 files changed, 63 insertions(+), 16 deletions(-) diff --git a/library/src/auxiliary/bdsqr_host.hpp b/library/src/auxiliary/bdsqr_host.hpp index 5d3b0f6c9..69b283317 100644 --- a/library/src/auxiliary/bdsqr_host.hpp 
+++ b/library/src/auxiliary/bdsqr_host.hpp @@ -2365,6 +2365,8 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, I* splits_map, S* work) { + int const idebug = 1; + // ------------------------- // copy D into hD, E into hE // ------------------------- @@ -2375,11 +2377,26 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, S* hD = nullptr; S* hE = nullptr; HIP_CHECK(hipHostMalloc(&hD, sizeof(S) * n * batch_count)); - // ------------------------------------------------------------------ // Need to double checking whether array E is length n or length (n-1) // ------------------------------------------------------------------ - HIP_CHECK(hipHostMalloc(&hE, sizeof(S) * (n - 1) * batch_count)); + bool const use_single_copy_for_E = (strideE == (n - 1)) || (strideE == (n)); + size_t E_size = use_single_copy_for_E ? strideE : (n - 1); + HIP_CHECK(hipHostMalloc(&hE, sizeof(S) * E_size * batch_count)); + + // ---------------------------------------------------- + // copy info_array[] on device to linfo_array[] on host + // ---------------------------------------------------- + I* linfo_array = nullptr; + HIP_CHECK(hipHostMalloc(&linfo_array, sizeof(I) * batch_count)); + { + void* dst = &(linfo_array[0]); + void* src = info_array; + size_t nbytes = sizeof(I) * batch_count; + hipMemcpyKind const kind = hipMemcpyDeviceToHost; + + HIP_CHECK(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } // ------------------------------------------------- // transfer arrays D(:) and E(:) from Device to Host @@ -2407,12 +2424,11 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, } } - bool const use_single_copy_for_E = (strideE == (n - 1)); if(use_single_copy_for_E) { void* const dst = (void*)&(hE[0]); void* const src = (void*)&(E[0]); - size_t const sizeBytes = sizeof(S) * (n - 1) * batch_count; + size_t const sizeBytes = sizeof(S) * E_size * batch_count; hipMemcpyKind const kind = hipMemcpyDeviceToHost; HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); @@ -2421,21 +2437,27 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, { for(I bid = 0; bid < batch_count; bid++) { - void* const dst = (void*)&(hE[bid * (n - 1)]); + void* const dst = (void*)&(hE[bid * E_size]); void* const src = (void*)&(E[bid * strideE]); - size_t const sizeBytes = sizeof(S) * (n - 1); + size_t const sizeBytes = sizeof(S) * E_size; hipMemcpyKind const kind = hipMemcpyDeviceToHost; HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); } } + if(idebug >= 1) + { + printf("batch_count = %d\n", batch_count); + printf("n = %d, strideD = %ld, strideE = %ld\n", n, (int64_t)strideD, (int64_t)strideE); + } + for(I bid = 0; bid < batch_count; bid++) { std::vector hwork(4 * n); char uplo = (uplo_in == rocblas_fill_lower) ? 'L' : 'U'; S* d_ = &(hD[bid * n]); - S* e_ = &(hE[bid * (n - 1)]); + S* e_ = &(hE[bid * E_size]); T* v_ = (nv > 0) ? load_ptr_batch(V, bid, shiftV, strideV) : nullptr; T* u_ = (nu > 0) ? 
load_ptr_batch(U, bid, shiftU, strideU) : nullptr; @@ -2451,8 +2473,8 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, // ------------------------------------------------------- // NOTE: lapack dbdsqr() accepts "VT" and "ldvt" for transpose of V // as input variable - // However, rocsolver bdsqrt accepts variable called "V" and "ldv" - // but is actually holding "VT" + // However, rocsolver bdsqr() accepts variable called "V" and "ldv" + // but may be actually holding "VT" // ------------------------------------------------------- T* vt_ = v_; I ldvt = ldv; @@ -2469,7 +2491,10 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, #endif - info_array[bid] = info; + if(linfo_array[bid] == 0) + { + linfo_array[bid] = info; + } } // end for bid // ------------------------------------------------- @@ -2501,7 +2526,7 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, { void* const src = (void*)&(hE[0]); void* const dst = (void*)E; - size_t const sizeBytes = sizeof(S) * (n - 1) * batch_count; + size_t const sizeBytes = sizeof(S) * E_size * batch_count; hipMemcpyKind const kind = hipMemcpyHostToDevice; HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); @@ -2510,9 +2535,9 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, { for(I bid = 0; bid < batch_count; bid++) { - void* const src = (void*)&(hE[bid * (n - 1)]); + void* const src = (void*)&(hE[bid * E_size]); void* const dst = (void*)(E + bid * strideE); - size_t const sizeBytes = sizeof(S) * (n - 1); + size_t const sizeBytes = sizeof(S) * E_size; hipMemcpyKind const kind = hipMemcpyHostToDevice; HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); } @@ -2523,6 +2548,30 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, HIP_CHECK(hipHostFree(hE)); hE = nullptr; + if(idebug >= 1) + { + for(auto bid = 0; bid < batch_count; bid++) + { + printf("linfo_array[%d] = %d\n", bid, linfo_array[bid]); + } + } + + { + // ------------------------------------------------------ + // copy linfo_array[] from host to info_array[] on device + // ------------------------------------------------------ + + void* dst = (void*)info_array; + void* src = (void*)&(linfo_array[0]); + size_t nbytes = sizeof(I) * batch_count; + hipMemcpyKind const kind = hipMemcpyHostToDevice; + + HIP_CHECK(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + HIP_CHECK(hipHostFree(linfo_array)); + linfo_array = nullptr; + return (rocblas_status_success); } diff --git a/library/src/auxiliary/rocauxiliary_bdsqr.hpp b/library/src/auxiliary/rocauxiliary_bdsqr.hpp index deb38adb9..5718f50e0 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr.hpp @@ -1088,11 +1088,9 @@ rocblas_status rocsolver_bdsqr_template(rocblas_handle handle, if(use_bdsqr_host) { rocblas_fill const uplo_in = uplo; - auto const info_array = info; istat = rocsolver_bdsqr_host_batch_template( handle, uplo_in, n, nv, nu, nc, D, strideD, E, strideE, V, shiftV, ldv, strideV, U, - shiftU, ldu, strideU, C, shiftC, ldc, strideC, info_array, batch_count, splits_map, - work); + shiftU, ldu, strideU, C, shiftC, ldc, strideC, info, batch_count, splits_map, work); } else { From db55c9874a565b62527a647e86963c41522d6aaa Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Tue, 4 Jun 2024 08:39:52 -0400 Subject: [PATCH 04/35] debug snapshot --- library/src/auxiliary/bdsqr_host.hpp | 82 +++++++++++++++++++++++++--- 1 file changed, 74 
insertions(+), 8 deletions(-) diff --git a/library/src/auxiliary/bdsqr_host.hpp b/library/src/auxiliary/bdsqr_host.hpp index 69b283317..7bd8cd717 100644 --- a/library/src/auxiliary/bdsqr_host.hpp +++ b/library/src/auxiliary/bdsqr_host.hpp @@ -1,4 +1,4 @@ -#define USE_VT +#undef USE_VT /**************************************************************************** * Derived from the BSD3-licensed @@ -736,6 +736,22 @@ void slasv2_(float* f, float* snl, float* csl); +void crot_(int* n, + std::complex* cx, + int* incx, + std::complex* cy, + int* incy, + float* c, + std::complex* s); + +void zrot_(int* n, + std::complex* cx, + int* incx, + std::complex* cy, + int* incy, + double* c, + std::complex* s); + void zdrot_(int* n, std::complex* zx, int* incx, @@ -1181,6 +1197,7 @@ static void bdsqr_single_template(char uplo, S* const ds = dwork + mn_m1; HIP_CHECK(hipMemcpyAsync(dc, &c, sizeof(S) * mn_m1, hipMemcpyHostToDevice, stream)); HIP_CHECK(hipMemcpyAsync(ds, &s, sizeof(S) * mn_m1, hipMemcpyHostToDevice, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); lasr_template_gpu(side, pivot, direct, m, n, dc, ds, &A, lda, stream); HIP_CHECK(hipStreamSynchronize(stream)); @@ -1253,6 +1270,15 @@ static void bdsqr_single_template(char uplo, * test the input parameters. * */ + int const idebug = 2; + if(idebug >= 2) + { + printf("single entry\n"); + for(auto i = 1; i <= (n - 1); i++) + { + printf("e(%d) = %le\n", i, e(i)); + } + } info = (!upper) && (!lower) ? -1 : (n < 0) ? -2 @@ -1528,11 +1554,11 @@ static void bdsqr_single_template(char uplo, { if(use_gpu) { - call_rot_gpu(nrv, v(1, m - 1), 1, v(1, m), 1, cosr, sinr); + call_rot_gpu(nrv, v(1, m - 1), ione, v(1, m), ione, cosr, sinr); } else { - call_rot(nrv, v(1, m - 1), 1, v(1, m), 1, cosr, sinr); + call_rot(nrv, v(1, m - 1), ione, v(1, m), ione, cosr, sinr); } } #endif @@ -2221,11 +2247,11 @@ static void bdsqr_single_template(char uplo, { if(use_gpu) { - call_scal_gpu(nrv, negone, v(1, i), 1); + call_scal_gpu(nrv, negone, v(1, i), ione); } else { - call_scal(nrv, negone, v(1, i), 1); + call_scal(nrv, negone, v(1, i), ione); } } #endif @@ -2331,6 +2357,14 @@ static void bdsqr_single_template(char uplo, } L210: L220: + if(idebug >= 2) + { + printf("single exit\n"); + for(auto i = 1; i <= (n - 1); i++) + { + printf("e(%d) = %le\n", i, e(i)); + } + } return; /* * end of dbdsqr @@ -2365,7 +2399,7 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, I* splits_map, S* work) { - int const idebug = 1; + int const idebug = 2; // ------------------------- // copy D into hD, E into hE @@ -2380,7 +2414,9 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, // ------------------------------------------------------------------ // Need to double checking whether array E is length n or length (n-1) // ------------------------------------------------------------------ - bool const use_single_copy_for_E = (strideE == (n - 1)) || (strideE == (n)); + // bool const use_single_copy_for_E = (strideE == (n - 1)) || (strideE == (n)); + bool const use_single_copy_for_E = false; + size_t E_size = use_single_copy_for_E ? 
strideE : (n - 1); HIP_CHECK(hipHostMalloc(&hE, sizeof(S) * E_size * batch_count)); @@ -2402,7 +2438,8 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, // transfer arrays D(:) and E(:) from Device to Host // ------------------------------------------------- - bool const use_single_copy_for_D = (strideD == n); + // bool const use_single_copy_for_D = (strideD == n); + bool const use_single_copy_for_D = false; if(use_single_copy_for_D) { void* const dst = (void*)&(hD[0]); @@ -2445,10 +2482,25 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, } } + HIP_CHECK(hipStreamSynchronize(stream)); + if(idebug >= 1) { printf("batch_count = %d\n", batch_count); printf("n = %d, strideD = %ld, strideE = %ld\n", n, (int64_t)strideD, (int64_t)strideE); + printf("nv = %d, nu = %d, nc = %d\n", nv, nu, nc); + } + if(idebug >= 2) + { + printf("on entry\n"); + for(auto i = 0; i < n; i++) + { + printf("hD[%d] = %le\n", i, hD[i]); + } + for(auto i = 0; i < (n - 1); i++) + { + printf("hE[%d] = %le\n", i, hE[i]); + } } for(I bid = 0; bid < batch_count; bid++) @@ -2543,6 +2595,19 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, } } + if(idebug >= 2) + { + printf("on exit\n"); + for(auto i = 0; i < n; i++) + { + printf("hD[%d] = %le\n", i, hD[i]); + } + for(auto i = 0; i < (n - 1); i++) + { + printf("hE[%d] = %le\n", i, hE[i]); + } + } + HIP_CHECK(hipHostFree(hD)); hD = nullptr; HIP_CHECK(hipHostFree(hE)); @@ -2567,6 +2632,7 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, hipMemcpyKind const kind = hipMemcpyHostToDevice; HIP_CHECK(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); } HIP_CHECK(hipHostFree(linfo_array)); From 972ce8cf7e62f9d3a2acc8f361e5aa84d03012db Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Sun, 16 Jun 2024 14:40:06 -0400 Subject: [PATCH 05/35] debug snapshot --- library/src/auxiliary/bdsqr_host.hpp | 61 ++++++++++++++++++---------- 1 file changed, 39 insertions(+), 22 deletions(-) diff --git a/library/src/auxiliary/bdsqr_host.hpp b/library/src/auxiliary/bdsqr_host.hpp index 7bd8cd717..869114641 100644 --- a/library/src/auxiliary/bdsqr_host.hpp +++ b/library/src/auxiliary/bdsqr_host.hpp @@ -1,4 +1,4 @@ -#undef USE_VT +#define USE_VT /**************************************************************************** * Derived from the BSD3-licensed @@ -68,6 +68,7 @@ __global__ static void lasr_kernel(char const side, const auto nthreads_per_block = hipBlockDim_x; const auto nthreads = nblocks * nthreads_per_block; const auto tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + const auto i_start = tid; const auto i_inc = nthreads; const auto ij_nb = nthreads; const auto ij_start = tid; @@ -78,6 +79,7 @@ __global__ static void lasr_kernel(char const side, auto indx2f = [](auto i, auto j, auto lda) -> int64_t { assert((1 <= i)); assert((1 <= lda)); + assert(i <= lda); assert((1 <= j)); return ((i - 1) + (j - 1) * int64_t(lda)); @@ -150,7 +152,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j); if((ctemp != one) || (stemp != zero)) { - for(I i = 1 + tid; i <= n; i += i_inc) + for(I i = 1 + i_start; i <= n; i += i_inc) { const auto temp = A(j + 1, i); A(j + 1, i) = ctemp * temp - stemp * A(j, i); @@ -182,7 +184,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j); if((ctemp != one) || (stemp != zero)) { - for(I i = istart + tid; i <= iend; i += i_inc) + for(I i = istart + i_start; i <= iend; 
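// For reference: the loops being adjusted in these hunks are grid-stride loops -- each thread
// starts at its global index and advances by the total number of launched threads, so the kernel
// is correct for any grid/block configuration.  A minimal stand-alone HIP example of the same
// pattern (illustrative only, not part of this file):
template <typename T, typename I>
__global__ static void scale_kernel(I n, T alpha, T* x)
{
    I const tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
    I const nthreads = hipGridDim_x * hipBlockDim_x;
    for(I i = tid; i < n; i += nthreads)
        x[i] *= alpha;
}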
i += i_inc) { const auto temp = A(j + 1, i); A(j + 1, i) = ctemp * temp - stemp * A(j, i); @@ -206,7 +208,7 @@ __global__ static void lasr_kernel(char const side, { const auto ctemp = c(j - 1); const auto stemp = s(j - 1); - for(I i = 1 + tid; i <= n; i += i_inc) + for(I i = 1 + i_start; i <= n; i += i_inc) { const auto temp = A(j, i); A(j, i) = ctemp * temp - stemp * A(1, i); @@ -237,7 +239,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j - 1); if((ctemp != one) || (stemp != zero)) { - for(I i = istart + tid; i <= iend; i += i_inc) + for(I i = istart + i_start; i <= iend; i += i_inc) { const auto temp = A(j, i); @@ -270,7 +272,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j); if((ctemp != one) || (stemp != zero)) { - for(I i = istart + tid; i <= iend; i += i_inc) + for(I i = istart + i_start; i <= iend; i += i_inc) { const auto temp = A(j, i); A(j, i) = stemp * A(m, i) + ctemp * temp; @@ -302,7 +304,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j); if((ctemp != one) || (stemp != zero)) { - for(I i = istart + tid; i <= iend; i += i_inc) + for(I i = istart + i_start; i <= iend; i += i_inc) { const auto temp = A(j, i); A(j, i) = stemp * A(m, i) + ctemp * temp; @@ -335,7 +337,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j); if((ctemp != one) || (stemp != zero)) { - for(I i = istart + tid; i <= iend; i += i_inc) + for(I i = istart + i_start; i <= iend; i += i_inc) { const auto temp = A(i, j + 1); A(i, j + 1) = ctemp * temp - stemp * A(i, j); @@ -368,7 +370,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j); if((ctemp != one) || (stemp != zero)) { - for(I i = istart + tid; i <= iend; i += i_inc) + for(I i = istart + i_start; i <= iend; i += i_inc) { const auto temp = A(i, j + 1); A(i, j + 1) = ctemp * temp - stemp * A(i, j); @@ -400,7 +402,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j - 1); if((ctemp != one) || (stemp != zero)) { - for(I i = istart + tid; i <= iend; i += i_inc) + for(I i = istart + i_start; i <= iend; i += i_inc) { const auto temp = A(i, j); @@ -434,7 +436,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j - 1); if((ctemp != one) || (stemp != zero)) { - for(I i = istart + tid; i <= iend; i += i_inc) + for(I i = istart + i_start; i <= iend; i += i_inc) { const auto temp = A(i, j); @@ -468,7 +470,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j); if((ctemp != one) || (stemp != zero)) { - for(I i = istart + tid; i <= iend; i += i_inc) + for(I i = istart + i_start; i <= iend; i += i_inc) { const auto temp = A(i, j); @@ -502,7 +504,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j); if((ctemp != one) || (stemp != zero)) { - for(I i = istart + tid; i <= iend; i += i_inc) + for(I i = istart + i_start; i <= iend; i += i_inc) { const auto temp = A(i, j); A(i, j) = stemp * A(i, n) + ctemp * temp; @@ -1197,10 +1199,10 @@ static void bdsqr_single_template(char uplo, S* const ds = dwork + mn_m1; HIP_CHECK(hipMemcpyAsync(dc, &c, sizeof(S) * mn_m1, hipMemcpyHostToDevice, stream)); HIP_CHECK(hipMemcpyAsync(ds, &s, sizeof(S) * mn_m1, hipMemcpyHostToDevice, stream)); - HIP_CHECK(hipStreamSynchronize(stream)); lasr_template_gpu(side, pivot, direct, m, n, dc, ds, &A, lda, stream); - HIP_CHECK(hipStreamSynchronize(stream)); + // HIP_CHECK(hipStreamSynchronize(stream)); + HIP_CHECK(hipDeviceSynchronize()); }; auto abs = 
[](auto x) { return ((x >= 0) ? x : (-x)); }; @@ -1278,6 +1280,7 @@ static void bdsqr_single_template(char uplo, { printf("e(%d) = %le\n", i, e(i)); } + fflush(stdout); } info = (!upper) && (!lower) ? -1 @@ -2121,14 +2124,18 @@ static void bdsqr_single_template(char uplo, } L150: e(ll) = f; - /* - * test convergence - */ + // ---------------- + // test convergence + // ---------------- if(abs(e(ll)) <= thresh) e(ll) = zero; - /* - * update singular vectors if desired - */ + + + + // ---------------------------------- + // update singular vectors if desired + // ---------------------------------- + #ifdef USE_VT if(ncvt > 0) { @@ -2209,6 +2216,7 @@ static void bdsqr_single_template(char uplo, call_lasr(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), ldc); } } + } } /* @@ -2364,6 +2372,7 @@ static void bdsqr_single_template(char uplo, { printf("e(%d) = %le\n", i, e(i)); } + fflush(stdout); } return; /* @@ -2503,6 +2512,9 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, } } + S * dwork = nullptr; + HIP_CHECK( hipMalloc( &dwork, sizeof(S)*(4*n)) ); + for(I bid = 0; bid < batch_count; bid++) { std::vector hwork(4 * n); @@ -2515,7 +2527,9 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, T* u_ = (nu > 0) ? load_ptr_batch(U, bid, shiftU, strideU) : nullptr; T* c_ = (nc > 0) ? load_ptr_batch(C, bid, shiftC, strideC) : nullptr; S* work_ = &(hwork[0]); - S* dwork = work; + // S* dwork = &(work[bid * (4 * n)]); + + I info = 0; I nru = nu; @@ -2613,6 +2627,9 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, HIP_CHECK(hipHostFree(hE)); hE = nullptr; + HIP_CHECK( hipFree( dwork )); + dwork = nullptr; + if(idebug >= 1) { for(auto bid = 0; bid < batch_count; bid++) From 62d6b5abdca0da6f3a60a31c8b73fc54e8ed1930 Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Sun, 16 Jun 2024 21:03:32 -0400 Subject: [PATCH 06/35] debug snapshot --- library/src/auxiliary/bdsqr_host.hpp | 608 ++++++++++++++------------- 1 file changed, 306 insertions(+), 302 deletions(-) diff --git a/library/src/auxiliary/bdsqr_host.hpp b/library/src/auxiliary/bdsqr_host.hpp index 869114641..c368c271d 100644 --- a/library/src/auxiliary/bdsqr_host.hpp +++ b/library/src/auxiliary/bdsqr_host.hpp @@ -1,4 +1,3 @@ -#define USE_VT /**************************************************************************** * Derived from the BSD3-licensed @@ -45,8 +44,8 @@ ROCSOLVER_BEGIN_NAMESPACE -#ifndef HIP_CHECK -#define HIP_CHECK(fcn) \ +#ifndef CHECK_HIP +#define CHECK_HIP(fcn) \ { \ hipError_t const istat = (fcn); \ assert(istat == hipSuccess); \ @@ -68,7 +67,6 @@ __global__ static void lasr_kernel(char const side, const auto nthreads_per_block = hipBlockDim_x; const auto nthreads = nblocks * nthreads_per_block; const auto tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; - const auto i_start = tid; const auto i_inc = nthreads; const auto ij_nb = nthreads; const auto ij_start = tid; @@ -79,7 +77,6 @@ __global__ static void lasr_kernel(char const side, auto indx2f = [](auto i, auto j, auto lda) -> int64_t { assert((1 <= i)); assert((1 <= lda)); - assert(i <= lda); assert((1 <= j)); return ((i - 1) + (j - 1) * int64_t(lda)); @@ -152,7 +149,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j); if((ctemp != one) || (stemp != zero)) { - for(I i = 1 + i_start; i <= n; i += i_inc) + for(I i = 1 + tid; i <= n; i += i_inc) { const auto temp = A(j + 1, i); A(j + 1, i) = ctemp * temp - stemp * A(j, i); @@ -184,7 +181,7 @@ __global__ static 
void lasr_kernel(char const side, const auto stemp = s(j); if((ctemp != one) || (stemp != zero)) { - for(I i = istart + i_start; i <= iend; i += i_inc) + for(I i = istart + tid; i <= iend; i += i_inc) { const auto temp = A(j + 1, i); A(j + 1, i) = ctemp * temp - stemp * A(j, i); @@ -208,7 +205,7 @@ __global__ static void lasr_kernel(char const side, { const auto ctemp = c(j - 1); const auto stemp = s(j - 1); - for(I i = 1 + i_start; i <= n; i += i_inc) + for(I i = 1 + tid; i <= n; i += i_inc) { const auto temp = A(j, i); A(j, i) = ctemp * temp - stemp * A(1, i); @@ -239,7 +236,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j - 1); if((ctemp != one) || (stemp != zero)) { - for(I i = istart + i_start; i <= iend; i += i_inc) + for(I i = istart + tid; i <= iend; i += i_inc) { const auto temp = A(j, i); @@ -272,7 +269,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j); if((ctemp != one) || (stemp != zero)) { - for(I i = istart + i_start; i <= iend; i += i_inc) + for(I i = istart + tid; i <= iend; i += i_inc) { const auto temp = A(j, i); A(j, i) = stemp * A(m, i) + ctemp * temp; @@ -304,7 +301,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j); if((ctemp != one) || (stemp != zero)) { - for(I i = istart + i_start; i <= iend; i += i_inc) + for(I i = istart + tid; i <= iend; i += i_inc) { const auto temp = A(j, i); A(j, i) = stemp * A(m, i) + ctemp * temp; @@ -337,7 +334,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j); if((ctemp != one) || (stemp != zero)) { - for(I i = istart + i_start; i <= iend; i += i_inc) + for(I i = istart + tid; i <= iend; i += i_inc) { const auto temp = A(i, j + 1); A(i, j + 1) = ctemp * temp - stemp * A(i, j); @@ -370,7 +367,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j); if((ctemp != one) || (stemp != zero)) { - for(I i = istart + i_start; i <= iend; i += i_inc) + for(I i = istart + tid; i <= iend; i += i_inc) { const auto temp = A(i, j + 1); A(i, j + 1) = ctemp * temp - stemp * A(i, j); @@ -402,7 +399,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j - 1); if((ctemp != one) || (stemp != zero)) { - for(I i = istart + i_start; i <= iend; i += i_inc) + for(I i = istart + tid; i <= iend; i += i_inc) { const auto temp = A(i, j); @@ -436,7 +433,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j - 1); if((ctemp != one) || (stemp != zero)) { - for(I i = istart + i_start; i <= iend; i += i_inc) + for(I i = istart + tid; i <= iend; i += i_inc) { const auto temp = A(i, j); @@ -470,7 +467,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j); if((ctemp != one) || (stemp != zero)) { - for(I i = istart + i_start; i <= iend; i += i_inc) + for(I i = istart + tid; i <= iend; i += i_inc) { const auto temp = A(i, j); @@ -504,7 +501,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j); if((ctemp != one) || (stemp != zero)) { - for(I i = istart + i_start; i <= iend; i += i_inc) + for(I i = istart + tid; i <= iend; i += i_inc) { const auto temp = A(i, j); A(i, j) = stemp * A(i, n) + ctemp * temp; @@ -519,6 +516,7 @@ __global__ static void lasr_kernel(char const side, return; } + template static void lasr_template_gpu(char const side, char const pivot, @@ -738,22 +736,6 @@ void slasv2_(float* f, float* snl, float* csl); -void crot_(int* n, - std::complex* cx, - int* incx, - std::complex* cy, - int* incy, - float* c, - 
std::complex* s); - -void zrot_(int* n, - std::complex* cx, - int* incx, - std::complex* cy, - int* incy, - double* c, - std::complex* s); - void zdrot_(int* n, std::complex* zx, int* incx, @@ -783,6 +765,7 @@ void sscal_(int* n, float* da, float* zx, int* incx); void dlartg_(double* f, double* g, double* c, double* s, double* r); void slartg_(float* f, float* g, float* c, float* s, float* r); + void zlartg_(std::complex* f, std::complex* g, double* c, @@ -798,6 +781,191 @@ void dlas2_(double* f, double* g, double* h, double* ssmin, double* ssmax); void slas2_(float* f, float* g, float* h, float* ssmin, float* ssmax); }; +extern "C" { + +void cbdsqr_(char* uplo, + int* n, + int* ncvt, + int* nru, + int* ncc, + float* d, + float* e, + std::complex* vt, + int* ldvt, + std::complex* u, + int* ldu, + std::complex* c, + int* ldc, + float* rwork, + int* info); + +void zbdsqr_(char* uplo, + int* n, + int* ncvt, + int* nru, + int* ncc, + double* d, + double* e, + std::complex* vt, + int* ldvt, + std::complex* u, + int* ldu, + std::complex* c, + int* ldc, + double* rwork, + int* info); + +void sbdsqr_(char* uplo, + int* n, + int* ncvt, + int* nru, + int* ncc, + float* d, + float* e, + float* vt, + int* ldvt, + float* u, + int* ldu, + float* c, + int* ldc, + float* rwork, + int* info); + +void dbdsqr_(char* uplo, + int* n, + int* ncvt, + int* nru, + int* ncc, + double* d, + double* e, + double* vt, + int* ldvt, + double* u, + int* ldu, + double* c, + int* ldc, + double* rwork, + int* info); +}; + +static void call_bdsqr(char& uplo, + int& n, + int& ncvt, + int& nru, + int& ncc, + double& d, + double& e, + std::complex& vt, + int& ldvt, + std::complex& u, + int& ldu, + std::complex& c, + int& ldc, + double& rwork, + int& info) +{ + zbdsqr_(&uplo, &n, &ncvt, &nru, &ncc, &d, &e, (std::complex*)&vt, &ldvt, + (std::complex*)&u, &ldu, (std::complex*)&c, &ldc, &rwork, &info); +} + +static void call_bdsqr(char& uplo, + int& n, + int& ncvt, + int& nru, + int& ncc, + double& d, + double& e, + rocblas_complex_num& vt, + int& ldvt, + rocblas_complex_num& u, + int& ldu, + rocblas_complex_num& c, + int& ldc, + double& rwork, + int& info) +{ + zbdsqr_(&uplo, &n, &ncvt, &nru, &ncc, &d, &e, (std::complex*)&vt, &ldvt, + (std::complex*)&u, &ldu, (std::complex*)&c, &ldc, &rwork, &info); +} + +static void call_bdsqr(char& uplo, + int& n, + int& ncvt, + int& nru, + int& ncc, + float& d, + float& e, + std::complex& vt, + int& ldvt, + std::complex& u, + int& ldu, + std::complex& c, + int& ldc, + float& rwork, + int& info) +{ + cbdsqr_(&uplo, &n, &ncvt, &nru, &ncc, &d, &e, (std::complex*)&vt, &ldvt, + (std::complex*)&u, &ldu, (std::complex*)&c, &ldc, &rwork, &info); +} + +static void call_bdsqr(char& uplo, + int& n, + int& ncvt, + int& nru, + int& ncc, + float& d, + float& e, + float& vt, + int& ldvt, + float& u, + int& ldu, + float& c, + int& ldc, + float& rwork, + int& info) +{ + sbdsqr_(&uplo, &n, &ncvt, &nru, &ncc, &d, &e, &vt, &ldvt, &u, &ldu, &c, &ldc, &rwork, &info); +} + +static void call_bdsqr(char& uplo, + int& n, + int& ncvt, + int& nru, + int& ncc, + float& d, + float& e, + rocblas_complex_num& vt, + int& ldvt, + rocblas_complex_num& u, + int& ldu, + rocblas_complex_num& c, + int& ldc, + float& rwork, + int& info) +{ + cbdsqr_(&uplo, &n, &ncvt, &nru, &ncc, &d, &e, (std::complex*)&vt, &ldvt, + (std::complex*)&u, &ldu, (std::complex*)&c, &ldc, &rwork, &info); +} + +static void call_bdsqr(char& uplo, + int& n, + int& ncvt, + int& nru, + int& ncc, + double& d, + double& e, + double& vt, + int& ldvt, + double& u, + 
int& ldu, + double& c, + int& ldc, + double& rwork, + int& info) +{ + dbdsqr_(&uplo, &n, &ncvt, &nru, &ncc, &d, &e, &vt, &ldvt, &u, &ldu, &c, &ldc, &rwork, &info); +} + static void call_lamch(char& cmach_arg, double& eps) { char cmach = cmach_arg; @@ -1101,33 +1269,26 @@ static void call_lasr(char& side, template static void bdsqr_single_template(char uplo, I n, -#ifdef USE_VT I ncvt, -#else - I ncv, -#endif I nru, I ncc, + S* d_, S* e_, -#ifdef USE_VT + T* vt_, I ldvt, -#else - T* v_, - I ldv, -#endif T* u_, I ldu, T* c_, - int ldc, + I ldc, + S* work_, I& info, S* dwork = nullptr, hipStream_t stream = 0) { bool const use_gpu = (dwork != nullptr); - bool constexpr need_sort = false; S const zero = 0; S const one = 1; @@ -1140,18 +1301,12 @@ static void bdsqr_single_template(char uplo, I const maxitr = 6; I ione = 1; - I nrv = n; - bool const lower = (uplo == 'L') || (uplo == 'l'); bool const upper = (uplo == 'U') || (uplo == 'u'); /* * rotate is true if any singular vectors desired, false otherwise */ -#ifdef USE_VT bool const rotate = (ncvt > 0) || (nru > 0) || (ncc > 0); -#else - bool const rotate = (nrv > 0) || (nru > 0) || (ncc > 0); -#endif I i = 0, idir = 0, isub = 0, iter = 0, iterdivn = 0, j = 0, ll = 0, lll = 0, m = 0, maxitdivn = 0, nm1 = 0, nm12 = 0, nm13 = 0, oldll = 0, oldm = 0; @@ -1197,12 +1352,11 @@ static void bdsqr_single_template(char uplo, auto const mn_m1 = (mn - 1); S* const dc = dwork; S* const ds = dwork + mn_m1; - HIP_CHECK(hipMemcpyAsync(dc, &c, sizeof(S) * mn_m1, hipMemcpyHostToDevice, stream)); - HIP_CHECK(hipMemcpyAsync(ds, &s, sizeof(S) * mn_m1, hipMemcpyHostToDevice, stream)); + CHECK_HIP(hipMemcpyAsync(dc, &c, sizeof(S) * mn_m1, hipMemcpyHostToDevice, stream)); + CHECK_HIP(hipMemcpyAsync(ds, &s, sizeof(S) * mn_m1, hipMemcpyHostToDevice, stream)); lasr_template_gpu(side, pivot, direct, m, n, dc, ds, &A, lda, stream); - // HIP_CHECK(hipStreamSynchronize(stream)); - HIP_CHECK(hipDeviceSynchronize()); + CHECK_HIP(hipStreamSynchronize(stream)); }; auto abs = [](auto x) { return ((x >= 0) ? x : (-x)); }; @@ -1236,19 +1390,11 @@ static void bdsqr_single_template(char uplo, return (u_[indx2f(i, j, ldu)]); }; -#ifdef USE_VT auto vt = [=](auto i, auto j) -> T& { assert((1 <= i) && (i <= nrvt) && (nrvt <= ldvt)); assert((1 <= j) && (j <= ncvt)); return (vt_[indx2f(i, j, ldvt)]); }; -#else - auto v = [=](auto i, auto j) -> T& { - assert((1 <= i) && (i <= nrv) && (nrv <= ldv)); - assert((1 <= j) && (j <= ncv)); - return (v_[indx2f(i, j, ldv)]); - }; -#endif // --------------------------- // emulate Fortran intrinsics @@ -1272,28 +1418,16 @@ static void bdsqr_single_template(char uplo, * test the input parameters. * */ - int const idebug = 2; - if(idebug >= 2) - { - printf("single entry\n"); - for(auto i = 1; i <= (n - 1); i++) - { - printf("e(%d) = %le\n", i, e(i)); - } - fflush(stdout); - } info = (!upper) && (!lower) ? -1 : (n < 0) ? -2 -#ifdef USE_VT - : (ncvt < 0) ? -3 -#else - : (nrv < 0) ? -3 -#endif - : (nru < 0) ? -4 - : (ncc < 0) ? -5 - : (ldu < max(1, nru)) ? -11 - : 0; + : (ncvt < 0) ? -3 + : (nru < 0) ? -4 + : (ncc < 0) ? -5 + : ((ncvt == 0) && (ldvt < 1)) || ((ncvt > 0) && (ldvt < max(1, n)) ? -9 : (ldu < max(1, nru))) + ? -11 + : ((ncc == 0) && (ldc < 1)) || ((ncc > 0) && (ldc < max(1, n))) ? 
-13 + : 0; if(info != 0) return; @@ -1540,7 +1674,6 @@ static void bdsqr_single_template(char uplo, /* * compute singular vectors, if desired */ -#ifdef USE_VT if(ncvt > 0) { if(use_gpu) @@ -1552,19 +1685,6 @@ static void bdsqr_single_template(char uplo, call_rot(ncvt, vt(m - 1, 1), ldvt, vt(m, 1), ldvt, cosr, sinr); } } -#else - if(nrv > 0) - { - if(use_gpu) - { - call_rot_gpu(nrv, v(1, m - 1), ione, v(1, m), ione, cosr, sinr); - } - else - { - call_rot(nrv, v(1, m - 1), ione, v(1, m), ione, cosr, sinr); - } - } -#endif if(nru > 0) { if(use_gpu) @@ -1758,7 +1878,6 @@ static void bdsqr_single_template(char uplo, /* * update singular vectors */ -#ifdef USE_VT if(ncvt > 0) { // call_lasr( 'l', 'v', 'f', m-ll+1, ncvt, work( 1 ), work( n ), vt( @@ -1777,27 +1896,6 @@ static void bdsqr_single_template(char uplo, call_lasr(side, pivot, direct, mm, ncvt, work(1), work(n), vt(ll, 1), ldvt); } } -#else - - if(nrv > 0) - { - // call_lasr( 'r', 'v', 'f', nrv, m-ll+1, work( 1 ), work( n ), - // v(1,ll ), ldv) - char side = 'R'; - char pivot = 'V'; - char direct = 'F'; - auto mm = m - ll + 1; - if(use_gpu) - { - call_lasr_gpu(side, pivot, direct, nrv, mm, work(1), work(n), v(1, ll), ldv, - dwork, stream); - } - else - { - call_lasr(side, pivot, direct, nrv, mm, work(1), work(n), v(1, ll), ldv); - } - } -#endif if(nru > 0) { // call_lasr( 'r', 'v', 'f', nru, m-ll+1, work( nm12+1 ), work( nm13+1 @@ -1875,7 +1973,6 @@ static void bdsqr_single_template(char uplo, /* * update singular vectors */ -#ifdef USE_VT if(ncvt > 0) { // call_lasr( 'l', 'v', 'b', m-ll+1, ncvt, work( nm12+1 ), work( @@ -1896,29 +1993,6 @@ static void bdsqr_single_template(char uplo, vt(ll, 1), ldvt); } } -#else - - if(nrv > 0) - { - // call_lasr( 'r', 'v', 'b', nrv, m-ll+1, work( nm12+1 ), work( nm13+1 ), v( 1,ll ), ldv ); - char side = 'R'; - char pivot = 'V'; - char direct = 'B'; - auto mm = m - ll + 1; - if(use_gpu) - { - call_lasr_gpu(side, pivot, direct, nrv, mm, work(nm12 + 1), work(nm13 + 1), - v(1, ll), ldv, dwork, stream); - } - else - { - call_lasr(side, pivot, direct, nrv, mm, work(nm12 + 1), work(nm13 + 1), - v(1, ll), ldv); - } - } - -#endif - if(nru > 0) { // call_lasr( 'r', 'v', 'b', nru, m-ll+1, work( 1 ), work( n ), u( 1, @@ -2006,7 +2080,6 @@ static void bdsqr_single_template(char uplo, /* * update singular vectors */ -#ifdef USE_VT if(ncvt > 0) { // call_lasr( 'l', 'v', 'f', m-ll+1, ncvt, work( 1 ), work( n ), vt( @@ -2025,26 +2098,6 @@ static void bdsqr_single_template(char uplo, call_lasr(side, pivot, direct, mm, ncvt, work(1), work(n), vt(ll, 1), ldvt); } } -#else - - if(nrv > 0) - { - // call_lasr( 'r', 'v', 'f', nrv, m-ll+1, work( 1 ), work( n ), v( 1, ll ), ldv ) - char side = 'R'; - char pivot = 'V'; - char direct = 'F'; - auto mm = m - ll + 1; - if(use_gpu) - { - call_lasr_gpu(side, pivot, direct, nrv, mm, work(1), work(n), v(1, ll), ldv, - dwork, stream); - } - else - { - call_lasr(side, pivot, direct, nrv, mm, work(1), work(n), v(1, ll), ldv); - } - } -#endif if(nru > 0) { @@ -2124,19 +2177,14 @@ static void bdsqr_single_template(char uplo, } L150: e(ll) = f; - // ---------------- - // test convergence - // ---------------- + /* + * test convergence + */ if(abs(e(ll)) <= thresh) e(ll) = zero; - - - - // ---------------------------------- - // update singular vectors if desired - // ---------------------------------- - -#ifdef USE_VT + /* + * update singular vectors if desired + */ if(ncvt > 0) { // call_lasr( 'l', 'v', 'b', m-ll+1, ncvt, work( nm12+1 ), work( @@ -2157,27 +2205,6 @@ static void 
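// A hedged sketch, not part of the patch: the argument validation rewritten
// near the start of this hunk folds every LAPACK xBDSQR check into a single
// nested conditional, which is easy to mis-parenthesize.  The same checks,
// written out sequentially with illustrative names (error codes follow the
// xBDSQR convention this routine assumes):
template <typename I>
static I bdsqr_check_args(bool upper, bool lower, I n, I ncvt, I nru, I ncc, I ldvt, I ldu, I ldc)
{
    auto max = [](I x, I y) { return (x > y) ? x : y; };

    if(!upper && !lower) return -1;    // uplo is neither upper nor lower bidiagonal
    if(n < 0)    return -2;
    if(ncvt < 0) return -3;
    if(nru < 0)  return -4;
    if(ncc < 0)  return -5;
    if(((ncvt == 0) && (ldvt < 1)) || ((ncvt > 0) && (ldvt < max(I(1), n))))
        return -9;                     // bad ldvt
    if(ldu < max(I(1), nru))
        return -11;                    // bad ldu
    if(((ncc == 0) && (ldc < 1)) || ((ncc > 0) && (ldc < max(I(1), n))))
        return -13;                    // bad ldc
    return 0;                          // arguments are consistent
}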
bdsqr_single_template(char uplo, vt(ll, 1), ldvt); } } -#else - - if(nrv > 0) - { - // call_lasr( 'r', 'v', 'b', nrv, m-ll+1, work( nm12+1 ), work( nm13+1 ), v( 1,ll ), ldv ) - char side = 'L'; - char pivot = 'V'; - char direct = 'B'; - auto mm = m - ll + 1; - if(use_gpu) - { - call_lasr_gpu(side, pivot, direct, nrv, mm, work(nm12 + 1), work(nm13 + 1), - v(1, ll), ldv, dwork, stream); - } - else - { - call_lasr(side, pivot, direct, nrv, mm, work(nm12 + 1), work(nm13 + 1), - v(1, ll), ldv); - } - } -#endif if(nru > 0) { // call_lasr( 'r', 'v', 'b', nru, m-ll+1, work( 1 ), work( n ), u( 1, @@ -2216,7 +2243,6 @@ static void bdsqr_single_template(char uplo, call_lasr(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), ldc); } } - } } /* @@ -2237,7 +2263,6 @@ static void bdsqr_single_template(char uplo, /* * change sign of singular vectors, if desired */ -#ifdef USE_VT if(ncvt > 0) { if(use_gpu) @@ -2249,107 +2274,73 @@ static void bdsqr_single_template(char uplo, call_scal(ncvt, negone, vt(i, 1), ldvt); } } -#else - - if(nrv > 0) - { - if(use_gpu) - { - call_scal_gpu(nrv, negone, v(1, i), ione); - } - else - { - call_scal(nrv, negone, v(1, i), ione); - } - } -#endif } } L170: - - if(need_sort) - { - /* + /* * sort the singular values into decreasing order (insertion sort on * singular values, but only one transposition per singular vector) */ - // do 190 i = 1, n - 1 - for(i = 1; i <= (n - 1); i++) - { - /* + // do 190 i = 1, n - 1 + for(i = 1; i <= (n - 1); i++) + { + /* * scan for smallest d(i) */ - isub = 1; - smin = d(1); - // do 180 j = 2, n + 1 - i - for(j = 2; j <= (n + 1 - i); j++) + isub = 1; + smin = d(1); + // do 180 j = 2, n + 1 - i + for(j = 2; j <= (n + 1 - i); j++) + { + if(d(j) <= smin) { - if(d(j) <= smin) + isub = j; + smin = d(j); + } + } + L180: + if(isub != n + 1 - i) + { + /* + * swap singular values and vectors + */ + d(isub) = d(n + 1 - i); + d(n + 1 - i) = smin; + if(ncvt > 0) + { + if(use_gpu) { - isub = j; - smin = d(j); + call_swap_gpu(ncvt, vt(isub, 1), ldvt, vt(n + 1 - i, 1), ldvt); + } + else + { + call_swap(ncvt, vt(isub, 1), ldvt, vt(n + 1 - i, 1), ldvt); } } - L180: - if(isub != n + 1 - i) + if(nru > 0) { - /* - * swap singular values and vectors - */ - d(isub) = d(n + 1 - i); - d(n + 1 - i) = smin; -#ifdef USE_VT - if(ncvt > 0) + if(use_gpu) { - if(use_gpu) - { - call_swap_gpu(ncvt, vt(isub, 1), ldvt, vt(n + 1 - i, 1), ldvt); - } - else - { - call_swap(ncvt, vt(isub, 1), ldvt, vt(n + 1 - i, 1), ldvt); - } + call_swap_gpu(nru, u(1, isub), ione, u(1, n + 1 - i), ione); } -#else - - if(nrv > 0) + else { - if(use_gpu) - { - call_swap_gpu(nrv, v(1, isub), ione, v(1, n + 1 - i), ione); - } - else - { - call_swap(nrv, v(1, isub), ione, v(1, n + 1 - i), ione); - } + call_swap(nru, u(1, isub), ione, u(1, n + 1 - i), ione); } -#endif - if(nru > 0) + } + if(ncc > 0) + { + if(use_gpu) { - if(use_gpu) - { - call_swap_gpu(nru, u(1, isub), ione, u(1, n + 1 - i), ione); - } - else - { - call_swap(nru, u(1, isub), ione, u(1, n + 1 - i), ione); - } + call_swap_gpu(ncc, c(isub, 1), ldc, c(n + 1 - i, 1), ldc); } - if(ncc > 0) + else { - if(use_gpu) - { - call_swap_gpu(ncc, c(isub, 1), ldc, c(n + 1 - i, 1), ldc); - } - else - { - call_swap(ncc, c(isub, 1), ldc, c(n + 1 - i, 1), ldc); - } + call_swap(ncc, c(isub, 1), ldc, c(n + 1 - i, 1), ldc); } } } - } // end if (need_sort) - + } L190: goto L220; /* @@ -2365,15 +2356,6 @@ static void bdsqr_single_template(char uplo, } L210: L220: - if(idebug >= 2) - { - printf("single exit\n"); - for(auto i = 1; i <= (n - 1); i++) - { - 
printf("e(%d) = %le\n", i, e(i)); - } - fflush(stdout); - } return; /* * end of dbdsqr @@ -2419,15 +2401,14 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, S* hD = nullptr; S* hE = nullptr; - HIP_CHECK(hipHostMalloc(&hD, sizeof(S) * n * batch_count)); + HIP_CHECK(hipHostMalloc(&hD, (sizeof(S) * std::max(1, batch_count)) * n)); // ------------------------------------------------------------------ // Need to double checking whether array E is length n or length (n-1) // ------------------------------------------------------------------ - // bool const use_single_copy_for_E = (strideE == (n - 1)) || (strideE == (n)); - bool const use_single_copy_for_E = false; + bool const use_single_copy_for_E = (batch_count == 1) || (strideE == (n - 1)) || (strideE == (n)); - size_t E_size = use_single_copy_for_E ? strideE : (n - 1); - HIP_CHECK(hipHostMalloc(&hE, sizeof(S) * E_size * batch_count)); + size_t E_size = (batch_count == 1) || (strideE == (n - 1)) ? (n - 1) : n; + HIP_CHECK(hipHostMalloc(&hE, (sizeof(S) * std::max(1, batch_count)) * E_size)); // ---------------------------------------------------- // copy info_array[] on device to linfo_array[] on host @@ -2447,8 +2428,7 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, // transfer arrays D(:) and E(:) from Device to Host // ------------------------------------------------- - // bool const use_single_copy_for_D = (strideD == n); - bool const use_single_copy_for_D = false; + bool const use_single_copy_for_D = (batch_count == 1) || (strideD == n); if(use_single_copy_for_D) { void* const dst = (void*)&(hD[0]); @@ -2512,8 +2492,8 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, } } - S * dwork = nullptr; - HIP_CHECK( hipMalloc( &dwork, sizeof(S)*(4*n)) ); + S* dwork = nullptr; + HIP_CHECK(hipMalloc(&dwork, sizeof(S) * (4 * n))); for(I bid = 0; bid < batch_count; bid++) { @@ -2529,13 +2509,11 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, S* work_ = &(hwork[0]); // S* dwork = &(work[bid * (4 * n)]); - I info = 0; I nru = nu; I ncc = nc; -#ifdef USE_VT // ------------------------------------------------------- // NOTE: lapack dbdsqr() accepts "VT" and "ldvt" for transpose of V // as input variable @@ -2547,15 +2525,39 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, I nrv = n; I ncvt = nv; - bdsqr_single_template(uplo, n, ncvt, nru, ncc, d_, e_, vt_, ldvt, u_, ldu, c_, ldc, - work_, info, dwork, stream); -#else - I ncv = nv; + bool const values_only = (ncvt == 0) && (nru == 0) && (ncc == 0); + if(values_only) + { + // -------------------------------- + // call the lapack version of bdsqr + // -------------------------------- + auto ln = n; + auto lncvt = ncvt; + auto lnru = nru; + auto lncc = ncc; + S& d_arg = d_[0]; + S& e_arg = e_[0]; + T& vt_arg = vt_[0]; + T& u_arg = u_[0]; + T& c_arg = c_[0]; + S& work_arg = work_[0]; + auto ldvt_arg = ldvt; + auto ldu_arg = ldu; + auto ldc_arg = ldc; + + call_bdsqr(uplo, ln, lncvt, lnru, lncc, d_arg, e_arg, vt_arg, ldvt_arg, u_arg, ldu_arg, + c_arg, ldu_arg, work_arg, info); + } + else + { + bdsqr_single_template(uplo, n, ncvt, nru, ncc, - bdsqr_single_template(uplo, n, ncv, nru, ncc, d_, e_, v_, ldv, u_, ldu, c_, ldc, - work_, info, dwork, stream); + d_, e_, -#endif + vt_, ldvt, u_, ldu, c_, ldc, + + work_, info, dwork, stream); + } if(linfo_array[bid] == 0) { @@ -2609,6 +2611,8 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, } } + 
HIP_CHECK(hipStreamSynchronize(stream)); + if(idebug >= 2) { printf("on exit\n"); @@ -2627,7 +2631,7 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, HIP_CHECK(hipHostFree(hE)); hE = nullptr; - HIP_CHECK( hipFree( dwork )); + HIP_CHECK(hipFree(dwork)); dwork = nullptr; if(idebug >= 1) From 3fc89f708bebc0e44d3fa3a1b40d364fd94d7c03 Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Mon, 17 Jun 2024 15:53:27 -0400 Subject: [PATCH 07/35] debug snapshot --- library/src/auxiliary/bdsqr_host.hpp | 111 +++++++++++++++------------ 1 file changed, 60 insertions(+), 51 deletions(-) diff --git a/library/src/auxiliary/bdsqr_host.hpp b/library/src/auxiliary/bdsqr_host.hpp index c368c271d..8ee126a9a 100644 --- a/library/src/auxiliary/bdsqr_host.hpp +++ b/library/src/auxiliary/bdsqr_host.hpp @@ -2401,29 +2401,28 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, S* hD = nullptr; S* hE = nullptr; - HIP_CHECK(hipHostMalloc(&hD, (sizeof(S) * std::max(1, batch_count)) * n)); - // ------------------------------------------------------------------ - // Need to double checking whether array E is length n or length (n-1) - // ------------------------------------------------------------------ - bool const use_single_copy_for_E = (batch_count == 1) || (strideE == (n - 1)) || (strideE == (n)); - size_t E_size = (batch_count == 1) || (strideE == (n - 1)) ? (n - 1) : n; + size_t const E_size = (n - 1); + HIP_CHECK(hipHostMalloc(&hD, (sizeof(S) * std::max(1, batch_count)) * n)); HIP_CHECK(hipHostMalloc(&hE, (sizeof(S) * std::max(1, batch_count)) * E_size)); // ---------------------------------------------------- // copy info_array[] on device to linfo_array[] on host // ---------------------------------------------------- I* linfo_array = nullptr; - HIP_CHECK(hipHostMalloc(&linfo_array, sizeof(I) * batch_count)); + HIP_CHECK(hipHostMalloc(&linfo_array, sizeof(I) * std::max(1, batch_count))); { - void* dst = &(linfo_array[0]); - void* src = info_array; - size_t nbytes = sizeof(I) * batch_count; + void* const dst = &(linfo_array[0]); + void* const src = &(info_array[0]); + size_t const nbytes = sizeof(I) * batch_count; hipMemcpyKind const kind = hipMemcpyDeviceToHost; HIP_CHECK(hipMemcpyAsync(dst, src, nbytes, kind, stream)); } + S* hwork = nullptr; + HIP_CHECK(hipHostMalloc(&hwork, sizeof(S) * (4 * n))); + // ------------------------------------------------- // transfer arrays D(:) and E(:) from Device to Host // ------------------------------------------------- @@ -2450,16 +2449,6 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, } } - if(use_single_copy_for_E) - { - void* const dst = (void*)&(hE[0]); - void* const src = (void*)&(E[0]); - size_t const sizeBytes = sizeof(S) * E_size * batch_count; - hipMemcpyKind const kind = hipMemcpyDeviceToHost; - - HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); - } - else { for(I bid = 0; bid < batch_count; bid++) { @@ -2497,7 +2486,17 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, for(I bid = 0; bid < batch_count; bid++) { - std::vector hwork(4 * n); + if(idebug >= 1) + { + printf("on entry: linfo_array[%d] = %d\n", bid, linfo_array[bid]); + } + + if(linfo_array[bid] != 0) + { + continue; + }; + + // std::vector hwork(4 * n); char uplo = (uplo_in == rocblas_fill_lower) ? 
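// A hedged sketch, not part of the patch: the loops above stage the strided
// device arrays D and E into the pinned host buffers hD/hE one batch instance
// at a time before the host-side iteration runs.  The pattern they follow,
// with illustrative names (HIP_CHECK is the error-checking macro already used
// in this file):
template <typename S, typename I>
static void copy_strided_to_host(const S* dA, rocblas_stride strideA, S* hA,
                                 I len, I batch_count, hipStream_t stream)
{
    for(I bid = 0; bid < batch_count; bid++)
    {
        const void* const src = dA + bid * strideA;   // start of instance bid on the device
        void* const dst = hA + bid * size_t(len);     // packed contiguously on the host
        HIP_CHECK(hipMemcpyAsync(dst, src, sizeof(S) * len, hipMemcpyDeviceToHost, stream));
    }
    // all copies are ordered on the same stream; synchronize once before the
    // host-side code reads hA
    HIP_CHECK(hipStreamSynchronize(stream));
}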
'L' : 'U'; S* d_ = &(hD[bid * n]); @@ -2559,6 +2558,18 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, work_, info, dwork, stream); } + if(info == 0) + { + // ---------------------------- + // explicitly zero out "E" array + // ---------------------------- + S const zero = S(0); + for(I i = 0; i < (n - 1); i++) + { + e_[i] = zero; + } + } + if(linfo_array[bid] == 0) { linfo_array[bid] = info; @@ -2590,28 +2601,32 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, } } - if(use_single_copy_for_E) - { - void* const src = (void*)&(hE[0]); - void* const dst = (void*)E; - size_t const sizeBytes = sizeof(S) * E_size * batch_count; - hipMemcpyKind const kind = hipMemcpyHostToDevice; - - HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); - } - else { for(I bid = 0; bid < batch_count; bid++) { void* const src = (void*)&(hE[bid * E_size]); - void* const dst = (void*)(E + bid * strideE); + void* const dst = (void*)&(E[bid * strideE]); size_t const sizeBytes = sizeof(S) * E_size; hipMemcpyKind const kind = hipMemcpyHostToDevice; HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); } } + { + // ------------------------------------------------------ + // copy linfo_array[] from host to info_array[] on device + // ------------------------------------------------------ + + void* const src = (void*)&(linfo_array[0]); + void* const dst = (void*)&(info_array[0]); + size_t const nbytes = sizeof(I) * batch_count; + hipMemcpyKind const kind = hipMemcpyHostToDevice; + + HIP_CHECK(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + HIP_CHECK(hipStreamSynchronize(stream)); + HIP_CHECK(hipDeviceSynchronize()); if(idebug >= 2) { @@ -2626,14 +2641,6 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, } } - HIP_CHECK(hipHostFree(hD)); - hD = nullptr; - HIP_CHECK(hipHostFree(hE)); - hE = nullptr; - - HIP_CHECK(hipFree(dwork)); - dwork = nullptr; - if(idebug >= 1) { for(auto bid = 0; bid < batch_count; bid++) @@ -2642,19 +2649,21 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, } } - { - // ------------------------------------------------------ - // copy linfo_array[] from host to info_array[] on device - // ------------------------------------------------------ + // ---------------------- + // free allocated storage + // ---------------------- - void* dst = (void*)info_array; - void* src = (void*)&(linfo_array[0]); - size_t nbytes = sizeof(I) * batch_count; - hipMemcpyKind const kind = hipMemcpyHostToDevice; + HIP_CHECK(hipHostFree(hwork)); + hwork = nullptr; - HIP_CHECK(hipMemcpyAsync(dst, src, nbytes, kind, stream)); - HIP_CHECK(hipStreamSynchronize(stream)); - } + HIP_CHECK(hipHostFree(hD)); + hD = nullptr; + + HIP_CHECK(hipHostFree(hE)); + hE = nullptr; + + HIP_CHECK(hipFree(dwork)); + dwork = nullptr; HIP_CHECK(hipHostFree(linfo_array)); linfo_array = nullptr; From 3381870b6c729aaa3bb271d04795559be846625f Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Mon, 17 Jun 2024 16:12:56 -0400 Subject: [PATCH 08/35] initial working version --- library/src/auxiliary/bdsqr_host.hpp | 48 ---------------------------- 1 file changed, 48 deletions(-) diff --git a/library/src/auxiliary/bdsqr_host.hpp b/library/src/auxiliary/bdsqr_host.hpp index 8ee126a9a..0ec3436b1 100644 --- a/library/src/auxiliary/bdsqr_host.hpp +++ b/library/src/auxiliary/bdsqr_host.hpp @@ -2390,8 +2390,6 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, I* splits_map, S* work) { - int const idebug = 
2; - // ------------------------- // copy D into hD, E into hE // ------------------------- @@ -2462,35 +2460,11 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, HIP_CHECK(hipStreamSynchronize(stream)); - if(idebug >= 1) - { - printf("batch_count = %d\n", batch_count); - printf("n = %d, strideD = %ld, strideE = %ld\n", n, (int64_t)strideD, (int64_t)strideE); - printf("nv = %d, nu = %d, nc = %d\n", nv, nu, nc); - } - if(idebug >= 2) - { - printf("on entry\n"); - for(auto i = 0; i < n; i++) - { - printf("hD[%d] = %le\n", i, hD[i]); - } - for(auto i = 0; i < (n - 1); i++) - { - printf("hE[%d] = %le\n", i, hE[i]); - } - } - S* dwork = nullptr; HIP_CHECK(hipMalloc(&dwork, sizeof(S) * (4 * n))); for(I bid = 0; bid < batch_count; bid++) { - if(idebug >= 1) - { - printf("on entry: linfo_array[%d] = %d\n", bid, linfo_array[bid]); - } - if(linfo_array[bid] != 0) { continue; @@ -2626,28 +2600,6 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, } HIP_CHECK(hipStreamSynchronize(stream)); - HIP_CHECK(hipDeviceSynchronize()); - - if(idebug >= 2) - { - printf("on exit\n"); - for(auto i = 0; i < n; i++) - { - printf("hD[%d] = %le\n", i, hD[i]); - } - for(auto i = 0; i < (n - 1); i++) - { - printf("hE[%d] = %le\n", i, hE[i]); - } - } - - if(idebug >= 1) - { - for(auto bid = 0; bid < batch_count; bid++) - { - printf("linfo_array[%d] = %d\n", bid, linfo_array[bid]); - } - } // ---------------------- // free allocated storage From 915c972a18a194b8747f16322b3136e26ec2ca94 Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Mon, 17 Jun 2024 16:19:52 -0400 Subject: [PATCH 09/35] rename bdsqr_host.hpp to rocauxiliary_bdsqr_host.hpp --- library/src/auxiliary/rocauxiliary_bdsqr.hpp | 2 +- .../auxiliary/{bdsqr_host.hpp => rocauxiliary_bdsqr_host.hpp} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename library/src/auxiliary/{bdsqr_host.hpp => rocauxiliary_bdsqr_host.hpp} (100%) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr.hpp b/library/src/auxiliary/rocauxiliary_bdsqr.hpp index 5718f50e0..937a52f1a 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr.hpp @@ -38,7 +38,7 @@ #include -#include "bdsqr_host.hpp" +#include "rocauxiliary_bdsqr_host.hpp" ROCSOLVER_BEGIN_NAMESPACE diff --git a/library/src/auxiliary/bdsqr_host.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp similarity index 100% rename from library/src/auxiliary/bdsqr_host.hpp rename to library/src/auxiliary/rocauxiliary_bdsqr_host.hpp From d8b24bec0e6b93fc2a825a971dee7cf422baaa3e Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Tue, 18 Jun 2024 10:50:39 -0400 Subject: [PATCH 10/35] use more cache by using more thread blocks --- library/src/auxiliary/rocauxiliary_bdsqr_host.hpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp index 0ec3436b1..99a5340be 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp @@ -74,12 +74,13 @@ __global__ static void lasr_kernel(char const side, auto max = [](auto x, auto y) { return ((x > y) ? x : y); }; auto min = [](auto x, auto y) { return ((x < y) ? 
x : y); }; - auto indx2f = [](auto i, auto j, auto lda) -> int64_t { + auto indx2f = [](auto i, auto j, auto lda) { assert((1 <= i)); assert((1 <= lda)); assert((1 <= j)); - return ((i - 1) + (j - 1) * int64_t(lda)); + // return ((i - 1) + (j - 1) * int64_t(lda)); + return (i + j * lda - (1 + lda)); }; auto indx1f = [](auto i) -> int64_t { @@ -529,7 +530,7 @@ static void lasr_template_gpu(char const side, I const lda, hipStream_t stream = 0) { - auto const nthreads = 2 * warpSize; + auto const nthreads = warpSize; bool const is_left_side = (side == 'L') || (side == 'l'); auto const mn = (is_left_side) ? n : m; From f8cf1bd3b6bb95e8d4e8449473ea70d9b7f63fe4 Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Tue, 18 Jun 2024 12:07:15 -0400 Subject: [PATCH 11/35] development snapshot --- .../src/auxiliary/rocauxiliary_bdsqr_host.hpp | 300 ++++++++++++++---- 1 file changed, 246 insertions(+), 54 deletions(-) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp index 99a5340be..d144b31a8 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp @@ -1286,10 +1286,10 @@ static void bdsqr_single_template(char uplo, S* work_, I& info, - S* dwork = nullptr, + S* dwork_ = nullptr, hipStream_t stream = 0) { - bool const use_gpu = (dwork != nullptr); + bool const use_gpu = (dwork_ != nullptr); S const zero = 0; S const one = 1; @@ -1345,14 +1345,25 @@ static void bdsqr_single_template(char uplo, auto call_scal_gpu = [=](I n, auto da, T& x, I incx) { scal_template(n, da, &x, incx, stream); }; + auto call_lasr_gpu_nocopy = [=](char const side, char const pivot, char const direct, I const m, + I const n, S& dc, S& ds, T& A, I const lda, hipStream_t stream) { + bool const is_left_side = (side == 'L') || (side == 'l'); + auto const mn = (is_left_side) ? m : n; + auto const mn_m1 = (mn - 1); + + lasr_template_gpu(side, pivot, direct, m, n, &dc, &ds, &A, lda, stream); + }; + auto call_lasr_gpu = [=](char const side, char const pivot, char const direct, I const m, I const n, S& c, - S& s, T& A, I const lda, S* const dwork, hipStream_t stream) { + S& s, T& A, I const lda, S* const dwork_, hipStream_t stream) { bool const is_left_side = (side == 'L') || (side == 'l'); auto const mn = (is_left_side) ? 
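// A hedged sketch, not part of the patch: the folded indx2f above relies on
// the identity (i - 1) + (j - 1) * lda == i + j * lda - (1 + lda), so both
// forms address the same element.  The original promoted the column term to
// int64_t; a version of the folded form that keeps the 64-bit arithmetic
// (the function name is illustrative):
static inline int64_t indx2f_wide(int64_t i, int64_t j, int64_t lda)
{
    // same element as (i - 1) + (j - 1) * lda, computed entirely in 64 bits
    return i + j * lda - (1 + lda);
}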
m : n; auto const mn_m1 = (mn - 1); - S* const dc = dwork; - S* const ds = dwork + mn_m1; + S* const dc = dwork_; + S* const ds = dwork_ + mn_m1; + CHECK_HIP(hipStreamSynchronize(stream)); + CHECK_HIP(hipMemcpyAsync(dc, &c, sizeof(S) * mn_m1, hipMemcpyHostToDevice, stream)); CHECK_HIP(hipMemcpyAsync(ds, &s, sizeof(S) * mn_m1, hipMemcpyHostToDevice, stream)); @@ -1378,6 +1389,7 @@ static void bdsqr_single_template(char uplo, return (e_[i - 1]); }; auto work = [=](auto i) -> S& { return (work_[i - 1]); }; + auto dwork = [=](auto i) -> S& { return (dwork_[i - 1]); }; auto c = [=](auto i, auto j) -> T& { assert((1 <= i) && (i <= nrc) && (nrc <= ldc)); @@ -1435,6 +1447,10 @@ static void bdsqr_single_template(char uplo, if(n == 0) return; + + bool const need_update_singular_vectors = (nru > 0) || (ncc > 0); + bool constexpr use_lasr_gpu_nocopy = true; + if(n == 1) goto L160; /* @@ -1484,9 +1500,32 @@ static void bdsqr_single_template(char uplo, } L10: - /* - * update singular vectors if desired - */ + // ---------------------------------- + // update singular vectors if desired + // ---------------------------------- + CHECK_HIP(hipStreamSynchronize(stream)); + + if(need_update_singular_vectors) + { + // -------------- + // copy rotations + // -------------- + size_t const nbytes = sizeof(S) * (n - 1); + hipMemcpyKind const kind = hipMemcpyHostToDevice; + + { + void* const src = (void*)&(work(1)); + void* const dst = (void*)&(dwork(1)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(n)); + void* const dst = (void*)&(dwork(n)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + } + CHECK_HIP(hipStreamSynchronize(stream)); if(nru > 0) { @@ -1496,8 +1535,16 @@ static void bdsqr_single_template(char uplo, char direct = 'F'; if(use_gpu) { - call_lasr_gpu(side, pivot, direct, nru, n, work(1), work(n), u(1, 1), ldu, dwork, - stream); + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, nru, n, dwork(1), dwork(n), u(1, 1), + ldu, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, nru, n, work(1), work(n), u(1, 1), ldu, + dwork_, stream); + } } else { @@ -1512,8 +1559,16 @@ static void bdsqr_single_template(char uplo, char direct = 'F'; if(use_gpu) { - call_lasr_gpu(side, pivot, direct, n, ncc, work(1), work(n), c(1, 1), ldc, dwork, - stream); + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, n, ncc, dwork(1), dwork(n), c(1, 1), + ldc, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, n, ncc, work(1), work(n), c(1, 1), ldc, + dwork_, stream); + } } else { @@ -1876,9 +1931,54 @@ static void bdsqr_single_template(char uplo, h = d(m) * cs; d(m) = h * oldcs; e(m - 1) = h * oldsn; - /* - * update singular vectors - */ + // + // ----------------------- + // update singular vectors + // ----------------------- + + CHECK_HIP(hipStreamSynchronize(stream)); + + if(rotate) + { + // -------------- + // copy rotations + // -------------- + size_t const nbytes = sizeof(S) * (n - 1); + hipMemcpyKind const kind = hipMemcpyHostToDevice; + + if(ncvt > 0) + { + { + void* const src = (void*)&(work(1)); + void* const dst = (void*)&(dwork(1)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(n)); + void* const dst = (void*)&(dwork(n)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + } + + if((nru > 0) || (ncc > 0)) + { + { + void* const src = (void*)&(work(nm12)); + void* const dst = (void*)&(dwork(nm12)); + 
CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(nm13)); + void* const dst = (void*)&(dwork(nm13)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + } + } + + CHECK_HIP(hipStreamSynchronize(stream)); + if(ncvt > 0) { // call_lasr( 'l', 'v', 'f', m-ll+1, ncvt, work( 1 ), work( n ), vt( @@ -1889,8 +1989,16 @@ static void bdsqr_single_template(char uplo, auto mm = m - ll + 1; if(use_gpu) { - call_lasr_gpu(side, pivot, direct, mm, ncvt, work(1), work(n), vt(ll, 1), ldvt, - dwork, stream); + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, mm, ncvt, dwork(1), dwork(n), + vt(ll, 1), ldvt, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, mm, ncvt, work(1), work(n), vt(ll, 1), + ldvt, dwork_, stream); + } } else { @@ -1907,8 +2015,16 @@ static void bdsqr_single_template(char uplo, auto mm = m - ll + 1; if(use_gpu) { - call_lasr_gpu(side, pivot, direct, nru, mm, work(nm12 + 1), work(nm13 + 1), - u(1, ll), ldu, dwork, stream); + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, nru, mm, dwork(nm12 + 1), + dwork(nm13 + 1), u(1, ll), ldu, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, nru, mm, work(nm12 + 1), work(nm13 + 1), + u(1, ll), ldu, dwork_, stream); + } } else { @@ -1926,8 +2042,16 @@ static void bdsqr_single_template(char uplo, auto mm = m - ll + 1; if(use_gpu) { - call_lasr_gpu(side, pivot, direct, mm, ncc, work(nm12 + 1), work(nm13 + 1), - c(ll, 1), ldc, dwork, stream); + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, mm, ncc, dwork(nm12 + 1), + dwork(nm13 + 1), c(ll, 1), ldc, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, mm, ncc, work(nm12 + 1), work(nm13 + 1), + c(ll, 1), ldc, dwork_, stream); + } } else { @@ -1935,9 +2059,9 @@ static void bdsqr_single_template(char uplo, c(ll, 1), ldc); } } - /* - * test convergence - */ + // + // test convergence + // if(abs(e(m - 1)) <= thresh) e(m - 1) = zero; } @@ -1971,9 +2095,53 @@ static void bdsqr_single_template(char uplo, h = d(ll) * cs; d(ll) = h * oldcs; e(ll) = h * oldsn; - /* - * update singular vectors - */ + // + // update singular vectors + // + + CHECK_HIP(hipStreamSynchronize(stream)); + + if(rotate) + { + // -------------- + // copy rotations + // -------------- + size_t const nbytes = sizeof(S) * (n - 1); + hipMemcpyKind const kind = hipMemcpyHostToDevice; + + if((nru > 0) || (ncc > 0)) + { + { + void* const src = (void*)&(work(1)); + void* const dst = (void*)&(dwork(1)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(n)); + void* const dst = (void*)&(dwork(n)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + } + + if(ncvt > 0) + { + { + void* const src = (void*)&(work(nm12)); + void* const dst = (void*)&(dwork(nm12)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(nm13)); + void* const dst = (void*)&(dwork(nm13)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + } + } + + CHECK_HIP(hipStreamSynchronize(stream)); + if(ncvt > 0) { // call_lasr( 'l', 'v', 'b', m-ll+1, ncvt, work( nm12+1 ), work( @@ -1985,8 +2153,16 @@ static void bdsqr_single_template(char uplo, auto mm = m - ll + 1; if(use_gpu) { - call_lasr_gpu(side, pivot, direct, mm, ncvt, work(nm12 + 1), work(nm13 + 1), - vt(ll, 1), ldvt, dwork, stream); + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, mm, ncvt, dwork(nm12 + 
1), + dwork(nm13 + 1), vt(ll, 1), ldvt, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, mm, ncvt, work(nm12 + 1), work(nm13 + 1), + vt(ll, 1), ldvt, dwork_, stream); + } } else { @@ -2005,8 +2181,16 @@ static void bdsqr_single_template(char uplo, auto mm = m - ll + 1; if(use_gpu) { - call_lasr_gpu(side, pivot, direct, nru, mm, work(1), work(n), u(1, ll), ldu, - dwork, stream); + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, nru, mm, dwork(1), dwork(n), + u(1, ll), ldu, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, nru, mm, work(1), work(n), u(1, ll), ldu, + dwork_, stream); + } } else { @@ -2024,32 +2208,40 @@ static void bdsqr_single_template(char uplo, auto mm = m - ll + 1; if(use_gpu) { - call_lasr_gpu(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), ldc, - dwork, stream); + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, mm, ncc, dwork(1), dwork(n), + c(ll, 1), ldc, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), ldc, + dwork_, stream); + } } else { call_lasr(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), ldc); } } - /* - * test convergence - */ + // + // test convergence + // if(abs(e(ll)) <= thresh) e(ll) = zero; } } else { - /* - * use nonzero shift - */ + // + // use nonzero shift + // if(idir == 1) { - /* - * chase bulge from top to bottom - * save cosines and sines for later singular vector updates - */ + // + // chase bulge from top to bottom + // save cosines and sines for later singular vector updates + // f = (abs(d(ll)) - shift) * (sign(one, d(ll)) + shift / d(ll)); g = e(ll); // do 140 i = ll, m - 1 @@ -2092,7 +2284,7 @@ static void bdsqr_single_template(char uplo, if(use_gpu) { call_lasr_gpu(side, pivot, direct, mm, ncvt, work(1), work(n), vt(ll, 1), ldvt, - dwork, stream); + dwork_, stream); } else { @@ -2111,7 +2303,7 @@ static void bdsqr_single_template(char uplo, if(use_gpu) { call_lasr_gpu(side, pivot, direct, nru, mm, work(nm12 + 1), work(nm13 + 1), - u(1, ll), ldu, dwork, stream); + u(1, ll), ldu, dwork_, stream); } else { @@ -2130,7 +2322,7 @@ static void bdsqr_single_template(char uplo, if(use_gpu) { call_lasr_gpu(side, pivot, direct, mm, ncc, work(nm12 + 1), work(nm13 + 1), - c(ll, 1), ldc, dwork, stream); + c(ll, 1), ldc, dwork_, stream); } else { @@ -2198,7 +2390,7 @@ static void bdsqr_single_template(char uplo, if(use_gpu) { call_lasr_gpu(side, pivot, direct, mm, ncvt, work(nm12 + 1), work(nm13 + 1), - vt(ll, 1), ldvt, dwork, stream); + vt(ll, 1), ldvt, dwork_, stream); } else { @@ -2218,7 +2410,7 @@ static void bdsqr_single_template(char uplo, if(use_gpu) { call_lasr_gpu(side, pivot, direct, nru, mm, work(1), work(n), u(1, ll), ldu, - dwork, stream); + dwork_, stream); } else { @@ -2237,7 +2429,7 @@ static void bdsqr_single_template(char uplo, if(use_gpu) { call_lasr_gpu(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), ldc, - dwork, stream); + dwork_, stream); } else { @@ -2461,8 +2653,8 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, HIP_CHECK(hipStreamSynchronize(stream)); - S* dwork = nullptr; - HIP_CHECK(hipMalloc(&dwork, sizeof(S) * (4 * n))); + S* dwork_ = nullptr; + HIP_CHECK(hipMalloc(&dwork_, sizeof(S) * (4 * n))); for(I bid = 0; bid < batch_count; bid++) { @@ -2530,7 +2722,7 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, vt_, ldvt, u_, ldu, c_, ldc, - work_, info, dwork, stream); + work_, info, dwork_, stream); } if(info == 0) @@ -2615,8 
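// A hedged sketch, not part of the patch: the call_lasr_gpu path above stages
// the host-side cosine/sine rotation arrays into the device scratch buffer
// dwork_ and then launches the GPU LASR kernel with device pointers only.
// The essential pattern, with an illustrative helper name (CHECK_HIP is the
// macro defined earlier in this header):
template <typename S>
static void stage_rotations_to_device(const S* host_c, const S* host_s,
                                      S* dev_c, S* dev_s, size_t count,
                                      hipStream_t stream)
{
    size_t const nbytes = sizeof(S) * count;
    CHECK_HIP(hipMemcpyAsync(dev_c, host_c, nbytes, hipMemcpyHostToDevice, stream));
    CHECK_HIP(hipMemcpyAsync(dev_s, host_s, nbytes, hipMemcpyHostToDevice, stream));
    // the copies and the subsequent kernel launch are ordered on the same
    // stream, so the kernel sees the staged rotations without an extra sync
}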
+2807,8 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, HIP_CHECK(hipHostFree(hE)); hE = nullptr; - HIP_CHECK(hipFree(dwork)); - dwork = nullptr; + HIP_CHECK(hipFree(dwork_)); + dwork_ = nullptr; HIP_CHECK(hipHostFree(linfo_array)); linfo_array = nullptr; From 5ddbbf85c124f48849c8267b44d987604a3076ea Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Tue, 18 Jun 2024 12:15:35 -0400 Subject: [PATCH 12/35] development snapshot --- .../src/auxiliary/rocauxiliary_bdsqr_host.hpp | 86 +++++++++++++++++-- 1 file changed, 77 insertions(+), 9 deletions(-) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp index d144b31a8..48f12dc8c 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp @@ -2270,9 +2270,53 @@ static void bdsqr_single_template(char uplo, } L140: e(m - 1) = f; - /* - * update singular vectors - */ + // + // update singular vectors + // + + CHECK_HIP(hipStreamSynchronize(stream)); + + if(rotate) + { + // -------------- + // copy rotations + // -------------- + size_t const nbytes = sizeof(S) * (n - 1); + hipMemcpyKind const kind = hipMemcpyHostToDevice; + + if(ncvt > 0) + { + { + void* const src = (void*)&(work(1)); + void* const dst = (void*)&(dwork(1)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(n)); + void* const dst = (void*)&(dwork(n)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + } + + if((nru > 0) || (ncc > 0)) + { + { + void* const src = (void*)&(work(nm12)); + void* const dst = (void*)&(dwork(nm12)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(nm13)); + void* const dst = (void*)&(dwork(nm13)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + } + } + + CHECK_HIP(hipStreamSynchronize(stream)); + if(ncvt > 0) { // call_lasr( 'l', 'v', 'f', m-ll+1, ncvt, work( 1 ), work( n ), vt( @@ -2283,8 +2327,16 @@ static void bdsqr_single_template(char uplo, auto mm = m - ll + 1; if(use_gpu) { - call_lasr_gpu(side, pivot, direct, mm, ncvt, work(1), work(n), vt(ll, 1), ldvt, - dwork_, stream); + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, mm, ncvt, dwork(1), dwork(n), + vt(ll, 1), ldvt, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, mm, ncvt, work(1), work(n), vt(ll, 1), + ldvt, dwork_, stream); + } } else { @@ -2302,8 +2354,16 @@ static void bdsqr_single_template(char uplo, auto mm = m - ll + 1; if(use_gpu) { - call_lasr_gpu(side, pivot, direct, nru, mm, work(nm12 + 1), work(nm13 + 1), - u(1, ll), ldu, dwork_, stream); + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, nru, mm, dwork(nm12 + 1), + dwork(nm13 + 1), u(1, ll), ldu, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, nru, mm, work(nm12 + 1), work(nm13 + 1), + u(1, ll), ldu, dwork_, stream); + } } else { @@ -2321,8 +2381,16 @@ static void bdsqr_single_template(char uplo, auto mm = m - ll + 1; if(use_gpu) { - call_lasr_gpu(side, pivot, direct, mm, ncc, work(nm12 + 1), work(nm13 + 1), - c(ll, 1), ldc, dwork_, stream); + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, mm, ncc, dwork(nm12 + 1), + dwork(nm13 + 1), c(ll, 1), ldc, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, mm, ncc, work(nm12 + 1), work(nm13 + 1), + c(ll, 1), ldc, dwork_, stream); + } } else { From 129c3a6e78adfaac69be99f57049f7d9c8437d7e 
Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Tue, 18 Jun 2024 13:45:23 -0400 Subject: [PATCH 13/35] development snapshot --- .../src/auxiliary/rocauxiliary_bdsqr_host.hpp | 364 +++++++++++------- 1 file changed, 224 insertions(+), 140 deletions(-) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp index 48f12dc8c..316834dbd 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp @@ -1503,29 +1503,33 @@ static void bdsqr_single_template(char uplo, // ---------------------------------- // update singular vectors if desired // ---------------------------------- - CHECK_HIP(hipStreamSynchronize(stream)); - if(need_update_singular_vectors) + if(use_lasr_gpu_nocopy) { - // -------------- - // copy rotations - // -------------- - size_t const nbytes = sizeof(S) * (n - 1); - hipMemcpyKind const kind = hipMemcpyHostToDevice; + CHECK_HIP(hipStreamSynchronize(stream)); + if(need_update_singular_vectors) { - void* const src = (void*)&(work(1)); - void* const dst = (void*)&(dwork(1)); - CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); - } + // -------------- + // copy rotations + // -------------- + size_t const nbytes = sizeof(S) * (n - 1); + hipMemcpyKind const kind = hipMemcpyHostToDevice; - { - void* const src = (void*)&(work(n)); - void* const dst = (void*)&(dwork(n)); - CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + { + void* const src = (void*)&(work(1)); + void* const dst = (void*)&(dwork(1)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(n)); + void* const dst = (void*)&(dwork(n)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } } + CHECK_HIP(hipStreamSynchronize(stream)); } - CHECK_HIP(hipStreamSynchronize(stream)); if(nru > 0) { @@ -1936,48 +1940,51 @@ static void bdsqr_single_template(char uplo, // update singular vectors // ----------------------- - CHECK_HIP(hipStreamSynchronize(stream)); - - if(rotate) + if(use_lasr_gpu_nocopy) { - // -------------- - // copy rotations - // -------------- - size_t const nbytes = sizeof(S) * (n - 1); - hipMemcpyKind const kind = hipMemcpyHostToDevice; + CHECK_HIP(hipStreamSynchronize(stream)); - if(ncvt > 0) + if(rotate) { - { - void* const src = (void*)&(work(1)); - void* const dst = (void*)&(dwork(1)); - CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); - } + // -------------- + // copy rotations + // -------------- + size_t const nbytes = sizeof(S) * (n - 1); + hipMemcpyKind const kind = hipMemcpyHostToDevice; + if(ncvt > 0) { - void* const src = (void*)&(work(n)); - void* const dst = (void*)&(dwork(n)); - CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + { + void* const src = (void*)&(work(1)); + void* const dst = (void*)&(dwork(1)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(n)); + void* const dst = (void*)&(dwork(n)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } } - } - if((nru > 0) || (ncc > 0)) - { + if((nru > 0) || (ncc > 0)) { - void* const src = (void*)&(work(nm12)); - void* const dst = (void*)&(dwork(nm12)); - CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); - } - - { - void* const src = (void*)&(work(nm13)); - void* const dst = (void*)&(dwork(nm13)); - CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + { + void* const src = (void*)&(work(nm12)); + void* const dst = (void*)&(dwork(nm12)); + 
CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(nm13)); + void* const dst = (void*)&(dwork(nm13)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } } } - } - CHECK_HIP(hipStreamSynchronize(stream)); + CHECK_HIP(hipStreamSynchronize(stream)); + } if(ncvt > 0) { @@ -2099,48 +2106,51 @@ static void bdsqr_single_template(char uplo, // update singular vectors // - CHECK_HIP(hipStreamSynchronize(stream)); - - if(rotate) + if(use_lasr_gpu_nocopy) { - // -------------- - // copy rotations - // -------------- - size_t const nbytes = sizeof(S) * (n - 1); - hipMemcpyKind const kind = hipMemcpyHostToDevice; + CHECK_HIP(hipStreamSynchronize(stream)); - if((nru > 0) || (ncc > 0)) + if(rotate) { - { - void* const src = (void*)&(work(1)); - void* const dst = (void*)&(dwork(1)); - CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); - } - - { - void* const src = (void*)&(work(n)); - void* const dst = (void*)&(dwork(n)); - CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); - } - } + // -------------- + // copy rotations + // -------------- + size_t const nbytes = sizeof(S) * (n - 1); + hipMemcpyKind const kind = hipMemcpyHostToDevice; - if(ncvt > 0) - { + if((nru > 0) || (ncc > 0)) { - void* const src = (void*)&(work(nm12)); - void* const dst = (void*)&(dwork(nm12)); - CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + { + void* const src = (void*)&(work(1)); + void* const dst = (void*)&(dwork(1)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(n)); + void* const dst = (void*)&(dwork(n)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } } + if(ncvt > 0) { - void* const src = (void*)&(work(nm13)); - void* const dst = (void*)&(dwork(nm13)); - CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + { + void* const src = (void*)&(work(nm12)); + void* const dst = (void*)&(dwork(nm12)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(nm13)); + void* const dst = (void*)&(dwork(nm13)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } } } - } - CHECK_HIP(hipStreamSynchronize(stream)); + CHECK_HIP(hipStreamSynchronize(stream)); + } if(ncvt > 0) { @@ -2274,48 +2284,51 @@ static void bdsqr_single_template(char uplo, // update singular vectors // - CHECK_HIP(hipStreamSynchronize(stream)); - - if(rotate) + if(use_lasr_gpu_nocopy) { - // -------------- - // copy rotations - // -------------- - size_t const nbytes = sizeof(S) * (n - 1); - hipMemcpyKind const kind = hipMemcpyHostToDevice; + CHECK_HIP(hipStreamSynchronize(stream)); - if(ncvt > 0) + if(rotate) { - { - void* const src = (void*)&(work(1)); - void* const dst = (void*)&(dwork(1)); - CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); - } + // -------------- + // copy rotations + // -------------- + size_t const nbytes = sizeof(S) * (n - 1); + hipMemcpyKind const kind = hipMemcpyHostToDevice; + if(ncvt > 0) { - void* const src = (void*)&(work(n)); - void* const dst = (void*)&(dwork(n)); - CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + { + void* const src = (void*)&(work(1)); + void* const dst = (void*)&(dwork(1)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(n)); + void* const dst = (void*)&(dwork(n)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } } - } - if((nru > 0) || (ncc > 0)) - { + if((nru > 0) || (ncc > 0)) { - void* 
const src = (void*)&(work(nm12)); - void* const dst = (void*)&(dwork(nm12)); - CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); - } - - { - void* const src = (void*)&(work(nm13)); - void* const dst = (void*)&(dwork(nm13)); - CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + { + void* const src = (void*)&(work(nm12)); + void* const dst = (void*)&(dwork(nm12)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(nm13)); + void* const dst = (void*)&(dwork(nm13)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } } } - } - CHECK_HIP(hipStreamSynchronize(stream)); + CHECK_HIP(hipStreamSynchronize(stream)); + } if(ncvt > 0) { @@ -2438,14 +2451,61 @@ static void bdsqr_single_template(char uplo, } L150: e(ll) = f; - /* - * test convergence - */ + // + // test convergence + // if(abs(e(ll)) <= thresh) e(ll) = zero; - /* - * update singular vectors if desired - */ + // + // update singular vectors if desired + // + + if(use_lasr_gpu_nocopy) + { + CHECK_HIP(hipStreamSynchronize(stream)); + + if(rotate) + { + // -------------- + // copy rotations + // -------------- + size_t const nbytes = sizeof(S) * (n - 1); + hipMemcpyKind const kind = hipMemcpyHostToDevice; + + if((nru > 0) || (ncc > 0)) + { + { + void* const src = (void*)&(work(1)); + void* const dst = (void*)&(dwork(1)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(n)); + void* const dst = (void*)&(dwork(n)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + } + + if(ncvt > 0) + { + { + void* const src = (void*)&(work(nm12)); + void* const dst = (void*)&(dwork(nm12)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(nm13)); + void* const dst = (void*)&(dwork(nm13)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + } + } + + CHECK_HIP(hipStreamSynchronize(stream)); + } + if(ncvt > 0) { // call_lasr( 'l', 'v', 'b', m-ll+1, ncvt, work( nm12+1 ), work( @@ -2457,8 +2517,16 @@ static void bdsqr_single_template(char uplo, auto mm = m - ll + 1; if(use_gpu) { - call_lasr_gpu(side, pivot, direct, mm, ncvt, work(nm12 + 1), work(nm13 + 1), - vt(ll, 1), ldvt, dwork_, stream); + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, mm, ncvt, dwork(nm12 + 1), + dwork(nm13 + 1), vt(ll, 1), ldvt, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, mm, ncvt, work(nm12 + 1), work(nm13 + 1), + vt(ll, 1), ldvt, dwork_, stream); + } } else { @@ -2477,8 +2545,16 @@ static void bdsqr_single_template(char uplo, auto mm = m - ll + 1; if(use_gpu) { - call_lasr_gpu(side, pivot, direct, nru, mm, work(1), work(n), u(1, ll), ldu, - dwork_, stream); + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, nru, mm, dwork(1), dwork(n), + u(1, ll), ldu, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, nru, mm, work(1), work(n), u(1, ll), ldu, + dwork_, stream); + } } else { @@ -2496,8 +2572,16 @@ static void bdsqr_single_template(char uplo, auto mm = m - ll + 1; if(use_gpu) { - call_lasr_gpu(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), ldc, - dwork_, stream); + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, mm, ncc, dwork(1), dwork(n), + c(ll, 1), ldc, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), ldc, + dwork_, stream); + } } else { @@ -2521,9 +2605,9 @@ static void bdsqr_single_template(char uplo, 
if(d(i) < zero) { d(i) = -d(i); - /* - * change sign of singular vectors, if desired - */ + // + // change sign of singular vectors, if desired + // if(ncvt > 0) { if(use_gpu) @@ -2538,16 +2622,16 @@ static void bdsqr_single_template(char uplo, } } L170: - /* - * sort the singular values into decreasing order (insertion sort on - * singular values, but only one transposition per singular vector) - */ + // + // sort the singular values into decreasing order (insertion sort on + // singular values, but only one transposition per singular vector) + // // do 190 i = 1, n - 1 for(i = 1; i <= (n - 1); i++) { - /* - * scan for smallest d(i) - */ + // + // scan for smallest d(i) + // isub = 1; smin = d(1); // do 180 j = 2, n + 1 - i @@ -2562,9 +2646,9 @@ static void bdsqr_single_template(char uplo, L180: if(isub != n + 1 - i) { - /* - * swap singular values and vectors - */ + // + // swap singular values and vectors + // d(isub) = d(n + 1 - i); d(n + 1 - i) = smin; if(ncvt > 0) @@ -2604,9 +2688,9 @@ static void bdsqr_single_template(char uplo, } L190: goto L220; -/* - * maximum number of iterations exceeded, failure to converge - */ +// +// maximum number of iterations exceeded, failure to converge +// L200: info = 0; // do 210 i = 1, n - 1 @@ -2618,9 +2702,9 @@ static void bdsqr_single_template(char uplo, L210: L220: return; - /* - * end of dbdsqr - */ + // + // end of dbdsqr + // } template From f02b735e5c77370004ddcd7fcc4f9c23f2f3eb41 Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Fri, 21 Jun 2024 11:57:09 -0400 Subject: [PATCH 14/35] handle batch case with array of pointers on device --- .../src/auxiliary/rocauxiliary_bdsqr_host.hpp | 177 +++++++++++++----- 1 file changed, 132 insertions(+), 45 deletions(-) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp index 316834dbd..34a21a212 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp @@ -1290,6 +1290,7 @@ static void bdsqr_single_template(char uplo, hipStream_t stream = 0) { bool const use_gpu = (dwork_ != nullptr); + bool constexpr need_sort = false; S const zero = 0; S const one = 1; @@ -1449,7 +1450,7 @@ static void bdsqr_single_template(char uplo, return; bool const need_update_singular_vectors = (nru > 0) || (ncc > 0); - bool constexpr use_lasr_gpu_nocopy = true; + bool constexpr use_lasr_gpu_nocopy = false; if(n == 1) goto L160; @@ -2590,6 +2591,7 @@ static void bdsqr_single_template(char uplo, } } } + CHECK_HIP(hipStreamSynchronize(stream)); /* * qr iteration finished, go back and check convergence */ @@ -2627,61 +2629,64 @@ static void bdsqr_single_template(char uplo, // singular values, but only one transposition per singular vector) // // do 190 i = 1, n - 1 - for(i = 1; i <= (n - 1); i++) + if(need_sort) { - // - // scan for smallest d(i) - // - isub = 1; - smin = d(1); - // do 180 j = 2, n + 1 - i - for(j = 2; j <= (n + 1 - i); j++) - { - if(d(j) <= smin) - { - isub = j; - smin = d(j); - } - } - L180: - if(isub != n + 1 - i) + for(i = 1; i <= (n - 1); i++) { // - // swap singular values and vectors + // scan for smallest d(i) // - d(isub) = d(n + 1 - i); - d(n + 1 - i) = smin; - if(ncvt > 0) + isub = 1; + smin = d(1); + // do 180 j = 2, n + 1 - i + for(j = 2; j <= (n + 1 - i); j++) { - if(use_gpu) - { - call_swap_gpu(ncvt, vt(isub, 1), ldvt, vt(n + 1 - i, 1), ldvt); - } - else + if(d(j) <= smin) { - call_swap(ncvt, vt(isub, 1), ldvt, vt(n + 1 - i, 1), ldvt); + isub = j; + smin = d(j); } } - 
if(nru > 0) + L180: + if(isub != n + 1 - i) { - if(use_gpu) - { - call_swap_gpu(nru, u(1, isub), ione, u(1, n + 1 - i), ione); - } - else + // + // swap singular values and vectors + // + d(isub) = d(n + 1 - i); + d(n + 1 - i) = smin; + if(ncvt > 0) { - call_swap(nru, u(1, isub), ione, u(1, n + 1 - i), ione); + if(use_gpu) + { + call_swap_gpu(ncvt, vt(isub, 1), ldvt, vt(n + 1 - i, 1), ldvt); + } + else + { + call_swap(ncvt, vt(isub, 1), ldvt, vt(n + 1 - i, 1), ldvt); + } } - } - if(ncc > 0) - { - if(use_gpu) + if(nru > 0) { - call_swap_gpu(ncc, c(isub, 1), ldc, c(n + 1 - i, 1), ldc); + if(use_gpu) + { + call_swap_gpu(nru, u(1, isub), ione, u(1, n + 1 - i), ione); + } + else + { + call_swap(nru, u(1, isub), ione, u(1, n + 1 - i), ione); + } } - else + if(ncc > 0) { - call_swap(ncc, c(isub, 1), ldc, c(n + 1 - i, 1), ldc); + if(use_gpu) + { + call_swap_gpu(ncc, c(isub, 1), ldc, c(n + 1 - i, 1), ldc); + } + else + { + call_swap(ncc, c(isub, 1), ldc, c(n + 1 - i, 1), ldc); + } } } } @@ -2718,15 +2723,15 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, const rocblas_stride strideD, S* E, const rocblas_stride strideE, - W1 V, + W1 V_arg, const I shiftV, const I ldv, const rocblas_stride strideV, - W2 U, + W2 U_arg, const I shiftU, const I ldu, const rocblas_stride strideU, - W3 C, + W3 C_arg, const I shiftC, const I ldc, const rocblas_stride strideC, @@ -2742,6 +2747,88 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, hipStream_t stream; rocblas_get_stream(handle, &stream); + W1 V = V_arg; + W2 U = U_arg; + W3 C = C_arg; + + auto is_device_pointer = [](void* ptr) -> bool { + hipPointerAttribute_t dev_attributes; + if(ptr == nullptr) + { + return (false); + } + + auto istat = hipPointerGetAttributes(&dev_attributes, ptr); + if(istat != hipSuccess) + { + std::cout << "is_device_pointer: istat = " << istat << " " << hipGetErrorName(istat) + << std::endl; + } + assert(istat == hipSuccess); + return (dev_attributes.type == hipMemoryTypeDevice); + }; + + // --------------------------------------------------- + // handle batch case with array of pointers on device + // --------------------------------------------------- + std::vector Vp_array(batch_count); + std::vector Up_array(batch_count); + std::vector Cp_array(batch_count); + + if(nv > 0) + { + bool const is_device_V_arg = is_device_pointer((void*)V_arg); + if(is_device_V_arg) + { + bool constexpr need_copy_W1 = !std::is_same::value; + if constexpr(need_copy_W1) + { + size_t const nbytes = sizeof(T*) * batch_count; + void* const dst = (void*)&(Vp_array[0]); + void* const src = (void*)V_arg; + HIP_CHECK(hipMemcpyAsync(dst, src, nbytes, hipMemcpyDefault, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + V = &(Vp_array[0]); + } + } + } + + if(nu > 0) + { + bool const is_device_U_arg = is_device_pointer((void*)U_arg); + if(is_device_U_arg) + { + bool constexpr need_copy_W2 = !std::is_same::value; + if constexpr(need_copy_W2) + { + size_t const nbytes = sizeof(T*) * batch_count; + void* const dst = (void*)&(Up_array[0]); + void* const src = (void*)U_arg; + HIP_CHECK(hipMemcpyAsync(dst, src, nbytes, hipMemcpyDefault, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + U = &(Up_array[0]); + } + } + } + + if(nc > 0) + { + bool const is_device_C_arg = is_device_pointer((void*)C_arg); + if(is_device_C_arg) + { + bool constexpr need_copy_W3 = !std::is_same::value; + if constexpr(need_copy_W3) + { + size_t const nbytes = sizeof(T*) * batch_count; + void* const dst = (void*)&(Cp_array[0]); + void* 
const src = (void*)C_arg; + HIP_CHECK(hipMemcpyAsync(dst, src, nbytes, hipMemcpyDefault, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + C = &(Cp_array[0]); + } + } + } + S* hD = nullptr; S* hE = nullptr; From 7e2dc7e7e3f8fe0b43c96cd72e494e8f04ead428 Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Sat, 22 Jun 2024 10:21:11 -0400 Subject: [PATCH 15/35] minor update --- .../src/auxiliary/rocauxiliary_bdsqr_host.hpp | 36 +++++++++++++++---- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp index 34a21a212..7438bfe9d 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp @@ -1290,8 +1290,21 @@ static void bdsqr_single_template(char uplo, hipStream_t stream = 0) { bool const use_gpu = (dwork_ != nullptr); + + // ----------------------------------- + // Lapack code used O(n^2) algorithm for sorting + // Consider turning off this and rely on + // bdsqr_sort() to perform sorting + // ----------------------------------- bool constexpr need_sort = false; + // --------------------------------------------------- + // NOTE: lasq1 may return non-zero info value that + // has a different meaning + // Consider turning off lasq1 to have consistent info value + // --------------------------------------------------- + bool constexpr use_lasq1 = false; + S const zero = 0; S const one = 1; S negone = -1; @@ -1372,7 +1385,7 @@ static void bdsqr_single_template(char uplo, CHECK_HIP(hipStreamSynchronize(stream)); }; - auto abs = [](auto x) { return ((x >= 0) ? x : (-x)); }; + auto abs = [](auto x) { return (std::abs(x)); }; auto indx2f = [](auto i, auto j, auto ld) -> int64_t { assert((1 <= i) && (i <= ld)); @@ -1457,7 +1470,7 @@ static void bdsqr_single_template(char uplo, /* * if no singular vectors desired, use qd algorithm */ - if(!rotate) + if((!rotate) && (use_lasq1)) { call_lasq1(n, d(1), e(1), work(1), info); /* @@ -1640,7 +1653,7 @@ static void bdsqr_single_template(char uplo, L50: sminoa = sminoa / sqrt(dble(n)); - thresh = max(tol * sminoa, maxitr * (n * (n * unfl))); + thresh = max(tol * sminoa, ((unfl * n) * n) * maxitr); } else { @@ -1648,7 +1661,7 @@ static void bdsqr_single_template(char uplo, * absolute accuracy desired */ - thresh = max(abs(tol) * smax, maxitr * (n * (n * unfl))); + thresh = max(abs(tol) * smax, ((unfl * n) * n) * maxitr); } /* * prepare for main iteration loop for the singular values @@ -2780,7 +2793,12 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, bool const is_device_V_arg = is_device_pointer((void*)V_arg); if(is_device_V_arg) { - bool constexpr need_copy_W1 = !std::is_same::value; + // ------------------------------------------------------------ + // note "T *" and "T * const" may be considered different types + // ------------------------------------------------------------ + bool constexpr is_array_of_device_pointers + = !(std::is_same::value || std::is_same::value); + bool constexpr need_copy_W1 = is_array_of_device_pointers; if constexpr(need_copy_W1) { size_t const nbytes = sizeof(T*) * batch_count; @@ -2798,7 +2816,9 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, bool const is_device_U_arg = is_device_pointer((void*)U_arg); if(is_device_U_arg) { - bool constexpr need_copy_W2 = !std::is_same::value; + bool constexpr is_array_of_device_pointers + = !(std::is_same::value || std::is_same::value); + bool constexpr 
need_copy_W2 = is_array_of_device_pointers; if constexpr(need_copy_W2) { size_t const nbytes = sizeof(T*) * batch_count; @@ -2816,7 +2836,9 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, bool const is_device_C_arg = is_device_pointer((void*)C_arg); if(is_device_C_arg) { - bool constexpr need_copy_W3 = !std::is_same::value; + bool constexpr is_array_of_device_pointers + = !(std::is_same::value || std::is_same::value); + bool constexpr need_copy_W3 = is_array_of_device_pointers; if constexpr(need_copy_W3) { size_t const nbytes = sizeof(T*) * batch_count; From 67dabb17b1ee29a33f5ed2feab8ab526bab7d1a3 Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Tue, 25 Jun 2024 21:00:20 -0400 Subject: [PATCH 16/35] option to turn off lapack bdsqr --- library/src/auxiliary/rocauxiliary_bdsqr_host.hpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp index 7438bfe9d..81fa0fd47 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp @@ -2953,7 +2953,9 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, I nrv = n; I ncvt = nv; bool const values_only = (ncvt == 0) && (nru == 0) && (ncc == 0); - if(values_only) + bool const use_lapack_bdsqr = false; + + if((use_lapack_bdsqr) && (values_only)) { // -------------------------------- // call the lapack version of bdsqr @@ -2990,6 +2992,7 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, { // ---------------------------- // explicitly zero out "E" array + // to be compatible with rocsolver bdsqr // ---------------------------- S const zero = S(0); for(I i = 0; i < (n - 1); i++) From 764eeb031bcfa33d17bc29ecb6e9347c1b6bc897 Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Thu, 27 Jun 2024 11:35:54 -0400 Subject: [PATCH 17/35] use c++ version of lamch --- .../src/auxiliary/rocauxiliary_bdsqr_host.hpp | 44 ++++++++++++++----- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp index 81fa0fd47..3b2d19469 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp @@ -52,16 +52,20 @@ ROCSOLVER_BEGIN_NAMESPACE } #endif +#ifndef LASR_MAX_NTHREADS +#define LASR_MAX_NTHREADS 64 +#endif + template -__global__ static void lasr_kernel(char const side, - char const pivot, - char const direct, - I const m, - I const n, - S const* const c_, - S const* const s_, - T* const A_, - I const lda) +__global__ static void __launch_bounds__(LASR_MAX_NTHREADS) lasr_kernel(char const side, + char const pivot, + char const direct, + I const m, + I const n, + S const* const c_, + S const* const s_, + T* const A_, + I const lda) { const auto nblocks = hipGridDim_x; const auto nthreads_per_block = hipBlockDim_x; @@ -530,7 +534,7 @@ static void lasr_template_gpu(char const side, I const lda, hipStream_t stream = 0) { - auto const nthreads = warpSize; + auto const nthreads = LASR_MAX_NTHREADS; bool const is_left_side = (side == 'L') || (side == 'l'); auto const mn = (is_left_side) ? 
n : m; @@ -967,6 +971,7 @@ static void call_bdsqr(char& uplo, dbdsqr_(&uplo, &n, &ncvt, &nru, &ncc, &d, &e, &vt, &ldvt, &u, &ldu, &c, &ldc, &rwork, &info); } +#ifdef USE_LAPACK static void call_lamch(char& cmach_arg, double& eps) { char cmach = cmach_arg; @@ -978,6 +983,23 @@ static void call_lamch(char& cmach_arg, float& eps) char cmach = cmach_arg; eps = slamch_(&cmach); } +#else + +static void call_lamch(char& cmach, double& eps) +{ + eps = ((cmach == 'E') || (cmach == 'e')) ? std::numeric_limits::epsilon() + : ((cmach == 'S') || (cmach == 's')) ? std::numeric_limits::min() + : std::numeric_limits::min(); +} + +static void call_lamch(char& cmach, float& eps) +{ + eps = ((cmach == 'E') || (cmach == 'e')) ? std::numeric_limits::epsilon() + : ((cmach == 'S') || (cmach == 's')) ? std::numeric_limits::min() + : std::numeric_limits::min(); +} + +#endif static void call_swap(int& n, rocblas_complex_num& zx, @@ -3081,3 +3103,5 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, } ROCSOLVER_END_NAMESPACE +#undef LASR_MAX_NTHREADS +#undef CHECK_HIP From 369c15d2f05d3e2285cbf8b8390fe2da1f64e0c1 Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Thu, 27 Jun 2024 15:30:09 -0400 Subject: [PATCH 18/35] c++ version of las2 --- .../src/auxiliary/rocauxiliary_bdsqr_host.hpp | 340 ++++++++++++++++++ 1 file changed, 340 insertions(+) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp index 3b2d19469..db62622db 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp @@ -1039,6 +1039,7 @@ static void call_swap(int& n, double& zx, int& incx, double& zy, int& incy) dswap_(&n, &zx, &incx, &zy, &incy); } +#ifdef USE_LAPACK static void call_las2(double& f, double& g, double& h, double& ssmin, double& ssmax) { dlas2_(&f, &g, &h, &ssmin, &ssmax); @@ -1048,6 +1049,80 @@ static void call_las2(float& f, float& g, float& h, float& ssmin, float& ssmax) { slas2_(&f, &g, &h, &ssmin, &ssmax); } +#else + +template +static void call_las2(T& f, T& g, T& h, T& ssmin, T& ssmax) +{ + T const zero = 0; + T const one = 1; + T const two = 2; + + T as, at, au, c, fa, fhmn, fhmx, ga, ha; + + auto abs = [](auto x) { return (std::abs(x)); }; + auto min = [](auto x, auto y) { return ((x < y) ? x : y); }; + auto max = [](auto x, auto y) { return ((x > y) ? 
x : y); }; + auto sqrt = [](auto x) { return (std::sqrt(x)); }; + auto square = [](auto x) { return (x * x); }; + + fa = abs(f); + ga = abs(g); + ha = abs(h); + fhmn = min(fa, ha); + fhmx = max(fa, ha); + if(fhmn == zero) + { + ssmin = zero; + if(fhmx == zero) + { + ssmax = ga; + } + else + { + // ssmax = max( fhmx, ga )*sqrt( one+ ( min( fhmx, ga ) / max( fhmx, ga ) )**2 ); + ssmax = max(fhmx, ga) * sqrt(one + square(min(fhmx, ga) / max(fhmx, ga))); + } + } + else + { + if(ga < fhmx) + { + as = one + fhmn / fhmx; + at = (fhmx - fhmn) / fhmx; + au = square(ga / fhmx); + c = two / (sqrt(as * as + au) + sqrt(at * at + au)); + ssmin = fhmn * c; + ssmax = fhmx / c; + } + else + { + au = fhmx / ga; + if(au == zero) + { + // + // avoid possible harmful underflow if exponent range + // asymmetric (true ssmin may not underflow even if + // au underflows) + // + ssmin = (fhmn * fhmx) / ga; + ssmax = ga; + } + else + { + as = one + fhmn / fhmx; + at = (fhmx - fhmn) / fhmx; + // c = one / ( sqrt( one+( as*au )**2 )+ sqrt( one+( at*au )**2 ) ); + c = one / (sqrt(one + square(as * au)) + sqrt(one + square(at * au))); + ssmin = (fhmn * c) * au; + ssmin = ssmin + ssmin; + ssmax = ga / (c + c); + } + } + } +} + +#endif static void call_lartg(double& f, double& g, double& c, double& s, double& r) { @@ -1077,6 +1152,7 @@ static void call_lartg(std::complex& f, zlartg_(&f, &g, &c, &s, &r); } +#ifdef USE_LAPACK static void call_scal(int& n, rocblas_complex_num& da, rocblas_complex_num& zx, int& incx) { cscal_(&n, (std::complex*)&da, (std::complex*)&zx, &incx); @@ -1127,7 +1203,29 @@ static void call_scal(int& n, float& da, float& zx, int& incx) { sscal_(&n, &da, &zx, &incx); } +#else +template +static void call_scal(I& n, S& a, T& x_in, I& incx) +{ + bool const is_zero = (a == 0); + T* const x = &x_in; + for(I i = 0; i < n; i++) + { + auto const ip = i * incx; + if(is_zero) + { + x[ip] = 0; + } + else + { + x[ip] *= a; + } + }; +} +#endif + +#ifdef USE_LAPACK static void call_rot(int& n, std::complex& zx, int& incx, @@ -1181,7 +1279,28 @@ static void call_rot(int& n, float& dx, int& incx, float& dy, int& incy, float& { srot_(&n, &dx, &incx, &dy, &incy, &c, &s); } +#else + +template +static void call_rot(I& n, T& x_in, I& incx, T& y_in, I& incy, S& c, S& s) +{ + T* const x = &(x_in); + T* const y = &(y_in); + + for(I i = 0; i < n; i++) + { + auto const ix = i * incx; + auto const iy = i * incy; + + auto const temp = c * x[ix] + s * y[iy]; + y[iy] = c * y[iy] - s * x[ix]; + x[ix] = temp; + } +} + +#endif +#ifdef USE_LAPACK static void call_lasv2(double& f, double& g, double& h, @@ -1207,6 +1326,227 @@ static void call_lasv2(float& f, { slasv2_(&f, &g, &h, &ssmin, &ssmax, &snr, &csr, &snl, &csl); } +#else +// -------------------------------------------------------- +// lasv2 computes the singular value decomposition of a 2 x 2 +// triangular matrix +// [ F G ] +// [ 0 H ] +// +// on return, +// abs(ssmax) is the larger singular value, +// abs(ssmin) is the smaller singular value, +// (csl,snl) and (csr,snr) are the left and right +// singular vectors for abs(ssmax) +// +// [ csl snl] [ F G ] [ csr -snr] = [ ssmax 0 ] +// [-snl csl] [ 0 H ] [ snr csr] [ 0 ssmin ] +// -------------------------------------------------------- +template +static void call_lasv2(T& f, T& g, T& h, T& ssmin, T& ssmax, T& snr, T& csr, T& snl, T& csl) +{ + T const zero = 0; + T const one = 1; + T const two = 2; + T const four = 4; + T const half = one / two; + + bool gasmal; + bool swap; + int pmax; + char cmach; + + T a, clt, crt, d, fa, 
ft, ga, gt, ha, ht, l, m; + T mm, r, s, slt, srt, t, temp, tsign, tt; + T macheps; + + auto abs = [](auto x) { return (std::abs(x)); }; + auto sqrt = [](auto x) { return (std::sqrt(x)); }; + auto sign = [](auto a, auto b) { + auto const abs_a = std::abs(a); + return ((b >= 0) ? abs_a : -abs_a); + }; + + ft = f; + fa = abs(ft); + ht = h; + ha = abs(h); + // + // pmax points to the maximum absolute element of matrix + // pmax = 1 if f largest in absolute values + // pmax = 2 if g largest in absolute values + // pmax = 3 if h largest in absolute values + // + pmax = 1; + swap = (ha > fa); + if(swap) + { + pmax = 3; + temp = ft; + ft = ht; + ht = temp; + temp = fa; + fa = ha; + ha = temp; + // + // now fa >= ha + // + } + gt = g; + ga = abs(gt); + if(ga == zero) + { + // + // diagonal matrix + // + ssmin = ha; + ssmax = fa; + clt = one; + crt = one; + slt = zero; + srt = zero; + } + else + { + gasmal = true; + if(ga > fa) + { + pmax = 2; + + cmach = 'E'; + call_lamch(cmach, macheps); + + if((fa / ga) < macheps) + { + // + // case of very large ga + // + gasmal = false; + ssmax = ga; + if(ha > one) + { + ssmin = fa / (ga / ha); + } + else + { + ssmin = (fa / ga) * ha; + } + clt = one; + slt = ht / gt; + srt = one; + crt = ft / gt; + } + } + if(gasmal) + { + // + // normal case + // + d = fa - ha; + if(d == fa) + { + // + // copes with infinite f or h + // + l = one; + } + else + { + l = d / fa; + } + // + // note that 0 <= l <= 1 + // + m = gt / ft; + // + // note that abs(m) <= 1/macheps + // + t = two - l; + // + // note that t >= 1 + // + mm = m * m; + tt = t * t; + s = sqrt(tt + mm); + // + // note that 1 <= s <= 1 + 1/macheps + // + if(l == zero) + { + r = abs(m); + } + else + { + r = sqrt(l * l + mm); + } + // + // note that 0 <= r .le. 1 + 1/macheps + // + a = half * (s + r); + // + // note that 1 <= a .le. 
1 + abs(m) + // + ssmin = ha / a; + ssmax = fa * a; + if(mm == zero) + { + // + // note that m is very tiny + // + if(l == zero) + { + t = sign(two, ft) * sign(one, gt); + } + else + { + t = gt / sign(d, ft) + m / t; + } + } + else + { + t = (m / (s + t) + m / (r + l)) * (one + a); + } + l = sqrt(t * t + four); + crt = two / l; + srt = t / l; + clt = (crt + srt * m) / a; + slt = (ht / ft) * srt / a; + } + } + if(swap) + { + csl = srt; + snl = crt; + csr = slt; + snr = clt; + } + else + { + csl = clt; + snl = slt; + csr = crt; + snr = srt; + } + // + // correct signs of ssmax and ssmin + // + if(pmax == 1) + { + tsign = sign(one, csr) * sign(one, csl) * sign(one, f); + } + if(pmax == 2) + { + tsign = sign(one, snr) * sign(one, csl) * sign(one, g); + } + if(pmax == 3) + { + tsign = sign(one, snr) * sign(one, snl) * sign(one, h); + } + ssmax = sign(ssmax, tsign); + ssmin = sign(ssmin, tsign * sign(one, f) * sign(one, h)); +} + +#endif static void call_lasq1(int& n, double& D_, double& E_, double& rwork_, int& info_arg) { From a220550a8608b2fb0bdb244c61e2f437a57a25b9 Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Thu, 27 Jun 2024 15:37:16 -0400 Subject: [PATCH 19/35] c++ version of swap --- .../src/auxiliary/rocauxiliary_bdsqr_host.hpp | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp index db62622db..5f18fbc1e 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp @@ -1001,6 +1001,7 @@ static void call_lamch(char& cmach, float& eps) #endif +#ifdef USE_LAPACK static void call_swap(int& n, rocblas_complex_num& zx, int& incx, @@ -1038,6 +1039,25 @@ static void call_swap(int& n, double& zx, int& incx, double& zy, int& incy) { dswap_(&n, &zx, &incx, &zy, &incy); } +#else + +template +static void call_swap(I& n, T& x_in, I& incx, T& y_in, I& incy) +{ + T* const x = &(x_in); + T* const y = &(y_in); + for(I i = 0; i < n; i++) + { + I const ix = i * incx; + I const iy = i * incy; + + T const temp = x[ix]; + x[ix] = y[iy]; + y[iy] = temp; + } +} + +#endif #ifdef USE_LAPACK static void call_las2(double& f, double& g, double& h, double& ssmin, double& ssmax) From cc701a7fcf767415ff004720aa9799d5a2aa4713 Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Thu, 27 Jun 2024 21:52:53 -0400 Subject: [PATCH 20/35] c++ version of lartg --- .../src/auxiliary/rocauxiliary_bdsqr_host.hpp | 284 ++++++++++++++++++ 1 file changed, 284 insertions(+) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp index 5f18fbc1e..ab60e3789 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp @@ -989,6 +989,7 @@ static void call_lamch(char& cmach, double& eps) { eps = ((cmach == 'E') || (cmach == 'e')) ? std::numeric_limits::epsilon() : ((cmach == 'S') || (cmach == 's')) ? std::numeric_limits::min() + : ((cmach == 'B') || (cmach == 's')) ? FLT_RADIX : std::numeric_limits::min(); } @@ -996,6 +997,7 @@ static void call_lamch(char& cmach, float& eps) { eps = ((cmach == 'E') || (cmach == 'e')) ? std::numeric_limits::epsilon() : ((cmach == 'S') || (cmach == 's')) ? std::numeric_limits::min() + : ((cmach == 'B') || (cmach == 's')) ? 
FLT_RADIX : std::numeric_limits::min(); } @@ -1144,6 +1146,8 @@ static void call_las2(T& f, T& g, T& h, T& ssmin, T& ssmax) #endif +#ifdef USE_LAPACK + static void call_lartg(double& f, double& g, double& c, double& s, double& r) { dlartg_(&f, &g, &c, &s, &r); @@ -1171,6 +1175,286 @@ static void call_lartg(std::complex& f, { zlartg_(&f, &g, &c, &s, &r); } +#else + +static float real_part(float z) +{ + return (z); +}; +static float real_part(std::complex z) +{ + return (z.real()); +}; +static float real_part(rocblas_complex_num z) +{ + return (z.real()); +}; + +static double real_part(double z) +{ + return (z); +}; +static double real_part(std::complex z) +{ + return (z.real()); +}; +static double real_part(rocblas_complex_num z) +{ + return (z.real()); +}; + +static float imag_part(float z) +{ + return (0); +}; +static float imag_part(std::complex z) +{ + return (z.imag()); +}; +static float imag_part(rocblas_complex_num z) +{ + return (z.imag()); +}; + +static double imag_part(double z) +{ + return (0); +}; +static double imag_part(std::complex z) +{ + return (z.imag()); +}; +static double imag_part(rocblas_complex_num z) +{ + return (z.imag()); +}; + +template +static void call_lartg(T& f, T& g, S& cs, T& sn, T& r) +{ + // ------------------------------------------------------ + // lartg generates a plane rotation so that + // [ cs sn ] * [ f ] = [ r ] + // [ -sn cs ] [ g ] [ 0 ] + // + // where cs * cs + abs(sn)*abs(sn) == 1 + // if g == 0, then cs == 1, sn == 0 + // if f == 0, then cs = 0, sn is chosen so that r is real + // ------------------------------------------------------ + + auto Not = [](bool x) { return (!x); }; + auto abs = [](auto x) { return (std::abs(x)); }; + auto dble = [](auto z) { return (static_cast(real_part(z))); }; + auto dimag = [](auto z) { return (static_cast(imag_part(z))); }; + auto log = [](auto x) { return (std::log(x)); }; + auto sqrt = [](auto x) { return (std::sqrt(x)); }; + auto max = [](auto x, auto y) { return ((x > y) ? 
x : y); }; + auto disnan = [](auto x) -> bool { return (isnan(x)); }; + auto dcmplx = [](auto x, auto y) -> T { + bool constexpr is_complex_type + = !(std::is_same::value || std::is_same::value); + + if constexpr(is_complex_type) + { + return (T(x, y)); + } + else + { + return (T(x)); + }; + }; + auto dconjg = [&](auto z) { return (dcmplx(dble(z), -dimag(z))); }; + + auto square = [](auto x) { return (x * x); }; + + auto abs1 = [&](auto ff) { return (max(abs(dble(ff)), abs(dimag(ff)))); }; + auto abssq = [&](auto ff) { return (square(dble(ff)) + square(dimag(ff))); }; + + // ----------------------------------------- + // compute sqrt( x * x + y * y ) + // without unnecessary overflow or underflow + // ----------------------------------------- + auto dlapy2 = [&](auto x, auto y) { + auto const one = 1; + auto const zero = 0; + + auto ddlapy2 = x; + bool const x_is_nan = disnan(x); + bool const y_is_nan = disnan(y); + if(x_is_nan) + ddlapy2 = x; + if(y_is_nan) + ddlapy2 = y; + + if(Not(x_is_nan || y_is_nan)) + { + auto const xabs = abs(x); + auto const yabs = abs(y); + auto const w = max(xabs, yabs); + auto const z = min(xabs, yabs); + if(z == zero) + { + ddlapy2 = w; + } + else + { + ddlapy2 = w * sqrt(one + square(z / w)); + } + } + return (ddlapy2); + }; + + char cmach = 'E'; + S const zero = 0; + S const one = 1; + S const two = 2; + T const czero = 0; + + bool has_work; + bool first; + int count, i; + S d, di, dr, eps, f2, f2s, g2, g2s, safmin; + S safmn2, safmx2, scale; + T ff, fs, gs; + + // safmin = dlamch( 's' ) + cmach = 'S'; + call_lamch(cmach, safmin); + + // eps = dlamch( 'e' ) + cmach = 'E'; + call_lamch(cmach, eps); + + // safmn2 = dlamch( 'b' )**int( log( safmin / eps ) / log( dlamch( 'b' ) ) / two ) + cmach = 'B'; + S radix = 2; + call_lamch(cmach, radix); + + int const npow = (log(safmin / eps) / log(radix) / two); + safmn2 = std::pow(radix, npow); + safmx2 = one / safmn2; + scale = max(abs1(f), abs1(g)); + fs = f; + gs = g; + count = 0; + + if(scale >= safmx2) + { + L10: + do + { + count = count + 1; + fs = fs * safmn2; + gs = gs * safmn2; + scale = scale * safmn2; + // if( (scale >= safmx2) && (count < 20) ) go to L10 + has_work = ((scale >= safmx2) && (count < 20)); + } while(has_work); + } + else + { + if(scale <= safmn2) + { + if((g == czero) || disnan(abs(g))) + { + cs = one; + sn = czero; + r = f; + return; + } + L20: + do + { + count = count - 1; + fs = fs * safmx2; + gs = gs * safmx2; + scale = scale * safmx2; + // if( scale <= safmn2 ) goto L20; + has_work = (scale <= safmn2); + } while(has_work); + } + f2 = abssq(fs); + g2 = abssq(gs); + if(f2 <= max(g2, one) * safmin) + { + // + // this is a rare case: f is very small. + // + if(f == czero) + { + cs = zero; + r = dlapy2(dble(g), dimag(g)); + // do complex/real division explicitly with two real divisions + d = dlapy2(dble(gs), dimag(gs)); + sn = dcmplx(dble(gs) / d, -dimag(gs) / d); + return; + } + f2s = dlapy2(dble(fs), dimag(fs)); + // g2 and g2s are accurate + // g2 is at least safmin, and g2s is at least safmn2 + g2s = sqrt(g2); + // error in cs from underflow in f2s is at most + // unfl / safmn2 < sqrt(unfl*eps) .lt. 
eps + // if max(g2,one)=g2, then f2 < g2*safmin, + // and so cs < sqrt(safmin) + // if max(g2,one)=one, then f2 < safmin + // and so cs < sqrt(safmin)/safmn2 = sqrt(eps) + // therefore, cs = f2s/g2s / sqrt( 1 + (f2s/g2s)**2 ) = f2s/g2s + cs = f2s / g2s; + // make sure abs(ff) = 1 + // do complex/real division explicitly with 2 real divisions + if(abs1(f) > one) + { + d = dlapy2(dble(f), dimag(f)); + ff = dcmplx(dble(f) / d, dimag(f) / d); + } + else + { + dr = safmx2 * dble(f); + di = safmx2 * dimag(f); + d = dlapy2(dr, di); + ff = dcmplx(dr / d, di / d); + } + sn = ff * dcmplx(dble(gs) / g2s, -dimag(gs) / g2s); + r = cs * f + sn * g; + } + else + { + // + // this is the most common case. + // neither f2 nor f2/g2 are less than safmin + // f2s cannot overflow, and it is accurate + // + f2s = sqrt(one + g2 / f2); + // do the f2s(real)*fs(complex) multiply with two real multiplies + r = dcmplx(f2s * dble(fs), f2s * dimag(fs)); + cs = one / f2s; + d = f2 + g2; + // do complex/real division explicitly with two real divisions + sn = dcmplx(dble(r) / d, dimag(r) / d); + sn = sn * dconjg(gs); + if(count != 0) + { + if(count > 0) + { + for(i = 1; i <= count; i++) + { + r = r * safmx2; + }; + } + else + { + for(i = 1; i <= -count; i++) + { + r = r * safmn2; + } + } + } + } + } +} + +#endif #ifdef USE_LAPACK static void call_scal(int& n, rocblas_complex_num& da, rocblas_complex_num& zx, int& incx) From 9fb6ceb3fba36931318d3a48d7e1d8dbe0ef42c2 Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Thu, 27 Jun 2024 23:46:28 -0400 Subject: [PATCH 21/35] c++ version of lasr --- .../src/auxiliary/rocauxiliary_bdsqr_host.hpp | 60 +++++++++++++------ 1 file changed, 43 insertions(+), 17 deletions(-) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp index ab60e3789..5244534f4 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp @@ -57,24 +57,18 @@ ROCSOLVER_BEGIN_NAMESPACE #endif template -__global__ static void __launch_bounds__(LASR_MAX_NTHREADS) lasr_kernel(char const side, - char const pivot, - char const direct, - I const m, - I const n, - S const* const c_, - S const* const s_, - T* const A_, - I const lda) +__host__ __device__ static void lasr_body(char const side, + char const pivot, + char const direct, + I const m, + I const n, + S const* const c_, + S const* const s_, + T* const A_, + I const lda, + I const tid, + I const i_inc) { - const auto nblocks = hipGridDim_x; - const auto nthreads_per_block = hipBlockDim_x; - const auto nthreads = nblocks * nthreads_per_block; - const auto tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; - const auto i_inc = nthreads; - const auto ij_nb = nthreads; - const auto ij_start = tid; - auto max = [](auto x, auto y) { return ((x > y) ? x : y); }; auto min = [](auto x, auto y) { return ((x < y) ? 
x : y); }; @@ -522,6 +516,26 @@ __global__ static void __launch_bounds__(LASR_MAX_NTHREADS) lasr_kernel(char con return; } +template +__global__ static void __launch_bounds__(LASR_MAX_NTHREADS) lasr_kernel(char const side, + char const pivot, + char const direct, + I const m, + I const n, + S const* const c_, + S const* const s_, + T* const A_, + I const lda) +{ + const auto nblocks = hipGridDim_x; + const auto nthreads_per_block = hipBlockDim_x; + const auto nthreads = nblocks * nthreads_per_block; + I const tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + I const i_inc = nthreads; + + lasr_body(side, pivot, direct, m, n, c_, s_, A_, lda, tid, i_inc); +} + template static void lasr_template_gpu(char const side, char const pivot, @@ -1862,6 +1876,7 @@ static void call_lasq1(int& n, float& D_, float& E_, float& rwork_, int& info_ar slasq1_(&n, &D_, &E_, &rwork_, &info_arg); }; +#ifdef USE_LAPACK static void call_lasr(char& side, char& pivot, char& direct, @@ -1932,6 +1947,17 @@ static void call_lasr(char& side, { dlasr_(&side, &pivot, &direct, &m, &n, &c, &s, &A, &lda); }; +#else +template +static void call_lasr(char& side, char& pivot, char& direct, I& m, I& n, S& c, S& s, T& A, I& lda) +{ + I const tid = 0; + I const i_inc = 1; + + lasr_body(side, pivot, direct, m, n, &c, &s, &A, lda, tid, i_inc); +}; + +#endif template static void bdsqr_single_template(char uplo, From c9a802de1258d9165673926b20f593db23c951f4 Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Fri, 28 Jun 2024 16:20:28 -0400 Subject: [PATCH 22/35] add restrict hint to lasr --- library/src/auxiliary/rocauxiliary_bdsqr_host.hpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp index 5244534f4..f94deb344 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp @@ -62,9 +62,9 @@ __host__ __device__ static void lasr_body(char const side, char const direct, I const m, I const n, - S const* const c_, - S const* const s_, - T* const A_, + S const* const __restrict__ c_, + S const* const __restrict__ s_, + T* const __restrict__ A_, I const lda, I const tid, I const i_inc) @@ -77,17 +77,16 @@ __host__ __device__ static void lasr_body(char const side, assert((1 <= lda)); assert((1 <= j)); - // return ((i - 1) + (j - 1) * int64_t(lda)); return (i + j * lda - (1 + lda)); }; auto indx1f = [](auto i) -> int64_t { assert((1 <= i)); - return (i - int64_t(1)); + return (i - (1)); }; - auto c = [&](auto i) -> const S& { return (c_[indx1f(i)]); }; - auto s = [&](auto i) -> const S& { return (s_[indx1f(i)]); }; + auto c = [&](auto i) -> const S { return (c_[(i)-1]); }; + auto s = [&](auto i) -> const S { return (s_[(i)-1]); }; auto A = [&](auto i, auto j) -> T& { return (A_[indx2f(i, j, lda)]); }; const S one = 1; From 7ce4cc51dce8278aa85e35a9f861335a19d2ec4c Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Fri, 28 Jun 2024 23:15:08 -0400 Subject: [PATCH 23/35] minor change --- .../src/auxiliary/rocauxiliary_bdsqr_host.hpp | 68 ++++++++++++++----- 1 file changed, 52 insertions(+), 16 deletions(-) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp index f94deb344..3bf801519 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp @@ -92,6 +92,8 @@ __host__ __device__ static void lasr_body(char const side, const S 
one = 1; const S zero = 0; + constexpr bool use_reorder = true; + // ---------------- // check arguments // ---------------- @@ -141,21 +143,38 @@ __host__ __device__ static void lasr_body(char const side, // P = P(z-1) * ... * P(2) * P(1) // ----------------------------- { - for(I j = 1; j <= (m - 1); j++) + if constexpr(use_reorder) { - const auto ctemp = c(j); - const auto stemp = s(j); - if((ctemp != one) || (stemp != zero)) + for(I i = 1 + tid; i <= n; i += i_inc) { - for(I i = 1 + tid; i <= n; i += i_inc) + for(I j = 1; j <= (m - 1); j++) { + const auto ctemp = c(j); + const auto stemp = s(j); const auto temp = A(j + 1, i); A(j + 1, i) = ctemp * temp - stemp * A(j, i); A(j, i) = stemp * temp + ctemp * A(j, i); } - }; - }; - }; + } + } + else + { + for(I j = 1; j <= (m - 1); j++) + { + const auto ctemp = c(j); + const auto stemp = s(j); + if((ctemp != one) || (stemp != zero)) + { + for(I i = 1 + tid; i <= n; i += i_inc) + { + const auto temp = A(j + 1, i); + A(j + 1, i) = ctemp * temp - stemp * A(j, i); + A(j, i) = stemp * temp + ctemp * A(j, i); + } + } + } + } + } return; }; @@ -173,20 +192,37 @@ __host__ __device__ static void lasr_body(char const side, auto const istart = 1; auto const iend = n; - for(I j = jend; j >= jstart; j--) + if constexpr(use_reorder) { - const auto ctemp = c(j); - const auto stemp = s(j); - if((ctemp != one) || (stemp != zero)) + for(I i = istart + tid; i <= iend; i += i_inc) { - for(I i = istart + tid; i <= iend; i += i_inc) + for(I j = jend; j >= jstart; j--) { + const auto ctemp = c(j); + const auto stemp = s(j); const auto temp = A(j + 1, i); A(j + 1, i) = ctemp * temp - stemp * A(j, i); A(j, i) = stemp * temp + ctemp * A(j, i); - }; - }; - }; + } + } + } + else + { + for(I j = jend; j >= jstart; j--) + { + const auto ctemp = c(j); + const auto stemp = s(j); + if((ctemp != one) || (stemp != zero)) + { + for(I i = istart + tid; i <= iend; i += i_inc) + { + const auto temp = A(j + 1, i); + A(j + 1, i) = ctemp * temp - stemp * A(j, i); + A(j, i) = stemp * temp + ctemp * A(j, i); + } + } + } + } return; }; From d66f64110ebf7dfb6b2aa949f3a9b158175868b8 Mon Sep 17 00:00:00 2001 From: Troy Alderson <58866654+tfalders@users.noreply.github.com> Date: Tue, 5 Nov 2024 09:11:46 -0700 Subject: [PATCH 24/35] Added algorithm selection infrastructure (#12) * Remove LAPACK calls * Added basic hybrid algorithm infrastructure * Allow selection of algorithm mode per function * Fix test failure * Added alg_mode argument to benchmark client * Added rocsolver_function_gesvd * Added basic checksum to rocsolver_handle_data * Updated documentation * Updated changelog * Renamed bdsqr_host to bdsqr_hybrid * Bump minimum rocBLAS version --- CHANGELOG.md | 9 + CMakeLists.txt | 2 +- clients/benchmarks/client.cpp | 7 + clients/common/auxiliary/testing_bdsqr.hpp | 5 + clients/common/lapack/testing_gesvd.hpp | 5 + clients/common/misc/rocsolver_arguments.hpp | 2 + clients/gtest/auxiliary/bdsqr_gtest.cpp | 40 +- clients/gtest/lapack/gesvd_gtest.cpp | 83 ++- docs/index.rst | 4 +- docs/reference/{logging.rst => helpers.rst} | 61 +- docs/reference/index.rst | 8 +- docs/reference/types.rst | 8 + library/include/rocsolver/rocsolver-aliases.h | 4 +- .../include/rocsolver/rocsolver-extra-types.h | 19 +- .../include/rocsolver/rocsolver-functions.h | 67 +- library/src/CMakeLists.txt | 1 + library/src/auxiliary/rocauxiliary_bdsqr.hpp | 127 ++-- ...host.hpp => rocauxiliary_bdsqr_hybrid.hpp} | 672 +----------------- library/src/common/rocsolver_handle.cpp | 132 ++++ 
library/src/include/rocsolver_handle.hpp | 44 ++ 20 files changed, 531 insertions(+), 769 deletions(-) rename docs/reference/{logging.rst => helpers.rst} (79%) rename library/src/auxiliary/{rocauxiliary_bdsqr_host.hpp => rocauxiliary_bdsqr_hybrid.hpp} (83%) create mode 100644 library/src/common/rocsolver_handle.cpp create mode 100644 library/src/include/rocsolver_handle.hpp diff --git a/CHANGELOG.md b/CHANGELOG.md index 013478027..117cc3e8f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,16 @@ Full documentation for rocSOLVER is available at the [rocSOLVER documentation](https://rocm.docs.amd.com/projects/rocSOLVER/en/latest/index.html). ## (Unreleased) rocSOLVER + ### Added + +* Algorithm selection APIs for hybrid computation +* SVD of bidiagonal matrices routine: + - BDSQR now supports hybrid computation +* SVD of general matrices routine: + - GESVD now supports hybrid computation + +### Optimized ### Changed ### Removed ### Optimized diff --git a/CMakeLists.txt b/CMakeLists.txt index 0c4ff627e..9c436d304 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -197,7 +197,7 @@ find_package(hip REQUIRED CONFIG PATHS ${ROCM_PATH} /opt/rocm) find_package(rocblas REQUIRED CONFIG PATHS ${ROCM_PATH}) get_imported_target_location(location roc::rocblas) message(STATUS "Found rocBLAS: ${location}") -set(rocblas_minimum 4.1) +set(rocblas_minimum 4.4) rocm_package_add_dependencies(SHARED_DEPENDS "rocblas >= ${rocblas_minimum}") rocm_package_add_rpm_dependencies(STATIC_DEPENDS "rocblas-static-devel >= ${rocblas_minimum}") rocm_package_add_deb_dependencies(STATIC_DEPENDS "rocblas-static-dev >= ${rocblas_minimum}") diff --git a/clients/benchmarks/client.cpp b/clients/benchmarks/client.cpp index d9e28a9c2..981b86017 100644 --- a/clients/benchmarks/client.cpp +++ b/clients/benchmarks/client.cpp @@ -125,6 +125,13 @@ try " Reported time will be the average.\n" " ") + ("alg_mode", + value(&argus.alg_mode)->default_value(0), + "Select different algorithm mode? 0 = GPU-only, 1 = Hybrid\n" + " This will change how the algorithm operates.\n" + " Only applicable to functions with hybrid support." + " ") + ("mem_query", value(&argus.mem_query)->default_value(0), "Calculate the required amount of device workspace memory? 
0 = No, 1 = Yes.\n" diff --git a/clients/common/auxiliary/testing_bdsqr.hpp b/clients/common/auxiliary/testing_bdsqr.hpp index 2f2a984f4..5946bb93a 100644 --- a/clients/common/auxiliary/testing_bdsqr.hpp +++ b/clients/common/auxiliary/testing_bdsqr.hpp @@ -478,6 +478,11 @@ void testing_bdsqr(Arguments& argus) rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int hot_calls = argus.iters; + if(argus.alg_mode) + EXPECT_ROCBLAS_STATUS( + rocsolver_set_alg_mode(handle, rocsolver_function_bdsqr, rocsolver_alg_mode_hybrid), + rocblas_status_success); + // check non-supported values if(uplo != rocblas_fill_upper && uplo != rocblas_fill_lower) { diff --git a/clients/common/lapack/testing_gesvd.hpp b/clients/common/lapack/testing_gesvd.hpp index 2fbf49dcd..8f5745826 100644 --- a/clients/common/lapack/testing_gesvd.hpp +++ b/clients/common/lapack/testing_gesvd.hpp @@ -548,6 +548,11 @@ void testing_gesvd(Arguments& argus) rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; + if(argus.alg_mode) + EXPECT_ROCBLAS_STATUS( + rocsolver_set_alg_mode(handle, rocsolver_function_gesvd, rocsolver_alg_mode_hybrid), + rocblas_status_success); + // check non-supported values if(rightv == rocblas_svect_overwrite && leftv == rocblas_svect_overwrite) { diff --git a/clients/common/misc/rocsolver_arguments.hpp b/clients/common/misc/rocsolver_arguments.hpp index 34006f51a..bca8535b6 100644 --- a/clients/common/misc/rocsolver_arguments.hpp +++ b/clients/common/misc/rocsolver_arguments.hpp @@ -33,6 +33,7 @@ #include #include #include +#include #include "program_options.hpp" @@ -54,6 +55,7 @@ class Arguments : private std::map rocblas_int perf = 0; rocblas_int singular = 0; rocblas_int iters = 5; + rocblas_int alg_mode = 0; rocblas_int mem_query = 0; rocblas_int profile = 0; rocblas_int profile_kernels = 0; diff --git a/clients/gtest/auxiliary/bdsqr_gtest.cpp b/clients/gtest/auxiliary/bdsqr_gtest.cpp index b963f6029..4e78b0bea 100644 --- a/clients/gtest/auxiliary/bdsqr_gtest.cpp +++ b/clients/gtest/auxiliary/bdsqr_gtest.cpp @@ -112,7 +112,8 @@ Arguments bdsqr_setup_arguments(bdsqr_tuple tup) return arg; } -class BDSQR : public ::TestWithParam +template +class BDSQR_BASE : public ::TestWithParam { protected: void TearDown() override @@ -124,6 +125,7 @@ class BDSQR : public ::TestWithParam void run_tests() { Arguments arg = bdsqr_setup_arguments(GetParam()); + arg.alg_mode = MODE; if(arg.peek("n") == 0 && arg.peek("uplo") == 'L') testing_bdsqr_bad_arg(); @@ -132,6 +134,14 @@ class BDSQR : public ::TestWithParam } }; +class BDSQR : public BDSQR_BASE<0> +{ +}; + +class BDSQR_HYBRID : public BDSQR_BASE<1> +{ +}; + // non-batch tests TEST_P(BDSQR, __float) @@ -154,8 +164,36 @@ TEST_P(BDSQR, __double_complex) run_tests(); } +TEST_P(BDSQR_HYBRID, __float) +{ + run_tests(); +} + +TEST_P(BDSQR_HYBRID, __double) +{ + run_tests(); +} + +TEST_P(BDSQR_HYBRID, __float_complex) +{ + run_tests(); +} + +TEST_P(BDSQR_HYBRID, __double_complex) +{ + run_tests(); +} + INSTANTIATE_TEST_SUITE_P(daily_lapack, BDSQR, Combine(ValuesIn(large_size_range), ValuesIn(large_opt_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, BDSQR, Combine(ValuesIn(size_range), ValuesIn(opt_range))); + +INSTANTIATE_TEST_SUITE_P(daily_lapack, + BDSQR_HYBRID, + Combine(ValuesIn(large_size_range), ValuesIn(large_opt_range))); + +INSTANTIATE_TEST_SUITE_P(checkin_lapack, + BDSQR_HYBRID, + Combine(ValuesIn(size_range), ValuesIn(opt_range))); diff --git a/clients/gtest/lapack/gesvd_gtest.cpp b/clients/gtest/lapack/gesvd_gtest.cpp index 
51acceda3..3ee770c1d 100644 --- a/clients/gtest/lapack/gesvd_gtest.cpp +++ b/clients/gtest/lapack/gesvd_gtest.cpp @@ -158,7 +158,8 @@ Arguments gesvd_setup_arguments(gesvd_tuple tup) return arg; } -class GESVD : public ::TestWithParam +template +class GESVD_BASE : public ::TestWithParam { protected: void TearDown() override @@ -170,6 +171,7 @@ class GESVD : public ::TestWithParam void run_tests() { Arguments arg = gesvd_setup_arguments(GetParam()); + arg.alg_mode = MODE; if(arg.peek("m") == 0 && arg.peek("n") == 0 && arg.peek("left_svect") == 'N' && arg.peek("right_svect") == 'N') @@ -180,6 +182,14 @@ class GESVD : public ::TestWithParam } }; +class GESVD : public GESVD_BASE<0> +{ +}; + +class GESVD_HYBRID : public GESVD_BASE<1> +{ +}; + // non-batch tests TEST_P(GESVD, __float) @@ -202,6 +212,26 @@ TEST_P(GESVD, __double_complex) run_tests(); } +TEST_P(GESVD_HYBRID, __float) +{ + run_tests(); +} + +TEST_P(GESVD_HYBRID, __double) +{ + run_tests(); +} + +TEST_P(GESVD_HYBRID, __float_complex) +{ + run_tests(); +} + +TEST_P(GESVD_HYBRID, __double_complex) +{ + run_tests(); +} + // batched tests TEST_P(GESVD, batched__float) @@ -224,6 +254,26 @@ TEST_P(GESVD, batched__double_complex) run_tests(); } +TEST_P(GESVD_HYBRID, batched__float) +{ + run_tests(); +} + +TEST_P(GESVD_HYBRID, batched__double) +{ + run_tests(); +} + +TEST_P(GESVD_HYBRID, batched__float_complex) +{ + run_tests(); +} + +TEST_P(GESVD_HYBRID, batched__double_complex) +{ + run_tests(); +} + // strided_batched tests TEST_P(GESVD, strided_batched__float) @@ -246,11 +296,36 @@ TEST_P(GESVD, strided_batched__double_complex) run_tests(); } -// daily_lapack tests normal execution with medium to large sizes +TEST_P(GESVD_HYBRID, strided_batched__float) +{ + run_tests(); +} + +TEST_P(GESVD_HYBRID, strided_batched__double) +{ + run_tests(); +} + +TEST_P(GESVD_HYBRID, strided_batched__float_complex) +{ + run_tests(); +} + +TEST_P(GESVD_HYBRID, strided_batched__double_complex) +{ + run_tests(); +} + INSTANTIATE_TEST_SUITE_P(daily_lapack, GESVD, Combine(ValuesIn(large_size_range), ValuesIn(large_opt_range))); -// checkin_lapack tests normal execution with small sizes, invalid sizes, -// quick returns, and corner cases INSTANTIATE_TEST_SUITE_P(checkin_lapack, GESVD, Combine(ValuesIn(size_range), ValuesIn(opt_range))); + +INSTANTIATE_TEST_SUITE_P(daily_lapack, + GESVD_HYBRID, + Combine(ValuesIn(large_size_range), ValuesIn(large_opt_range))); + +INSTANTIATE_TEST_SUITE_P(checkin_lapack, + GESVD_HYBRID, + Combine(ValuesIn(size_range), ValuesIn(opt_range))); diff --git a/docs/index.rst b/docs/index.rst index d7725421c..e1c39a266 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -8,7 +8,7 @@ rocSOLVER documentation ******************************************************************** -rocSOLVER is an implementation of `LAPACK routines `_ +rocSOLVER is an implementation of `LAPACK routines `_ on top of the :doc:`AMD ROCm platform `. rocSOLVER is implemented in the :doc:`HIP programming language ` and optimized for AMD GPUs. 
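The alg_mode option added to the benchmark client and to the BDSQR/GESVD test fixtures above maps onto the algorithm-selection API introduced later in this patch. A minimal sketch of how a client could translate the numeric argument into the library enum, using only names declared in this series; the helper function itself is illustrative and not part of the patch:

    // Illustrative helper (not part of the patch): map the clients' numeric
    // alg_mode argument (0 = GPU-only, 1 = hybrid) onto rocsolver_alg_mode
    // for a given function before running the solver.
    static rocblas_status set_mode_from_arg(rocblas_handle handle,
                                            rocsolver_function func,
                                            rocblas_int alg_mode)
    {
        rocsolver_alg_mode mode
            = (alg_mode != 0) ? rocsolver_alg_mode_hybrid : rocsolver_alg_mode_gpu;
        return rocsolver_set_alg_mode(handle, func, mode);
    }
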
@@ -39,7 +39,7 @@ The rocSOLVER documentation is structured as follows: * :ref:`lapackfunc` * :ref:`lapack-like` * :ref:`refactor` - * :ref:`api_logging` + * :ref:`helpers` * :ref:`tuning_label` * :ref:`deprecated` diff --git a/docs/reference/logging.rst b/docs/reference/helpers.rst similarity index 79% rename from docs/reference/logging.rst rename to docs/reference/helpers.rst index 26c02e42f..b643b6807 100644 --- a/docs/reference/logging.rst +++ b/docs/reference/helpers.rst @@ -2,12 +2,50 @@ :description: rocSOLVER documentation and API reference library :keywords: rocSOLVER, ROCm, API, documentation -.. _api_logging: +.. _helpers: ***************************************************** -rocSOLVER Logging Functions and Library Information +rocSOLVER Library and Logging Functions ***************************************************** + + +Library information +=============================== + +.. contents:: List of library information functions + :local: + :backlinks: top + +rocsolver_get_version_string() +------------------------------------ +.. doxygenfunction:: rocsolver_get_version_string + +rocsolver_get_version_string_size() +------------------------------------ +.. doxygenfunction:: rocsolver_get_version_string_size + + + +Algorithm selection +=============================== + +.. contents:: List of algorithm selection functions + :local: + :backlinks: top + +rocsolver_set_alg_mode() +------------------------------------ +.. doxygenfunction:: rocsolver_set_alg_mode + +rocsolver_get_alg_mode() +------------------------------------ +.. doxygenfunction:: rocsolver_get_alg_mode + + + +.. _api_logging: + Logging functions =============================== @@ -45,22 +83,3 @@ rocsolver_log_flush_profile() --------------------------------- .. doxygenfunction:: rocsolver_log_flush_profile - - -.. _libraryinfo: - -Library information -=============================== - -.. contents:: List of library information functions - :local: - :backlinks: top - -rocsolver_get_version_string() ------------------------------------- -.. doxygenfunction:: rocsolver_get_version_string - -rocsolver_get_version_string_size() ------------------------------------- -.. doxygenfunction:: rocsolver_get_version_string_size - diff --git a/docs/reference/index.rst b/docs/reference/index.rst index b82b12dc2..18a2fb338 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -8,15 +8,15 @@ Reference ######################################## -This section provides technical descriptions and important information about -the different rocSOLVER APIs and library components. +This section provides technical descriptions and important information about +the different rocSOLVER APIs and library components. * :ref:`intro` * :ref:`rocsolver-types` -* :ref:`rocsolver_auxiliary_functions` +* :ref:`rocsolver_auxiliary_functions` * :ref:`lapackfunc` * :ref:`lapack-like` * :ref:`refactor` -* :ref:`api_logging` +* :ref:`helpers` * :ref:`tuning_label` * :ref:`deprecated` diff --git a/docs/reference/types.rst b/docs/reference/types.rst index 8191098ba..9de6bcc35 100644 --- a/docs/reference/types.rst +++ b/docs/reference/types.rst @@ -66,3 +66,11 @@ rocsolver_rfinfo rocsolver_rfinfo_mode ------------------------ .. doxygenenum:: rocsolver_rfinfo_mode + +rocsolver_alg_mode +------------------------ +.. doxygentypedef:: rocsolver_alg_mode + +rocsolver_function +------------------------ +.. 
doxygentypedef:: rocsolver_function diff --git a/library/include/rocsolver/rocsolver-aliases.h b/library/include/rocsolver/rocsolver-aliases.h index fec1ea401..de938bf22 100644 --- a/library/include/rocsolver/rocsolver-aliases.h +++ b/library/include/rocsolver/rocsolver-aliases.h @@ -1,5 +1,5 @@ /* ************************************************************************** - * Copyright (C) 2019-2023 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (C) 2019-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -91,7 +91,7 @@ ROCSOLVER_DEPRECATED_X("use rocblas_fill") typedef rocblas_fill rocsolver_fill; */ ROCSOLVER_DEPRECATED_X("use rocblas_diagonal") typedef rocblas_diagonal rocsolver_diagonal; -/*! \deprecated Use \c rocblas_stide. +/*! \deprecated Use \c rocblas_side. */ ROCSOLVER_DEPRECATED_X("use rocblas_side") typedef rocblas_side rocsolver_side; diff --git a/library/include/rocsolver/rocsolver-extra-types.h b/library/include/rocsolver/rocsolver-extra-types.h index 89ea80692..d423a69e6 100644 --- a/library/include/rocsolver/rocsolver-extra-types.h +++ b/library/include/rocsolver/rocsolver-extra-types.h @@ -1,5 +1,5 @@ /* ************************************************************************** - * Copyright (C) 2019-2023 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (C) 2019-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -164,4 +164,21 @@ typedef enum rocsolver_rfinfo_mode_ = 272, /**< To work with Cholesky factorization (for symmetric positive definite sparse matrices). */ } rocsolver_rfinfo_mode; +/*! \brief Used by specific functions to specify the algorithm mode. + ********************************************************************************/ +typedef enum rocsolver_alg_mode_ +{ + rocsolver_alg_mode_gpu + = 281, /**< Computations are all performed on the GPU. This is the default mode. */ + rocsolver_alg_mode_hybrid = 282, /**< Computations are performed on the CPU and GPU. */ +} rocsolver_alg_mode; + +/*! \brief Used to specify a function with multiple supported algorithm modes. + ********************************************************************************/ +typedef enum rocsolver_function_ +{ + rocsolver_function_bdsqr = 401, + rocsolver_function_gesvd = 402, +} rocsolver_function; + #endif /* ROCSOLVER_EXTRA_TYPES_H */ diff --git a/library/include/rocsolver/rocsolver-functions.h b/library/include/rocsolver/rocsolver-functions.h index 250d07b2a..eec72d3c8 100644 --- a/library/include/rocsolver/rocsolver-functions.h +++ b/library/include/rocsolver/rocsolver-functions.h @@ -141,6 +141,45 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_log_write_profile(void); ROCSOLVER_EXPORT rocblas_status rocsolver_log_flush_profile(void); +/* + * =========================================================================== + * Hybrid algorithm enablement + * =========================================================================== + */ + +/*! \brief SET_ALG_MODE sets the algorithm mode to be used by a specific function. + + @param[in] + handle rocblas_handle. + @param[in] + func #rocsolver_function. + The function that will use the selected algorithm mode. + @param[in] + mode #rocsolver_alg_mode. + The algorithm mode that will be used by the specified function. 
+ *************************************************************************/ + +ROCSOLVER_EXPORT rocblas_status rocsolver_set_alg_mode(rocblas_handle handle, + const rocsolver_function func, + const rocsolver_alg_mode mode); + +/*! \brief GET_ALG_MODE gets the algorithm mode selected for use by a specific function. + + @param[in] + handle rocblas_handle. + @param[in] + func #rocsolver_function. + A function. + @param[out] + mode pointer to #rocsolver_alg_mode. + On exit, the value is overwritten by the algorithm mode that will + be used by the specified function. + *************************************************************************/ + +ROCSOLVER_EXPORT rocblas_status rocsolver_get_alg_mode(rocblas_handle handle, + const rocsolver_function func, + rocsolver_alg_mode* mode); + /* * =========================================================================== * Auxiliary functions @@ -3708,6 +3747,10 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_zunmtr(rocblas_handle handle, In order to carry out calculations, this method may synchronize the stream contained within the rocblas_handle. + \note + A hybrid (CPU+GPU) approach is available for BDSQR. Use \ref rocsolver_set_alg_mode "SET_ALG_MODE" + to enable it. + @param[in] handle rocblas_handle. @param[in] @@ -12443,8 +12486,12 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_zpotri_strided_batched(rocblas_handle the "Tuning rocSOLVER performance" and "Memory model" sections of the documentation. \note - In order to carry out calculations, this method may synchronize the stream contained within the - rocblas_handle. + In order to carry out calculations, this method may synchronize the stream contained + within the rocblas_handle. + + \note + A hybrid (CPU+GPU) approach is available for GESVD. Use \ref rocsolver_set_alg_mode "SET_ALG_MODE" + to enable it. @param[in] handle rocblas_handle. @@ -12620,8 +12667,12 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_zgesvd(rocblas_handle handle, and "Memory model" sections of the documentation. \note - In order to carry out calculations, this method may synchronize the stream contained within the - rocblas_handle. + In order to carry out calculations, this method may synchronize the stream contained + within the rocblas_handle. + + \note + A hybrid (CPU+GPU) approach is available for GESVD_BATCHED. Use \ref rocsolver_set_alg_mode "SET_ALG_MODE" + to enable it. @param[in] handle rocblas_handle. @@ -12838,8 +12889,12 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_zgesvd_batched(rocblas_handle handle, and "Memory model" sections of the documentation. \note - In order to carry out calculations, this method may synchronize the stream contained within the - rocblas_handle. + In order to carry out calculations, this method may synchronize the stream contained + within the rocblas_handle. + + \note + A hybrid (CPU+GPU) approach is available for GESVD_STRIDED_BATCHED. Use \ref rocsolver_set_alg_mode "SET_ALG_MODE" + to enable it. @param[in] handle rocblas_handle. 
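Taken together, the two new entry points are meant to be called on the handle before invoking a solver with hybrid support (BDSQR, GESVD and its batched variants, per the notes above). A minimal usage sketch, assuming the usual umbrella header rocsolver/rocsolver.h and a hypothetical wrapper name; rocblas_create_handle and rocblas_destroy_handle come from rocBLAS, everything else is declared in this patch:

    #include <rocsolver/rocsolver.h>

    // Opt a handle into the hybrid (CPU+GPU) BDSQR path and verify the setting
    // with the matching query; returns the first non-success status encountered.
    rocblas_status enable_hybrid_bdsqr(rocblas_handle handle)
    {
        rocblas_status st
            = rocsolver_set_alg_mode(handle, rocsolver_function_bdsqr, rocsolver_alg_mode_hybrid);
        if(st != rocblas_status_success)
            return st;

        rocsolver_alg_mode mode;
        st = rocsolver_get_alg_mode(handle, rocsolver_function_bdsqr, &mode);
        // mode is expected to be rocsolver_alg_mode_hybrid here; functions that
        // were never configured keep the default rocsolver_alg_mode_gpu.
        return st;
    }
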
diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt index e31b6ece0..5b2e75e34 100755 --- a/library/src/CMakeLists.txt +++ b/library/src/CMakeLists.txt @@ -343,6 +343,7 @@ endif() set(auxiliaries common/buildinfo.cpp + common/rocsolver_handle.cpp common/rocsolver_logger.cpp common/rocsparse.cpp ) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr.hpp b/library/src/auxiliary/rocauxiliary_bdsqr.hpp index 51c4b11b3..89397dbc8 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr.hpp @@ -38,7 +38,7 @@ #include -#include "rocauxiliary_bdsqr_host.hpp" +#include "rocauxiliary_bdsqr_hybrid.hpp" ROCSOLVER_BEGIN_NAMESPACE @@ -1209,6 +1209,9 @@ rocblas_status rocsolver_bdsqr_template(rocblas_handle handle, hipStream_t stream; rocblas_get_stream(handle, &stream); + rocsolver_alg_mode alg_mode; + ROCBLAS_CHECK(rocsolver_get_alg_mode(handle, rocsolver_function_bdsqr, &alg_mode)); + // set tolerance and max number of iterations: // machine precision (considering rounding strategy) S eps = get_epsilon() / 2; @@ -1251,79 +1254,83 @@ rocblas_status rocsolver_bdsqr_template(rocblas_handle handle, ROCSOLVER_LAUNCH_KERNEL((bdsqr_init), gridBasic, threadsBasic, 0, stream, n, D, strideD, E, strideE, info, maxiter, sfm, tol, splits_map, work, strideW, completed); - bool const use_bdsqr_host = true; - if(use_bdsqr_host) + if(alg_mode == rocsolver_alg_mode_hybrid) { - ROCBLAS_CHECK(rocsolver_bdsqr_host_batch_template( - handle, uplo, n, nv, nu, nc, D, strideD, E, strideE, V, shiftV, ldv, strideV, U, - shiftU, ldu, strideU, C, shiftC, ldc, strideC, info, batch_count, splits_map, work)); + ROCBLAS_CHECK(rocsolver_bdsqr_host_batch_template( + handle, uplo, n, nv, nu, nc, D, strideD, E, strideE, V, shiftV, ldv, strideV, U, shiftU, + ldu, strideU, C, shiftC, ldc, strideC, info, batch_count, splits_map, work)); } else { - // rotate to upper bidiagonal if necessary - if(uplo == rocblas_fill_lower) + if(n > 1) { - ROCSOLVER_LAUNCH_KERNEL((bdsqr_lower2upper), gridBasic, threadsUC, 0, stream, n, nu, - nc, D, strideD, E, strideE, U, shiftU, ldu, strideU, C, shiftC, - ldc, strideC, info, work, strideW, completed); - } + // rotate to upper bidiagonal if necessary + if(uplo == rocblas_fill_lower) + { + ROCSOLVER_LAUNCH_KERNEL((bdsqr_lower2upper), gridBasic, threadsUC, 0, stream, n, + nu, nc, D, strideD, E, strideE, U, shiftU, ldu, strideU, C, + shiftC, ldc, strideC, info, work, strideW, completed); + } - rocblas_int h_iter = 0; - struct - { - rocblas_int completed; - rocblas_int num_splits; - } h_params; + rocblas_int h_iter = 0; + struct + { + rocblas_int completed; + rocblas_int num_splits; + } h_params; - while(h_iter < maxiter) - { - // if all instances in the batch have finished, exit the loop - HIP_CHECK(hipMemcpyAsync(&h_params, completed, sizeof(h_params), hipMemcpyDeviceToHost, - stream)); - HIP_CHECK(hipStreamSynchronize(stream)); + while(h_iter < maxiter) + { + // if all instances in the batch have finished, exit the loop + HIP_CHECK(hipMemcpyAsync(&h_params, completed, sizeof(h_params), + hipMemcpyDeviceToHost, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); - if(h_params.completed == batch_count) - break; + if(h_params.completed == batch_count) + break; - dim3 gridSplits(1, h_params.num_splits, batch_count); - dim3 gridVUC((nvuc_max - 1) / BS1 + 1, h_params.num_splits, batch_count); + dim3 gridSplits(1, h_params.num_splits, batch_count); + dim3 gridVUC((nvuc_max - 1) / BS1 + 1, h_params.num_splits, batch_count); - for(rocblas_int inner_iters 
= 0; inner_iters < BDSQR_ITERS_PER_SYNC; inner_iters++) - { - if(nvuc_max <= BDSQR_SWITCH_SIZE) + for(rocblas_int inner_iters = 0; inner_iters < BDSQR_ITERS_PER_SYNC; inner_iters++) { - // main computation of SVD - ROCSOLVER_LAUNCH_KERNEL((bdsqr_compute), gridSplits, threadsBS1, 0, - stream, n, nv, nu, nc, D, strideD, E, strideE, V, - shiftV, ldv, strideV, U, shiftU, ldu, strideU, C, - shiftC, ldc, strideC, maxiter, eps, sfm, tol, minshift, - splits_map, work, incW, strideW, completed); - } - else - { - // main computation of SVD - ROCSOLVER_LAUNCH_KERNEL( - (bdsqr_compute), gridSplits, threadsBS1, 0, stream, n, nv, nu, nc, - D, strideD, E, strideE, (W1) nullptr, shiftV, ldv, strideV, (W2) nullptr, - shiftU, ldu, strideU, (W3) nullptr, shiftC, ldc, strideC, maxiter, eps, sfm, - tol, minshift, splits_map, work, incW, strideW, completed); - - // update singular vectors - ROCSOLVER_LAUNCH_KERNEL((bdsqr_rotate), gridVUC, threadsVUC, 0, stream, n, - nv, nu, nc, V, shiftV, ldv, strideV, U, shiftU, ldu, - strideU, C, shiftC, ldc, strideC, maxiter, splits_map, - work, incW, strideW, completed); + if(nvuc_max <= BDSQR_SWITCH_SIZE) + { + // main computation of SVD + ROCSOLVER_LAUNCH_KERNEL((bdsqr_compute), gridSplits, threadsBS1, 0, + stream, n, nv, nu, nc, D, strideD, E, strideE, V, + shiftV, ldv, strideV, U, shiftU, ldu, strideU, C, + shiftC, ldc, strideC, maxiter, eps, sfm, tol, + minshift, splits_map, work, incW, strideW, completed); + } + else + { + // main computation of SVD + ROCSOLVER_LAUNCH_KERNEL((bdsqr_compute), gridSplits, threadsBS1, 0, + stream, n, nv, nu, nc, D, strideD, E, strideE, + (W1) nullptr, shiftV, ldv, strideV, (W2) nullptr, + shiftU, ldu, strideU, (W3) nullptr, shiftC, ldc, + strideC, maxiter, eps, sfm, tol, minshift, + splits_map, work, incW, strideW, completed); + + // update singular vectors + ROCSOLVER_LAUNCH_KERNEL((bdsqr_rotate), gridVUC, threadsVUC, 0, stream, + n, nv, nu, nc, V, shiftV, ldv, strideV, U, shiftU, + ldu, strideU, C, shiftC, ldc, strideC, maxiter, + splits_map, work, incW, strideW, completed); + } + + // update split block endpoints + ROCSOLVER_LAUNCH_KERNEL((bdsqr_update_endpoints), gridSplits, threadsBasic, + 0, stream, n, E, strideE, splits_map, work, strideW, + completed); } - // update split block endpoints - ROCSOLVER_LAUNCH_KERNEL((bdsqr_update_endpoints), gridSplits, threadsBasic, 0, - stream, n, E, strideE, splits_map, work, strideW, completed); + // check for completion + h_iter += BDSQR_ITERS_PER_SYNC; + ROCSOLVER_LAUNCH_KERNEL((bdsqr_chk_completed), gridBasic, threadsBasic, 0, + stream, n, maxiter, splits_map, work, strideW, completed); } - - // check for completion - h_iter += BDSQR_ITERS_PER_SYNC; - ROCSOLVER_LAUNCH_KERNEL((bdsqr_chk_completed), gridBasic, threadsBasic, 0, stream, n, - maxiter, splits_map, work, strideW, completed); } } diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp similarity index 83% rename from library/src/auxiliary/rocauxiliary_bdsqr_host.hpp rename to library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp index 3bf801519..d304c8f61 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp @@ -734,306 +734,6 @@ static void swap_template(I const n, T* x, I const incx, T* y, I const incy, hip n, x, incx, y, incy); } -extern "C" { - -double dlamch_(char* cmach); -float slamch_(char* cmach); - -void zswap_(int* n, std::complex* zx, int* incx, std::complex* zy, int* incy); - -void cswap_(int* n, 
std::complex* zx, int* incx, std::complex* zy, int* incy); - -void dswap_(int* n, double* zx, int* incx, double* zy, int* incy); - -void sswap_(int* n, float* zx, int* incx, float* zy, int* incy); - -void dlasq1_(int* n, double* D_, double* E_, double* rwork_, int* info_arg); -void slasq1_(int* n, float* D_, float* E_, float* rwork_, int* info_arg); - -void zlasr_(char* side, - char* pivot, - char* direct, - int* m, - int* n, - double* c, - double* s, - std::complex* A, - int* lda); -void clasr_(char* side, - char* pivot, - char* direct, - int* m, - int* n, - float* c, - float* s, - std::complex* A, - int* lda); -void slasr_(char* side, char* pivot, char* direct, int* m, int* n, float* c, float* s, float* A, int* lda); -void dlasr_(char* side, char* pivot, char* direct, int* m, int* n, double* c, double* s, double* A, int* lda); - -void dlasv2_(double* f, - double* g, - double* h, - double* ssmin, - double* ssmax, - double* snr, - double* csr, - double* snl, - double* csl); -void slasv2_(float* f, - float* g, - float* h, - float* ssmin, - float* ssmax, - float* snr, - float* csr, - float* snl, - float* csl); - -void zdrot_(int* n, - std::complex* zx, - int* incx, - std::complex* zy, - int* incy, - double* c, - double* s); - -void csrot_(int* n, - std::complex* zx, - int* incx, - std::complex* zy, - int* incy, - float* c, - float* s); - -void drot_(int* n, double* dx, int* incx, double* dy, int* incy, double* c, double* s); - -void srot_(int* n, float* dx, int* incx, float* dy, int* incy, float* c, float* s); - -void zdscal_(int* n, double* da, std::complex* zx, int* incx); -void csscal_(int* n, float* da, std::complex* zx, int* incx); -void zscal_(int* n, std::complex* za, std::complex* zx, int* incx); -void cscal_(int* n, std::complex* za, std::complex* zx, int* incx); -void dscal_(int* n, double* da, double* zx, int* incx); -void sscal_(int* n, float* da, float* zx, int* incx); - -void dlartg_(double* f, double* g, double* c, double* s, double* r); -void slartg_(float* f, float* g, float* c, float* s, float* r); - -void zlartg_(std::complex* f, - std::complex* g, - double* c, - std::complex* s, - std::complex* r); -void clartg_(std::complex* f, - std::complex* g, - float* c, - std::complex* s, - std::complex* r); - -void dlas2_(double* f, double* g, double* h, double* ssmin, double* ssmax); -void slas2_(float* f, float* g, float* h, float* ssmin, float* ssmax); -}; - -extern "C" { - -void cbdsqr_(char* uplo, - int* n, - int* ncvt, - int* nru, - int* ncc, - float* d, - float* e, - std::complex* vt, - int* ldvt, - std::complex* u, - int* ldu, - std::complex* c, - int* ldc, - float* rwork, - int* info); - -void zbdsqr_(char* uplo, - int* n, - int* ncvt, - int* nru, - int* ncc, - double* d, - double* e, - std::complex* vt, - int* ldvt, - std::complex* u, - int* ldu, - std::complex* c, - int* ldc, - double* rwork, - int* info); - -void sbdsqr_(char* uplo, - int* n, - int* ncvt, - int* nru, - int* ncc, - float* d, - float* e, - float* vt, - int* ldvt, - float* u, - int* ldu, - float* c, - int* ldc, - float* rwork, - int* info); - -void dbdsqr_(char* uplo, - int* n, - int* ncvt, - int* nru, - int* ncc, - double* d, - double* e, - double* vt, - int* ldvt, - double* u, - int* ldu, - double* c, - int* ldc, - double* rwork, - int* info); -}; - -static void call_bdsqr(char& uplo, - int& n, - int& ncvt, - int& nru, - int& ncc, - double& d, - double& e, - std::complex& vt, - int& ldvt, - std::complex& u, - int& ldu, - std::complex& c, - int& ldc, - double& rwork, - int& info) -{ - zbdsqr_(&uplo, &n, 
&ncvt, &nru, &ncc, &d, &e, (std::complex*)&vt, &ldvt, - (std::complex*)&u, &ldu, (std::complex*)&c, &ldc, &rwork, &info); -} - -static void call_bdsqr(char& uplo, - int& n, - int& ncvt, - int& nru, - int& ncc, - double& d, - double& e, - rocblas_complex_num& vt, - int& ldvt, - rocblas_complex_num& u, - int& ldu, - rocblas_complex_num& c, - int& ldc, - double& rwork, - int& info) -{ - zbdsqr_(&uplo, &n, &ncvt, &nru, &ncc, &d, &e, (std::complex*)&vt, &ldvt, - (std::complex*)&u, &ldu, (std::complex*)&c, &ldc, &rwork, &info); -} - -static void call_bdsqr(char& uplo, - int& n, - int& ncvt, - int& nru, - int& ncc, - float& d, - float& e, - std::complex& vt, - int& ldvt, - std::complex& u, - int& ldu, - std::complex& c, - int& ldc, - float& rwork, - int& info) -{ - cbdsqr_(&uplo, &n, &ncvt, &nru, &ncc, &d, &e, (std::complex*)&vt, &ldvt, - (std::complex*)&u, &ldu, (std::complex*)&c, &ldc, &rwork, &info); -} - -static void call_bdsqr(char& uplo, - int& n, - int& ncvt, - int& nru, - int& ncc, - float& d, - float& e, - float& vt, - int& ldvt, - float& u, - int& ldu, - float& c, - int& ldc, - float& rwork, - int& info) -{ - sbdsqr_(&uplo, &n, &ncvt, &nru, &ncc, &d, &e, &vt, &ldvt, &u, &ldu, &c, &ldc, &rwork, &info); -} - -static void call_bdsqr(char& uplo, - int& n, - int& ncvt, - int& nru, - int& ncc, - float& d, - float& e, - rocblas_complex_num& vt, - int& ldvt, - rocblas_complex_num& u, - int& ldu, - rocblas_complex_num& c, - int& ldc, - float& rwork, - int& info) -{ - cbdsqr_(&uplo, &n, &ncvt, &nru, &ncc, &d, &e, (std::complex*)&vt, &ldvt, - (std::complex*)&u, &ldu, (std::complex*)&c, &ldc, &rwork, &info); -} - -static void call_bdsqr(char& uplo, - int& n, - int& ncvt, - int& nru, - int& ncc, - double& d, - double& e, - double& vt, - int& ldvt, - double& u, - int& ldu, - double& c, - int& ldc, - double& rwork, - int& info) -{ - dbdsqr_(&uplo, &n, &ncvt, &nru, &ncc, &d, &e, &vt, &ldvt, &u, &ldu, &c, &ldc, &rwork, &info); -} - -#ifdef USE_LAPACK -static void call_lamch(char& cmach_arg, double& eps) -{ - char cmach = cmach_arg; - eps = dlamch_(&cmach); -} - -static void call_lamch(char& cmach_arg, float& eps) -{ - char cmach = cmach_arg; - eps = slamch_(&cmach); -} -#else - static void call_lamch(char& cmach, double& eps) { eps = ((cmach == 'E') || (cmach == 'e')) ? 
std::numeric_limits::epsilon() @@ -1050,48 +750,6 @@ static void call_lamch(char& cmach, float& eps) : std::numeric_limits::min(); } -#endif - -#ifdef USE_LAPACK -static void call_swap(int& n, - rocblas_complex_num& zx, - int& incx, - rocblas_complex_num& zy, - int& incy) -{ - cswap_(&n, (std::complex*)&zx, &incx, (std::complex*)&zy, &incy); -} - -static void call_swap(int& n, std::complex& zx, int& incx, std::complex& zy, int& incy) -{ - cswap_(&n, &zx, &incx, &zy, &incy); -} - -static void call_swap(int& n, - rocblas_complex_num& zx, - int& incx, - rocblas_complex_num& zy, - int& incy) -{ - zswap_(&n, (std::complex*)&zx, &incx, (std::complex*)&zy, &incy); -} - -static void call_swap(int& n, std::complex& zx, int& incx, std::complex& zy, int& incy) -{ - zswap_(&n, &zx, &incx, &zy, &incy); -} - -static void call_swap(int& n, float& zx, int& incx, float& zy, int& incy) -{ - sswap_(&n, &zx, &incx, &zy, &incy); -} - -static void call_swap(int& n, double& zx, int& incx, double& zy, int& incy) -{ - dswap_(&n, &zx, &incx, &zy, &incy); -} -#else - template static void call_swap(I& n, T& x_in, I& incx, T& y_in, I& incy) { @@ -1108,20 +766,6 @@ static void call_swap(I& n, T& x_in, I& incx, T& y_in, I& incy) } } -#endif - -#ifdef USE_LAPACK -static void call_las2(double& f, double& g, double& h, double& ssmin, double& ssmax) -{ - dlas2_(&f, &g, &h, &ssmin, &ssmax); -} - -static void call_las2(float& f, float& g, float& h, float& ssmin, float& ssmax) -{ - slas2_(&f, &g, &h, &ssmin, &ssmax); -} -#else - template static void call_las2(T& f, T& g, T& h, T& ssmin, T& ssmax) { @@ -1193,39 +837,6 @@ static void call_las2(T& f, T& g, T& h, T& ssmin, T& ssmax) } } -#endif - -#ifdef USE_LAPACK - -static void call_lartg(double& f, double& g, double& c, double& s, double& r) -{ - dlartg_(&f, &g, &c, &s, &r); -} - -static void call_lartg(float& f, float& g, float& c, float& s, float& r) -{ - slartg_(&f, &g, &c, &s, &r); -} - -static void call_lartg(std::complex& f, - std::complex& g, - float& c, - std::complex& s, - std::complex& r) -{ - clartg_(&f, &g, &c, &s, &r); -} - -static void call_lartg(std::complex& f, - std::complex& g, - double& c, - std::complex& s, - std::complex& r) -{ - zlartg_(&f, &g, &c, &s, &r); -} -#else - static float real_part(float z) { return (z); @@ -1503,60 +1114,6 @@ static void call_lartg(T& f, T& g, S& cs, T& sn, T& r) } } -#endif - -#ifdef USE_LAPACK -static void call_scal(int& n, rocblas_complex_num& da, rocblas_complex_num& zx, int& incx) -{ - cscal_(&n, (std::complex*)&da, (std::complex*)&zx, &incx); -} - -static void call_scal(int& n, std::complex& da, std::complex& zx, int& incx) -{ - cscal_(&n, &da, &zx, &incx); -} - -static void - call_scal(int& n, rocblas_complex_num& da, rocblas_complex_num& zx, int& incx) -{ - zscal_(&n, (std::complex*)&da, (std::complex*)&zx, &incx); -} - -static void call_scal(int& n, std::complex& da, std::complex& zx, int& incx) -{ - zscal_(&n, &da, &zx, &incx); -} - -static void call_scal(int& n, double& da, rocblas_complex_num& zx, int& incx) -{ - zdscal_(&n, &da, (std::complex*)&zx, &incx); -} - -static void call_scal(int& n, double& da, std::complex& zx, int& incx) -{ - zdscal_(&n, &da, &zx, &incx); -} - -static void call_scal(int& n, float& da, rocblas_complex_num& zx, int& incx) -{ - csscal_(&n, &da, (std::complex*)&zx, &incx); -} - -static void call_scal(int& n, float& da, std::complex& zx, int& incx) -{ - csscal_(&n, &da, &zx, &incx); -} - -static void call_scal(int& n, double& da, double& zx, int& incx) -{ - dscal_(&n, &da, &zx, &incx); -} - 
-static void call_scal(int& n, float& da, float& zx, int& incx) -{ - sscal_(&n, &da, &zx, &incx); -} -#else template static void call_scal(I& n, S& a, T& x_in, I& incx) { @@ -1576,64 +1133,6 @@ static void call_scal(I& n, S& a, T& x_in, I& incx) }; } -#endif - -#ifdef USE_LAPACK -static void call_rot(int& n, - std::complex& zx, - int& incx, - std::complex& zy, - int& incy, - float& c, - float& s) -{ - csrot_(&n, &zx, &incx, &zy, &incy, &c, &s); -} - -static void call_rot(int& n, - rocblas_complex_num& zx, - int& incx, - rocblas_complex_num& zy, - int& incy, - float& c, - float& s) -{ - csrot_(&n, (std::complex*)&zx, &incx, (std::complex*)&zy, &incy, &c, &s); -} - -static void call_rot(int& n, - std::complex& zx, - int& incx, - std::complex& zy, - int& incy, - double& c, - double& s) -{ - zdrot_(&n, &zx, &incx, &zy, &incy, &c, &s); -} - -static void call_rot(int& n, - rocblas_complex_num& zx, - int& incx, - rocblas_complex_num& zy, - int& incy, - double& c, - double& s) -{ - zdrot_(&n, (std::complex*)&zx, &incx, (std::complex*)&zy, &incy, &c, &s); -} - -static void call_rot(int& n, double& dx, int& incx, double& dy, int& incy, double& c, double& s) -{ - drot_(&n, &dx, &incx, &dy, &incy, &c, &s); -} - -static void call_rot(int& n, float& dx, int& incx, float& dy, int& incy, float& c, float& s) -{ - srot_(&n, &dx, &incx, &dy, &incy, &c, &s); -} -#else - template static void call_rot(I& n, T& x_in, I& incx, T& y_in, I& incy, S& c, S& s) { @@ -1651,35 +1150,6 @@ static void call_rot(I& n, T& x_in, I& incx, T& y_in, I& incy, S& c, S& s) } } -#endif - -#ifdef USE_LAPACK -static void call_lasv2(double& f, - double& g, - double& h, - double& ssmin, - double& ssmax, - double& snr, - double& csr, - double& snl, - double& csl) -{ - dlasv2_(&f, &g, &h, &ssmin, &ssmax, &snr, &csr, &snl, &csl); -} - -static void call_lasv2(float& f, - float& g, - float& h, - float& ssmin, - float& ssmax, - float& snr, - float& csr, - float& snl, - float& csl) -{ - slasv2_(&f, &g, &h, &ssmin, &ssmax, &snr, &csr, &snl, &csl); -} -#else // -------------------------------------------------------- // lasv2 computes the singular value decomposition of a 2 x 2 // triangular matrix @@ -1899,90 +1369,6 @@ static void call_lasv2(T& f, T& g, T& h, T& ssmin, T& ssmax, T& snr, T& csr, T& ssmin = sign(ssmin, tsign * sign(one, f) * sign(one, h)); } -#endif - -static void call_lasq1(int& n, double& D_, double& E_, double& rwork_, int& info_arg) -{ - dlasq1_(&n, &D_, &E_, &rwork_, &info_arg); -}; - -static void call_lasq1(int& n, float& D_, float& E_, float& rwork_, int& info_arg) -{ - slasq1_(&n, &D_, &E_, &rwork_, &info_arg); -}; - -#ifdef USE_LAPACK -static void call_lasr(char& side, - char& pivot, - char& direct, - int& m, - int& n, - float& c, - float& s, - rocblas_complex_num& A, - int& lda) -{ - clasr_(&side, &pivot, &direct, &m, &n, &c, &s, (std::complex*)&A, &lda); -}; - -static void call_lasr(char& side, - char& pivot, - char& direct, - int& m, - int& n, - double& c, - double& s, - rocblas_complex_num& A, - int& lda) -{ - zlasr_(&side, &pivot, &direct, &m, &n, &c, &s, (std::complex*)&A, &lda); -}; - -static void call_lasr(char& side, - char& pivot, - char& direct, - int& m, - int& n, - double& c, - double& s, - std::complex& A, - int& lda) -{ - zlasr_(&side, &pivot, &direct, &m, &n, &c, &s, &A, &lda); -}; - -static void call_lasr(char& side, - char& pivot, - char& direct, - int& m, - int& n, - float& c, - float& s, - std::complex& A, - int& lda) -{ - clasr_(&side, &pivot, &direct, &m, &n, &c, &s, &A, &lda); -}; - -static void - 
call_lasr(char& side, char& pivot, char& direct, int& m, int& n, float& c, float& s, float& A, int& lda) -{ - slasr_(&side, &pivot, &direct, &m, &n, &c, &s, &A, &lda); -}; - -static void call_lasr(char& side, - char& pivot, - char& direct, - int& m, - int& n, - double& c, - double& s, - double& A, - int& lda) -{ - dlasr_(&side, &pivot, &direct, &m, &n, &c, &s, &A, &lda); -}; -#else template static void call_lasr(char& side, char& pivot, char& direct, I& m, I& n, S& c, S& s, T& A, I& lda) { @@ -1992,8 +1378,6 @@ static void call_lasr(char& side, char& pivot, char& direct, I& m, I& n, S& c, S lasr_body(side, pivot, direct, m, n, &c, &s, &A, lda, tid, i_inc); }; -#endif - template static void bdsqr_single_template(char uplo, I n, @@ -2025,13 +1409,6 @@ static void bdsqr_single_template(char uplo, // ----------------------------------- bool constexpr need_sort = false; - // --------------------------------------------------- - // NOTE: lasq1 may return non-zero info value that - // has a different meaning - // Consider turning off lasq1 to have consistent info value - // --------------------------------------------------- - bool constexpr use_lasq1 = false; - S const zero = 0; S const one = 1; S negone = -1; @@ -2194,19 +1571,6 @@ static void bdsqr_single_template(char uplo, if(n == 1) goto L160; - /* - * if no singular vectors desired, use qd algorithm - */ - if((!rotate) && (use_lasq1)) - { - call_lasq1(n, d(1), e(1), work(1), info); - /* - * if info equals 2, dqds didn't finish, try to finish - */ - if(info != 2) - return; - info = 0; - } nm1 = n - 1; nm12 = nm1 + nm1; @@ -3680,40 +3044,14 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, I nrv = n; I ncvt = nv; bool const values_only = (ncvt == 0) && (nru == 0) && (ncc == 0); - bool const use_lapack_bdsqr = false; - - if((use_lapack_bdsqr) && (values_only)) - { - // -------------------------------- - // call the lapack version of bdsqr - // -------------------------------- - auto ln = n; - auto lncvt = ncvt; - auto lnru = nru; - auto lncc = ncc; - S& d_arg = d_[0]; - S& e_arg = e_[0]; - T& vt_arg = vt_[0]; - T& u_arg = u_[0]; - T& c_arg = c_[0]; - S& work_arg = work_[0]; - auto ldvt_arg = ldvt; - auto ldu_arg = ldu; - auto ldc_arg = ldc; - - call_bdsqr(uplo, ln, lncvt, lnru, lncc, d_arg, e_arg, vt_arg, ldvt_arg, u_arg, ldu_arg, - c_arg, ldu_arg, work_arg, info); - } - else - { - bdsqr_single_template(uplo, n, ncvt, nru, ncc, - d_, e_, + bdsqr_single_template(uplo, n, ncvt, nru, ncc, - vt_, ldvt, u_, ldu, c_, ldc, + d_, e_, - work_, info, dwork_, stream); - } + vt_, ldvt, u_, ldu, c_, ldc, + + work_, info, dwork_, stream); if(info == 0) { diff --git a/library/src/common/rocsolver_handle.cpp b/library/src/common/rocsolver_handle.cpp new file mode 100644 index 000000000..949d60fd2 --- /dev/null +++ b/library/src/common/rocsolver_handle.cpp @@ -0,0 +1,132 @@ +/* ************************************************************************** + * Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * *************************************************************************/ + +#include "rocsolver_handle.hpp" +#include "rocblas.hpp" + +#include + +ROCSOLVER_BEGIN_NAMESPACE + +rocblas_status rocsolver_set_alg_mode_impl(rocblas_handle handle, + const rocsolver_function func, + const rocsolver_alg_mode mode) +{ + if(!handle) + return rocblas_status_invalid_handle; + + std::shared_ptr handle_ptr; + ROCBLAS_CHECK(rocblas_internal_get_data_ptr(handle, handle_ptr)); + rocsolver_handle_data handle_data = (rocsolver_handle_data)handle_ptr.get(); + + if(handle_data == nullptr) + { + handle_ptr = std::make_shared(); + handle_data = (rocsolver_handle_data)handle_ptr.get(); + handle_data->checksum = sizeof(rocsolver_handle_data_); + + ROCBLAS_CHECK(rocblas_internal_set_data_ptr(handle, handle_ptr)); + } + else + { + if(handle_data->checksum != sizeof(rocsolver_handle_data_)) + return rocblas_status_internal_error; + } + + switch(func) + { + case rocsolver_function_gesvd: + case rocsolver_function_bdsqr: + if(mode == rocsolver_alg_mode_gpu || mode == rocsolver_alg_mode_hybrid) + { + handle_data->bdsqr_mode = mode; + return rocblas_status_success; + } + } + + return rocblas_status_invalid_value; +} + +rocblas_status rocsolver_get_alg_mode_impl(rocblas_handle handle, + const rocsolver_function func, + rocsolver_alg_mode* mode) +{ + if(!handle) + return rocblas_status_invalid_handle; + + std::shared_ptr handle_ptr; + ROCBLAS_CHECK(rocblas_internal_get_data_ptr(handle, handle_ptr)); + rocsolver_handle_data handle_data = (rocsolver_handle_data)handle_ptr.get(); + + if(handle_data == nullptr) + { + *mode = rocsolver_alg_mode_gpu; + } + else + { + if(handle_data->checksum != sizeof(rocsolver_handle_data_)) + return rocblas_status_internal_error; + + switch(func) + { + case rocsolver_function_gesvd: + case rocsolver_function_bdsqr: *mode = handle_data->bdsqr_mode; break; + default: return rocblas_status_invalid_value; + } + } + + return rocblas_status_success; +} + +ROCSOLVER_END_NAMESPACE + +extern "C" { + +rocblas_status rocsolver_set_alg_mode(rocblas_handle handle, + const rocsolver_function func, + const rocsolver_alg_mode mode) +try +{ + return rocsolver::rocsolver_set_alg_mode_impl(handle, func, mode); +} +catch(...) +{ + return rocsolver::exception_to_rocblas_status(); +} + +rocblas_status rocsolver_get_alg_mode(rocblas_handle handle, + const rocsolver_function func, + rocsolver_alg_mode* mode) +try +{ + return rocsolver::rocsolver_get_alg_mode_impl(handle, func, mode); +} +catch(...) 
+{ + return rocsolver::exception_to_rocblas_status(); +} +} diff --git a/library/src/include/rocsolver_handle.hpp b/library/src/include/rocsolver_handle.hpp new file mode 100644 index 000000000..527bac90c --- /dev/null +++ b/library/src/include/rocsolver_handle.hpp @@ -0,0 +1,44 @@ +/* ************************************************************************** + * Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * *************************************************************************/ + +#pragma once + +#include "common_host_helpers.hpp" +#include "rocsolver/rocsolver.h" + +ROCSOLVER_BEGIN_NAMESPACE + +struct rocsolver_handle_data_ +{ + rocblas_int checksum; + + rocsolver_alg_mode bdsqr_mode; +}; + +typedef struct rocsolver_handle_data_* rocsolver_handle_data; + +ROCSOLVER_END_NAMESPACE From ccec4f920875e9d24ec45a66e44c92e1fdb6a26e Mon Sep 17 00:00:00 2001 From: Troy Alderson <58866654+tfalders@users.noreply.github.com> Date: Thu, 7 Nov 2024 13:07:30 -0700 Subject: [PATCH 25/35] Addressed review comment --- library/src/include/rocsolver_handle.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/src/include/rocsolver_handle.hpp b/library/src/include/rocsolver_handle.hpp index 527bac90c..2e4177556 100644 --- a/library/src/include/rocsolver_handle.hpp +++ b/library/src/include/rocsolver_handle.hpp @@ -36,7 +36,7 @@ struct rocsolver_handle_data_ { rocblas_int checksum; - rocsolver_alg_mode bdsqr_mode; + rocsolver_alg_mode bdsqr_mode = rocsolver_alg_mode_gpu; }; typedef struct rocsolver_handle_data_* rocsolver_handle_data; From 39a9d83dd5c0041e0988df0f7cb1b83f7d4790dd Mon Sep 17 00:00:00 2001 From: Troy Alderson <58866654+tfalders@users.noreply.github.com> Date: Thu, 7 Nov 2024 13:27:16 -0700 Subject: [PATCH 26/35] Minor test improvement --- clients/common/auxiliary/testing_bdsqr.hpp | 8 ++++++++ clients/common/lapack/testing_gesvd.hpp | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/clients/common/auxiliary/testing_bdsqr.hpp b/clients/common/auxiliary/testing_bdsqr.hpp index 5946bb93a..058a923b4 100644 --- a/clients/common/auxiliary/testing_bdsqr.hpp +++ b/clients/common/auxiliary/testing_bdsqr.hpp @@ -479,10 +479,18 @@ void testing_bdsqr(Arguments& 
argus) rocblas_int hot_calls = argus.iters; if(argus.alg_mode) + { EXPECT_ROCBLAS_STATUS( rocsolver_set_alg_mode(handle, rocsolver_function_bdsqr, rocsolver_alg_mode_hybrid), rocblas_status_success); + rocsolver_alg_mode alg_mode; + EXPECT_ROCBLAS_STATUS(rocsolver_get_alg_mode(handle, rocsolver_function_bdsqr, &alg_mode), + rocblas_status_success); + + EXPECT_EQ(alg_mode, rocsolver_alg_mode_hybrid); + } + // check non-supported values if(uplo != rocblas_fill_upper && uplo != rocblas_fill_lower) { diff --git a/clients/common/lapack/testing_gesvd.hpp b/clients/common/lapack/testing_gesvd.hpp index 8f5745826..de7fa10d3 100644 --- a/clients/common/lapack/testing_gesvd.hpp +++ b/clients/common/lapack/testing_gesvd.hpp @@ -549,10 +549,18 @@ void testing_gesvd(Arguments& argus) rocblas_int hot_calls = argus.iters; if(argus.alg_mode) + { EXPECT_ROCBLAS_STATUS( rocsolver_set_alg_mode(handle, rocsolver_function_gesvd, rocsolver_alg_mode_hybrid), rocblas_status_success); + rocsolver_alg_mode alg_mode; + EXPECT_ROCBLAS_STATUS(rocsolver_get_alg_mode(handle, rocsolver_function_gesvd, &alg_mode), + rocblas_status_success); + + EXPECT_EQ(alg_mode, rocsolver_alg_mode_hybrid); + } + // check non-supported values if(rightv == rocblas_svect_overwrite && leftv == rocblas_svect_overwrite) { From c05e1e42e5d0d4674c855ec7e25080962c599d80 Mon Sep 17 00:00:00 2001 From: Troy Alderson <58866654+tfalders@users.noreply.github.com> Date: Thu, 7 Nov 2024 14:12:07 -0700 Subject: [PATCH 27/35] Use new lasr template --- .../auxiliary/rocauxiliary_bdsqr_hybrid.hpp | 696 +++--------------- 1 file changed, 82 insertions(+), 614 deletions(-) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp index d304c8f61..797d40140 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp @@ -39,6 +39,8 @@ #include #include +#include "rocauxiliary_lasr.hpp" + #include "hip/hip_runtime.h" #include "hip/hip_runtime_api.h" @@ -52,547 +54,6 @@ ROCSOLVER_BEGIN_NAMESPACE } #endif -#ifndef LASR_MAX_NTHREADS -#define LASR_MAX_NTHREADS 64 -#endif - -template -__host__ __device__ static void lasr_body(char const side, - char const pivot, - char const direct, - I const m, - I const n, - S const* const __restrict__ c_, - S const* const __restrict__ s_, - T* const __restrict__ A_, - I const lda, - I const tid, - I const i_inc) -{ - auto max = [](auto x, auto y) { return ((x > y) ? x : y); }; - auto min = [](auto x, auto y) { return ((x < y) ? 
x : y); }; - - auto indx2f = [](auto i, auto j, auto lda) { - assert((1 <= i)); - assert((1 <= lda)); - assert((1 <= j)); - - return (i + j * lda - (1 + lda)); - }; - - auto indx1f = [](auto i) -> int64_t { - assert((1 <= i)); - return (i - (1)); - }; - - auto c = [&](auto i) -> const S { return (c_[(i)-1]); }; - auto s = [&](auto i) -> const S { return (s_[(i)-1]); }; - auto A = [&](auto i, auto j) -> T& { return (A_[indx2f(i, j, lda)]); }; - - const S one = 1; - const S zero = 0; - - constexpr bool use_reorder = true; - - // ---------------- - // check arguments - // ---------------- - - const bool is_side_Left = (side == 'L') || (side == 'l'); - const bool is_side_Right = (side == 'R') || (side == 'r'); - - const bool is_pivot_Variable = (pivot == 'V') || (pivot == 'v'); - const bool is_pivot_Bottom = (pivot == 'B') || (pivot == 'b'); - const bool is_pivot_Top = (pivot == 'T') || (pivot == 't'); - - const bool is_direct_Forward = (direct == 'F') || (direct == 'f'); - const bool is_direct_Backward = (direct == 'B') || (direct == 'b'); - - { - const bool isok_side = is_side_Left || is_side_Right; - const bool isok_pivot = is_pivot_Variable || is_pivot_Bottom || is_pivot_Top; - const bool isok_direct = is_direct_Forward || is_direct_Backward; - - const I info = (!isok_side) ? 1 - : (!isok_pivot) ? 2 - : (!isok_direct) ? 3 - : (m < 0) ? 4 - : (n < 0) ? 5 - : (c_ == nullptr) ? 6 - : (s_ == nullptr) ? 7 - : (A_ == nullptr) ? 8 - : (lda < max(1, m)) ? 9 - : 0; - if(info != 0) - return; - }; - - { - const bool has_work = (m >= 1) && (n >= 1); - if(!has_work) - { - return; - }; - }; - - if(is_side_Left && is_pivot_Variable && is_direct_Forward) - { - // ----------------------------- - // A := P*A - // Variable pivot, the plane (k,k+1) - // P = P(z-1) * ... 
* P(2) * P(1) - // ----------------------------- - { - if constexpr(use_reorder) - { - for(I i = 1 + tid; i <= n; i += i_inc) - { - for(I j = 1; j <= (m - 1); j++) - { - const auto ctemp = c(j); - const auto stemp = s(j); - const auto temp = A(j + 1, i); - A(j + 1, i) = ctemp * temp - stemp * A(j, i); - A(j, i) = stemp * temp + ctemp * A(j, i); - } - } - } - else - { - for(I j = 1; j <= (m - 1); j++) - { - const auto ctemp = c(j); - const auto stemp = s(j); - if((ctemp != one) || (stemp != zero)) - { - for(I i = 1 + tid; i <= n; i += i_inc) - { - const auto temp = A(j + 1, i); - A(j + 1, i) = ctemp * temp - stemp * A(j, i); - A(j, i) = stemp * temp + ctemp * A(j, i); - } - } - } - } - } - - return; - }; - - if(is_side_Left && is_pivot_Variable && is_direct_Backward) - { - // ----------------------------- - // A := P*A - // Variable pivot, the plane (k,k+1) - // P = P(1)*P(2)*...*P(z-1) - // ----------------------------- - - auto const jend = (m - 1); - auto const jstart = 1; - auto const istart = 1; - auto const iend = n; - - if constexpr(use_reorder) - { - for(I i = istart + tid; i <= iend; i += i_inc) - { - for(I j = jend; j >= jstart; j--) - { - const auto ctemp = c(j); - const auto stemp = s(j); - const auto temp = A(j + 1, i); - A(j + 1, i) = ctemp * temp - stemp * A(j, i); - A(j, i) = stemp * temp + ctemp * A(j, i); - } - } - } - else - { - for(I j = jend; j >= jstart; j--) - { - const auto ctemp = c(j); - const auto stemp = s(j); - if((ctemp != one) || (stemp != zero)) - { - for(I i = istart + tid; i <= iend; i += i_inc) - { - const auto temp = A(j + 1, i); - A(j + 1, i) = ctemp * temp - stemp * A(j, i); - A(j, i) = stemp * temp + ctemp * A(j, i); - } - } - } - } - - return; - }; - - if(is_side_Left && is_pivot_Top && is_direct_Forward) - { - // ----------------------------- - // A := P*A - // Top pivot, the plane (1,k+1) - // P = P(z-1) * ... * P(2) * P(1) - // ----------------------------- - { - for(I j = 2; j <= m; j++) - { - const auto ctemp = c(j - 1); - const auto stemp = s(j - 1); - for(I i = 1 + tid; i <= n; i += i_inc) - { - const auto temp = A(j, i); - A(j, i) = ctemp * temp - stemp * A(1, i); - A(1, i) = stemp * temp + ctemp * A(1, i); - }; - }; - }; - - return; - }; - - if(is_side_Left && is_pivot_Top && is_direct_Backward) - { - // ----------------------------- - // A := P*A - // Top pivot, the plane (1,k+1) - // P = P(1)*P(2)*...*P(z-1) - // ----------------------------- - { - auto const jend = m; - auto const jstart = 2; - auto const istart = 1; - auto const iend = n; - - for(I j = jend; j >= jstart; j--) - { - const auto ctemp = c(j - 1); - const auto stemp = s(j - 1); - if((ctemp != one) || (stemp != zero)) - { - for(I i = istart + tid; i <= iend; i += i_inc) - { - const auto temp = A(j, i); - - A(j, i) = ctemp * temp - stemp * A(1, i); - A(1, i) = stemp * temp + ctemp * A(1, i); - }; - }; - }; - } - - return; - }; - - if(is_side_Left && is_pivot_Bottom && is_direct_Forward) - { - // ----------------------------- - // A := P*A - // Bottom pivot, the plane (k,z) - // P = P(z-1) * ... 
* P(2) * P(1) - // ----------------------------- - { - auto const jstart = 1; - auto const jend = (m - 1); - auto const istart = 1; - auto const iend = n; - - for(I j = jstart; j <= jend; j++) - { - const auto ctemp = c(j); - const auto stemp = s(j); - if((ctemp != one) || (stemp != zero)) - { - for(I i = istart + tid; i <= iend; i += i_inc) - { - const auto temp = A(j, i); - A(j, i) = stemp * A(m, i) + ctemp * temp; - A(m, i) = ctemp * A(m, i) - stemp * temp; - }; - }; - }; - } - - return; - }; - - if(is_side_Left && is_pivot_Bottom && is_direct_Backward) - { - // ----------------------------- - // A := P*A - // Bottom pivot, the plane (k,z) - // P = P(1)*P(2)*...*P(z-1) - // ----------------------------- - { - auto const jend = (m - 1); - auto const jstart = 1; - auto const istart = 1; - auto const iend = n; - - for(I j = jend; j >= jstart; j--) - { - const auto ctemp = c(j); - const auto stemp = s(j); - if((ctemp != one) || (stemp != zero)) - { - for(I i = istart + tid; i <= iend; i += i_inc) - { - const auto temp = A(j, i); - A(j, i) = stemp * A(m, i) + ctemp * temp; - A(m, i) = ctemp * A(m, i) - stemp * temp; - }; - }; - }; - } - - return; - }; - - if(is_side_Right && is_pivot_Variable && is_direct_Forward) - { - // ----------------------------- - // A := A*P**T - // Variable pivot, the plane (k,k+1) - // P = P(z-1) * ... * P(2) * P(1) - // ----------------------------- - - { - auto const jstart = 1; - auto const jend = (n - 1); - auto const istart = 1; - auto const iend = m; - - for(I j = jstart; j <= jend; j++) - { - const auto ctemp = c(j); - const auto stemp = s(j); - if((ctemp != one) || (stemp != zero)) - { - for(I i = istart + tid; i <= iend; i += i_inc) - { - const auto temp = A(i, j + 1); - A(i, j + 1) = ctemp * temp - stemp * A(i, j); - A(i, j) = stemp * temp + ctemp * A(i, j); - }; - }; - }; - } - - return; - }; - - if(is_side_Right && is_pivot_Variable && is_direct_Backward) - { - // ----------------------------- - // A := A*P**T - // Variable pivot, the plane (k,k+1) - // P = P(1)*P(2)*...*P(z-1) - // ----------------------------- - - { - auto const jend = (n - 1); - auto const jstart = 1; - auto const istart = 1; - auto const iend = m; - - for(I j = jend; j >= jstart; j--) - { - const auto ctemp = c(j); - const auto stemp = s(j); - if((ctemp != one) || (stemp != zero)) - { - for(I i = istart + tid; i <= iend; i += i_inc) - { - const auto temp = A(i, j + 1); - A(i, j + 1) = ctemp * temp - stemp * A(i, j); - A(i, j) = stemp * temp + ctemp * A(i, j); - }; - }; - }; - } - return; - }; - - if(is_side_Right && is_pivot_Top && is_direct_Forward) - { - // ----------------------------- - // A := A*P**T - // Top pivot, the plane (1,k+1) - // P = P(z-1) * ... 
* P(2) * P(1) - // ----------------------------- - - { - auto const jstart = 2; - auto const jend = n; - auto const istart = 1; - auto const iend = m; - - for(I j = jstart; j <= jend; j++) - { - const auto ctemp = c(j - 1); - const auto stemp = s(j - 1); - if((ctemp != one) || (stemp != zero)) - { - for(I i = istart + tid; i <= iend; i += i_inc) - { - const auto temp = A(i, j); - - A(i, j) = ctemp * temp - stemp * A(i, 1); - A(i, 1) = stemp * temp + ctemp * A(i, 1); - }; - }; - }; - } - - return; - }; - - if(is_side_Right && is_pivot_Top && is_direct_Backward) - { - // ----------------------------- - // A := A*P**T - // Top pivot, the plane (1,k+1) - // P = P(1)*P(2)*...*P(z-1) - // ----------------------------- - - { - auto const jend = n; - auto const jstart = 2; - auto const istart = 1; - auto const iend = m; - - for(I j = jend; j >= jstart; j--) - { - const auto ctemp = c(j - 1); - const auto stemp = s(j - 1); - if((ctemp != one) || (stemp != zero)) - { - for(I i = istart + tid; i <= iend; i += i_inc) - { - const auto temp = A(i, j); - - A(i, j) = ctemp * temp - stemp * A(i, 1); - A(i, 1) = stemp * temp + ctemp * A(i, 1); - }; - }; - }; - } - - return; - }; - - if(is_side_Right && is_pivot_Bottom && is_direct_Forward) - { - // ----------------------------- - // A := A*P**T - // Bottom pivot, the plane (k,z) - // P = P(z-1) * ... * P(2) * P(1) - // ----------------------------- - - { - auto const jstart = 1; - auto const jend = (n - 1); - auto const istart = 1; - auto const iend = m; - - for(I j = jstart; j <= jend; j++) - { - const auto ctemp = c(j); - const auto stemp = s(j); - if((ctemp != one) || (stemp != zero)) - { - for(I i = istart + tid; i <= iend; i += i_inc) - { - const auto temp = A(i, j); - - A(i, j) = stemp * A(i, n) + ctemp * temp; - A(i, n) = ctemp * A(i, n) - stemp * temp; - }; - }; - }; - } - - return; - }; - - if(is_side_Right && is_pivot_Bottom && is_direct_Backward) - { - // ----------------------------- - // A := A*P**T - // Bottom pivot, the plane (k,z) - // P = P(1)*P(2)*...*P(z-1) - // ----------------------------- - - { - auto const jend = (n - 1); - auto const jstart = 1; - auto const istart = 1; - auto const iend = m; - - for(I j = jend; j >= jstart; j--) - { - const auto ctemp = c(j); - const auto stemp = s(j); - if((ctemp != one) || (stemp != zero)) - { - for(I i = istart + tid; i <= iend; i += i_inc) - { - const auto temp = A(i, j); - A(i, j) = stemp * A(i, n) + ctemp * temp; - A(i, n) = ctemp * A(i, n) - stemp * temp; - }; - }; - }; - } - - return; - }; - - return; -} - -template -__global__ static void __launch_bounds__(LASR_MAX_NTHREADS) lasr_kernel(char const side, - char const pivot, - char const direct, - I const m, - I const n, - S const* const c_, - S const* const s_, - T* const A_, - I const lda) -{ - const auto nblocks = hipGridDim_x; - const auto nthreads_per_block = hipBlockDim_x; - const auto nthreads = nblocks * nthreads_per_block; - I const tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; - I const i_inc = nthreads; - - lasr_body(side, pivot, direct, m, n, c_, s_, A_, lda, tid, i_inc); -} - -template -static void lasr_template_gpu(char const side, - char const pivot, - char const direct, - I const m, - I const n, - S const* const c_, - S const* const s_, - T* const A_, - I const lda, - hipStream_t stream = 0) -{ - auto const nthreads = LASR_MAX_NTHREADS; - - bool const is_left_side = (side == 'L') || (side == 'l'); - auto const mn = (is_left_side) ? 
n : m; - - auto const nblocks = (mn - 1) / nthreads + 1; - hipLaunchKernelGGL((lasr_kernel), dim3(nblocks, 1, 1), dim3(nthreads, 1, 1), 0, stream, - side, pivot, direct, m, n, c_, s_, A_, lda); -} - template __global__ static void rot_kernel(I const n, T* const x, I const incx, T* const y, I const incy, S const c, S const s) @@ -1370,16 +831,25 @@ static void call_lasv2(T& f, T& g, T& h, T& ssmin, T& ssmax, T& snr, T& csr, T& } template -static void call_lasr(char& side, char& pivot, char& direct, I& m, I& n, S& c, S& s, T& A, I& lda) +static void call_lasr(rocblas_side& side, + rocblas_pivot& pivot, + rocblas_direct& direct, + I& m, + I& n, + S& c, + S& s, + T& A, + I& lda) { I const tid = 0; I const i_inc = 1; - lasr_body(side, pivot, direct, m, n, &c, &s, &A, lda, tid, i_inc); + lasr_body(side, pivot, direct, m, n, &c, &s, &A, lda, tid, i_inc); }; template -static void bdsqr_single_template(char uplo, +static void bdsqr_single_template(rocblas_handle handle, + char uplo, I n, I ncvt, I nru, @@ -1463,32 +933,35 @@ static void bdsqr_single_template(char uplo, auto call_scal_gpu = [=](I n, auto da, T& x, I incx) { scal_template(n, da, &x, incx, stream); }; - auto call_lasr_gpu_nocopy = [=](char const side, char const pivot, char const direct, I const m, - I const n, S& dc, S& ds, T& A, I const lda, hipStream_t stream) { - bool const is_left_side = (side == 'L') || (side == 'l'); - auto const mn = (is_left_side) ? m : n; - auto const mn_m1 = (mn - 1); - - lasr_template_gpu(side, pivot, direct, m, n, &dc, &ds, &A, lda, stream); - }; - - auto call_lasr_gpu - = [=](char const side, char const pivot, char const direct, I const m, I const n, S& c, - S& s, T& A, I const lda, S* const dwork_, hipStream_t stream) { - bool const is_left_side = (side == 'L') || (side == 'l'); + auto call_lasr_gpu_nocopy + = [=](rocblas_side const side, rocblas_pivot const pivot, rocblas_direct const direct, + I const m, I const n, S& dc, S& ds, T& A, I const lda, hipStream_t stream) { + bool const is_left_side = (side == rocblas_side_left); auto const mn = (is_left_side) ? m : n; auto const mn_m1 = (mn - 1); - S* const dc = dwork_; - S* const ds = dwork_ + mn_m1; - CHECK_HIP(hipStreamSynchronize(stream)); - CHECK_HIP(hipMemcpyAsync(dc, &c, sizeof(S) * mn_m1, hipMemcpyHostToDevice, stream)); - CHECK_HIP(hipMemcpyAsync(ds, &s, sizeof(S) * mn_m1, hipMemcpyHostToDevice, stream)); - - lasr_template_gpu(side, pivot, direct, m, n, dc, ds, &A, lda, stream); - CHECK_HIP(hipStreamSynchronize(stream)); + rocsolver_lasr_template(handle, side, pivot, direct, m, n, &dc, 0, &ds, 0, &A, + 0, lda, 0, I(1)); }; + auto call_lasr_gpu = [=](rocblas_side const side, rocblas_pivot const pivot, + rocblas_direct const direct, I const m, I const n, S& c, S& s, T& A, + I const lda, S* const dwork_, hipStream_t stream) { + bool const is_left_side = (side == rocblas_side_left); + auto const mn = (is_left_side) ? 
m : n; + auto const mn_m1 = (mn - 1); + S* const dc = dwork_; + S* const ds = dwork_ + mn_m1; + CHECK_HIP(hipStreamSynchronize(stream)); + + CHECK_HIP(hipMemcpyAsync(dc, &c, sizeof(S) * mn_m1, hipMemcpyHostToDevice, stream)); + CHECK_HIP(hipMemcpyAsync(ds, &s, sizeof(S) * mn_m1, hipMemcpyHostToDevice, stream)); + + rocsolver_lasr_template(handle, side, pivot, direct, m, n, dc, 0, ds, 0, &A, 0, lda, + 0, I(1)); + CHECK_HIP(hipStreamSynchronize(stream)); + }; + auto abs = [](auto x) { return (std::abs(x)); }; auto indx2f = [](auto i, auto j, auto ld) -> int64_t { @@ -1639,9 +1112,9 @@ static void bdsqr_single_template(char uplo, if(nru > 0) { // call_lasr( 'r', 'v', 'f', nru, n, work( 1 ), work( n ), u, ldu ); - char side = 'R'; - char pivot = 'V'; - char direct = 'F'; + rocblas_side side = rocblas_side_right; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_forward_direction; if(use_gpu) { if(use_lasr_gpu_nocopy) @@ -1663,9 +1136,9 @@ static void bdsqr_single_template(char uplo, if(ncc > 0) { // call_lasr( 'l', 'v', 'f', n, ncc, work( 1 ), work( n ), c, ldc ); - char side = 'L'; - char pivot = 'V'; - char direct = 'F'; + rocblas_side side = rocblas_side_left; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_forward_direction; if(use_gpu) { if(use_lasr_gpu_nocopy) @@ -2095,9 +1568,9 @@ static void bdsqr_single_template(char uplo, { // call_lasr( 'l', 'v', 'f', m-ll+1, ncvt, work( 1 ), work( n ), vt( // ll, 1 ), ldvt ) - char side = 'L'; - char pivot = 'V'; - char direct = 'F'; + rocblas_side side = rocblas_side_left; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_forward_direction; auto mm = m - ll + 1; if(use_gpu) { @@ -2121,9 +1594,9 @@ static void bdsqr_single_template(char uplo, { // call_lasr( 'r', 'v', 'f', nru, m-ll+1, work( nm12+1 ), work( nm13+1 // ), u( 1, ll ), ldu ) - char side = 'R'; - char pivot = 'V'; - char direct = 'F'; + rocblas_side side = rocblas_side_right; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_forward_direction; auto mm = m - ll + 1; if(use_gpu) { @@ -2148,9 +1621,9 @@ static void bdsqr_single_template(char uplo, { // call_lasr( 'l', 'v', 'f', m-ll+1, ncc, work( nm12+1 ), work( nm13+1 // ), c( ll, 1 ), ldc ) - char side = 'L'; - char pivot = 'V'; - char direct = 'F'; + rocblas_side side = rocblas_side_left; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_forward_direction; auto mm = m - ll + 1; if(use_gpu) { @@ -2262,9 +1735,9 @@ static void bdsqr_single_template(char uplo, // call_lasr( 'l', 'v', 'b', m-ll+1, ncvt, work( nm12+1 ), work( // nm13+1 // ), vt( ll, 1 ), ldvt ); - char side = 'L'; - char pivot = 'V'; - char direct = 'B'; + rocblas_side side = rocblas_side_left; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_backward_direction; auto mm = m - ll + 1; if(use_gpu) { @@ -2290,9 +1763,9 @@ static void bdsqr_single_template(char uplo, // call_lasr( 'r', 'v', 'b', nru, m-ll+1, work( 1 ), work( n ), u( 1, // ll // ), ldu ) - char side = 'R'; - char pivot = 'V'; - char direct = 'B'; + rocblas_side side = rocblas_side_right; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_backward_direction; auto mm = m - ll + 1; if(use_gpu) { @@ -2317,9 +1790,9 @@ static void bdsqr_single_template(char uplo, // call_lasr( 'l', 'v', 'b', m-ll+1, ncc, work( 1 ), work( n ), c( ll, // 1 // ), ldc ) - char side = 'L'; - char pivot = 'V'; - char 
direct = 'B'; + rocblas_side side = rocblas_side_left; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_backward_direction; auto mm = m - ll + 1; if(use_gpu) { @@ -2439,9 +1912,9 @@ static void bdsqr_single_template(char uplo, { // call_lasr( 'l', 'v', 'f', m-ll+1, ncvt, work( 1 ), work( n ), vt( // ll, 1 ), ldvt ) - char side = 'L'; - char pivot = 'V'; - char direct = 'F'; + rocblas_side side = rocblas_side_left; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_forward_direction; auto mm = m - ll + 1; if(use_gpu) { @@ -2466,9 +1939,9 @@ static void bdsqr_single_template(char uplo, { // call_lasr( 'r', 'v', 'f', nru, m-ll+1, work( nm12+1 ), work( nm13+1 // ), u( 1, ll ), ldu ) - char side = 'R'; - char pivot = 'V'; - char direct = 'F'; + rocblas_side side = rocblas_side_right; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_forward_direction; auto mm = m - ll + 1; if(use_gpu) { @@ -2493,9 +1966,9 @@ static void bdsqr_single_template(char uplo, { // call_lasr( 'l', 'v', 'f', m-ll+1, ncc, work( nm12+1 ), work( nm13+1 // ), c( ll, 1 ), ldc ) - char side = 'L'; - char pivot = 'V'; - char direct = 'F'; + rocblas_side side = rocblas_side_left; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_forward_direction; auto mm = m - ll + 1; if(use_gpu) { @@ -2616,9 +2089,9 @@ static void bdsqr_single_template(char uplo, // call_lasr( 'l', 'v', 'b', m-ll+1, ncvt, work( nm12+1 ), work( // nm13+1 // ), vt( ll, 1 ), ldvt ) - char side = 'L'; - char pivot = 'V'; - char direct = 'B'; + rocblas_side side = rocblas_side_left; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_backward_direction; auto mm = m - ll + 1; if(use_gpu) { @@ -2644,9 +2117,9 @@ static void bdsqr_single_template(char uplo, // call_lasr( 'r', 'v', 'b', nru, m-ll+1, work( 1 ), work( n ), u( 1, // ll // ), ldu ) - char side = 'R'; - char pivot = 'V'; - char direct = 'B'; + rocblas_side side = rocblas_side_right; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_backward_direction; auto mm = m - ll + 1; if(use_gpu) { @@ -2671,9 +2144,9 @@ static void bdsqr_single_template(char uplo, // call_lasr( 'l', 'v', 'b', m-ll+1, ncc, work( 1 ), work( n ), c( ll, // 1 // ), ldc ) - char side = 'L'; - char pivot = 'V'; - char direct = 'B'; + rocblas_side side = rocblas_side_left; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_backward_direction; auto mm = m - ll + 1; if(use_gpu) { @@ -3045,13 +2518,8 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, I ncvt = nv; bool const values_only = (ncvt == 0) && (nru == 0) && (ncc == 0); - bdsqr_single_template(uplo, n, ncvt, nru, ncc, - - d_, e_, - - vt_, ldvt, u_, ldu, c_, ldc, - - work_, info, dwork_, stream); + bdsqr_single_template(handle, uplo, n, ncvt, nru, ncc, d_, e_, vt_, ldvt, u_, ldu, + c_, ldc, work_, info, dwork_, stream); if(info == 0) { From c3af154ba26102f70b83435a98e2601e4b99d7cb Mon Sep 17 00:00:00 2001 From: Troy Alderson <58866654+tfalders@users.noreply.github.com> Date: Tue, 12 Nov 2024 11:43:06 -0700 Subject: [PATCH 28/35] Minor docs fix --- library/include/rocsolver/rocsolver-extra-types.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/library/include/rocsolver/rocsolver-extra-types.h b/library/include/rocsolver/rocsolver-extra-types.h index ee2ceb72c..3335f8f3d 100644 --- 
a/library/include/rocsolver/rocsolver-extra-types.h +++ b/library/include/rocsolver/rocsolver-extra-types.h @@ -164,6 +164,8 @@ typedef enum rocsolver_rfinfo_mode_ = 272, /**< To work with Cholesky factorization (for symmetric positive definite sparse matrices). */ } rocsolver_rfinfo_mode; +/*! \brief Used to specify the planes on which a sequence of Givens rotations is applied. + ********************************************************************************/ typedef enum rocblas_pivot_ { rocblas_pivot_variable = 281, /**< The i-th rotation is applied on plane (i,i+1). */ @@ -187,7 +189,5 @@ typedef enum rocsolver_function_ rocsolver_function_bdsqr = 401, rocsolver_function_gesvd = 402, } rocsolver_function; -/*! \brief Used to specify the planes on which a sequence of Givens rotations is applied. - ********************************************************************************/ #endif /* ROCSOLVER_EXTRA_TYPES_H */ From f4bbed3ba448be33cffa9dc3e120771e1cfcbd5d Mon Sep 17 00:00:00 2001 From: Troy Alderson <58866654+tfalders@users.noreply.github.com> Date: Tue, 12 Nov 2024 15:02:22 -0700 Subject: [PATCH 29/35] Fix benchmark argument consumption error --- clients/common/misc/rocsolver_arguments.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/clients/common/misc/rocsolver_arguments.hpp b/clients/common/misc/rocsolver_arguments.hpp index ce6bab452..5e8f14f9b 100644 --- a/clients/common/misc/rocsolver_arguments.hpp +++ b/clients/common/misc/rocsolver_arguments.hpp @@ -114,6 +114,7 @@ class Arguments : private std::map to_consume.erase("batch_count"); to_consume.erase("verify"); to_consume.erase("iters"); + to_consume.erase("alg_mode"); to_consume.erase("mem_query"); to_consume.erase("profile"); to_consume.erase("profile_kernels"); From dbb759ae78bb17392f8493ea806851713482e80c Mon Sep 17 00:00:00 2001 From: Juan Zuniga-Anaya <50754207+jzuniga-amd@users.noreply.github.com> Date: Thu, 14 Nov 2024 10:34:21 -0700 Subject: [PATCH 30/35] review corrections/suggestions 1 (#15) --- clients/benchmarks/client.cpp | 4 ++-- clients/common/misc/rocsolver_arguments.hpp | 1 - docs/reference/auxiliary.rst | 2 +- docs/reference/helpers.rst | 13 +++++++++++-- docs/reference/lapacklike.rst | 2 +- docs/reference/tuning.rst | 14 +++++++++----- docs/reference/types.rst | 4 ++-- docs/sphinx/_toc.yml.in | 2 +- .../include/rocsolver/rocsolver-functions.h | 18 +++++++++--------- .../auxiliary/rocauxiliary_bdsqr_hybrid.hpp | 2 +- 10 files changed, 37 insertions(+), 25 deletions(-) diff --git a/clients/benchmarks/client.cpp b/clients/benchmarks/client.cpp index 378bd99d1..16a244199 100644 --- a/clients/benchmarks/client.cpp +++ b/clients/benchmarks/client.cpp @@ -127,9 +127,9 @@ try ("alg_mode", value(&argus.alg_mode)->default_value(0), - "Select different algorithm mode? 0 = GPU-only, 1 = Hybrid\n" + "0 = GPU-only, 1 = Hybrid\n" " This will change how the algorithm operates.\n" - " Only applicable to functions with hybrid support." 
+ " Only applicable to functions with hybrid support.\n" " ") ("mem_query", diff --git a/clients/common/misc/rocsolver_arguments.hpp b/clients/common/misc/rocsolver_arguments.hpp index 5e8f14f9b..a8ffd9e73 100644 --- a/clients/common/misc/rocsolver_arguments.hpp +++ b/clients/common/misc/rocsolver_arguments.hpp @@ -33,7 +33,6 @@ #include #include #include -#include #include "program_options.hpp" diff --git a/docs/reference/auxiliary.rst b/docs/reference/auxiliary.rst index 83503e7e2..8bebbf116 100644 --- a/docs/reference/auxiliary.rst +++ b/docs/reference/auxiliary.rst @@ -58,7 +58,7 @@ rocsolver_lacgv() .. doxygenfunction:: rocsolver_zlacgv_64 :outline: .. doxygenfunction:: rocsolver_clacgv_64 - :outline + :outline: .. doxygenfunction:: rocsolver_zlacgv :outline: .. doxygenfunction:: rocsolver_clacgv diff --git a/docs/reference/helpers.rst b/docs/reference/helpers.rst index b643b6807..f8051f03f 100644 --- a/docs/reference/helpers.rst +++ b/docs/reference/helpers.rst @@ -8,7 +8,16 @@ rocSOLVER Library and Logging Functions ***************************************************** +These are helper functions that retrieve information and control some functions of the library. +The helper functions are divided into the following categories: +* :ref:`lib_info`. These functions return information about the library version. +* :ref:`algo_select`. Functions to select different algorithm modes of certain APIs. +* :ref:`api_logging`. These functions control the :ref:`logging-label` capabilities. + + + +.. _lib_info: Library information =============================== @@ -27,6 +36,8 @@ rocsolver_get_version_string_size() +.. _algo_select: + Algorithm selection =============================== @@ -49,8 +60,6 @@ rocsolver_get_alg_mode() Logging functions =============================== -These functions control rocSOLVER's :ref:`logging-label` capabilities. - .. contents:: List of logging functions :local: :backlinks: top diff --git a/docs/reference/lapacklike.rst b/docs/reference/lapacklike.rst index b6b2823c5..68b6881f3 100644 --- a/docs/reference/lapacklike.rst +++ b/docs/reference/lapacklike.rst @@ -547,7 +547,7 @@ rocsolver_syevdx_strided_batched() rocsolver_heevdx() --------------------------------------------------- .. doxygenfunction:: rocsolver_zheevdx - :outline + :outline: .. doxygenfunction:: rocsolver_cheevdx rocsolver_heevdx_batched() diff --git a/docs/reference/tuning.rst b/docs/reference/tuning.rst index 82cba73d2..8b59e356a 100644 --- a/docs/reference/tuning.rst +++ b/docs/reference/tuning.rst @@ -180,14 +180,18 @@ GEBRD_GEBD2_SWITCHSIZE bdsqr function ================== -The Singular Value Decomposition of a bidiagonal matrix could be sped up by splitting the matrix into diagonal blocks -and processing those blocks in parallel. +The Singular Value Decomposition of a bidiagonal matrix could be executed with one or multiple thread blocks, and it is +a blocking API that requires synchronization with the host. -BDSQR_SPLIT_GROUPS +BDSQR_SWITCH_SIZE ------------------- -.. doxygendefine:: BDSQR_SPLIT_GROUPS +.. doxygendefine:: BDSQR_SWITCH_SIZE -(As of the current rocSOLVER release, this constant has not been tuned for any specific cases.) +BDSQR_ITERS_PER_SYNC +---------------------- +.. doxygendefine:: BDSQR_ITERS_PER_SYNC + +(As of the current rocSOLVER release, this constants have not been tuned for any specific cases.) 
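For reference, a minimal host-side sketch of how a caller is expected to drive the algorithm-mode API documented in the helpers.rst and rocsolver-functions.h hunks of this series. It assumes only the rocsolver_set_alg_mode / rocsolver_get_alg_mode prototypes and the rocsolver_function_* / rocsolver_alg_mode_* enums added here, plus the standard rocBLAS handle calls; the error handling is illustrative rather than prescriptive.

#include <rocsolver/rocsolver.h> // pulls in the rocBLAS handle and status types

int main()
{
    rocblas_handle handle;
    if(rocblas_create_handle(&handle) != rocblas_status_success)
        return 1;

    // Opt this handle into the hybrid (CPU+GPU) path for bdsqr; in this series
    // gesvd and bdsqr share the same mode, so rocsolver_function_gesvd works too.
    rocblas_status st
        = rocsolver_set_alg_mode(handle, rocsolver_function_bdsqr, rocsolver_alg_mode_hybrid);

    // Read the mode back; on a handle where no mode was ever set, the getter
    // reports the default rocsolver_alg_mode_gpu.
    rocsolver_alg_mode mode;
    if(st == rocblas_status_success)
        st = rocsolver_get_alg_mode(handle, rocsolver_function_bdsqr, &mode);

    // ... call rocsolver_<T>bdsqr / rocsolver_<T>gesvd with this handle as usual ...

    rocblas_destroy_handle(handle);
    return (st == rocblas_status_success) ? 0 : 1;
}

Because the selection lives in the handle's data pointer rather than in the public bdsqr/gesvd signatures, existing callers keep compiling unchanged and simply stay on the default GPU-only path unless they opt in.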
diff --git a/docs/reference/types.rst b/docs/reference/types.rst index 4d881cf18..0fb8d716c 100644 --- a/docs/reference/types.rst +++ b/docs/reference/types.rst @@ -73,8 +73,8 @@ rocsolver_rfinfo_mode rocsolver_alg_mode ------------------------ -.. doxygentypedef:: rocsolver_alg_mode +.. doxygenenum:: rocsolver_alg_mode rocsolver_function ------------------------ -.. doxygentypedef:: rocsolver_function +.. doxygenenum:: rocsolver_function diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index e5d3606ce..6ad32dfe3 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -23,7 +23,7 @@ subtrees: - file: reference/lapack.rst - file: reference/lapacklike.rst - file: reference/refact.rst - - file: reference/logging.rst + - file: reference/helpers.rst - file: reference/tuning.rst - file: reference/deprecated.rst - file: license.rst diff --git a/library/include/rocsolver/rocsolver-functions.h b/library/include/rocsolver/rocsolver-functions.h index 0c7bfd417..3f340ea70 100644 --- a/library/include/rocsolver/rocsolver-functions.h +++ b/library/include/rocsolver/rocsolver-functions.h @@ -147,7 +147,7 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_log_flush_profile(void); * =========================================================================== */ -/*! \brief SET_ALG_MODE sets the algorithm mode to be used by a specific function. +/*! \brief SET_ALG_MODE sets the algorithm mode to be used by the specified function. @param[in] handle rocblas_handle. @@ -163,17 +163,17 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_set_alg_mode(rocblas_handle handle, const rocsolver_function func, const rocsolver_alg_mode mode); -/*! \brief GET_ALG_MODE gets the algorithm mode selected for use by a specific function. +/*! \brief GET_ALG_MODE gets the algorithm mode being used by the specified function. @param[in] handle rocblas_handle. @param[in] func #rocsolver_function. - A function. + The specified function. @param[out] mode pointer to #rocsolver_alg_mode. - On exit, the value is overwritten by the algorithm mode that will - be used by the specified function. + On exit, the value is overwritten by the algorithm mode used + by the specified function. *************************************************************************/ ROCSOLVER_EXPORT rocblas_status rocsolver_get_alg_mode(rocblas_handle handle, @@ -3871,7 +3871,7 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_zunmtr(rocblas_handle handle, rocblas_handle. \note - A hybrid (CPU+GPU) approach is available for BDSQR. Use \ref rocsolver_set_alg_mode "SET_ALG_MODE" + A hybrid (CPU+GPU) approach is available for BDSQR. Use \ref rocsolver_set_alg_mode to enable it. @param[in] @@ -12613,7 +12613,7 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_zpotri_strided_batched(rocblas_handle within the rocblas_handle. \note - A hybrid (CPU+GPU) approach is available for GESVD. Use \ref rocsolver_set_alg_mode "SET_ALG_MODE" + A hybrid (CPU+GPU) approach is available for GESVD. Use \ref rocsolver_set_alg_mode to enable it. @param[in] @@ -12794,7 +12794,7 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_zgesvd(rocblas_handle handle, within the rocblas_handle. \note - A hybrid (CPU+GPU) approach is available for GESVD_BATCHED. Use \ref rocsolver_set_alg_mode "SET_ALG_MODE" + A hybrid (CPU+GPU) approach is available for GESVD_BATCHED. Use \ref rocsolver_set_alg_mode to enable it. @param[in] @@ -13016,7 +13016,7 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_zgesvd_batched(rocblas_handle handle, within the rocblas_handle. 
\note - A hybrid (CPU+GPU) approach is available for GESVD_STRIDED_BATCHED. Use \ref rocsolver_set_alg_mode "SET_ALG_MODE" + A hybrid (CPU+GPU) approach is available for GESVD_STRIDED_BATCHED. Use \ref rocsolver_set_alg_mode to enable it. @param[in] diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp index 797d40140..180a4fced 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp @@ -5,7 +5,7 @@ * Univ. of Tennessee, Univ. of California Berkeley, * Univ. of Colorado Denver and NAG Ltd.. * June 2017 - * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions From 4c6e9e013c3aaabc488f3627cb0fc8936eace6e2 Mon Sep 17 00:00:00 2001 From: Juan Zuniga-Anaya <50754207+jzuniga-amd@users.noreply.github.com> Date: Thu, 14 Nov 2024 22:48:58 -0700 Subject: [PATCH 31/35] code cleaning and rearrangement (#16) * review corrections/suggestions 1 * code cleaning and rearrangement * address review comments --- library/src/auxiliary/rocauxiliary_bdsqr.hpp | 16 +- .../auxiliary/rocauxiliary_bdsqr_hybrid.hpp | 1409 +++-------------- .../src/include/lapack_device_functions.hpp | 92 ++ library/src/include/lapack_host_functions.hpp | 596 +++++++ library/src/include/lib_device_helpers.hpp | 49 + library/src/include/lib_host_helpers.hpp | 76 + 6 files changed, 1032 insertions(+), 1206 deletions(-) create mode 100644 library/src/include/lapack_host_functions.hpp diff --git a/library/src/auxiliary/rocauxiliary_bdsqr.hpp b/library/src/auxiliary/rocauxiliary_bdsqr.hpp index 89397dbc8..c19ae4f9b 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr.hpp @@ -1254,15 +1254,15 @@ rocblas_status rocsolver_bdsqr_template(rocblas_handle handle, ROCSOLVER_LAUNCH_KERNEL((bdsqr_init), gridBasic, threadsBasic, 0, stream, n, D, strideD, E, strideE, info, maxiter, sfm, tol, splits_map, work, strideW, completed); - if(alg_mode == rocsolver_alg_mode_hybrid) + if(n > 1) { - ROCBLAS_CHECK(rocsolver_bdsqr_host_batch_template( - handle, uplo, n, nv, nu, nc, D, strideD, E, strideE, V, shiftV, ldv, strideV, U, shiftU, - ldu, strideU, C, shiftC, ldc, strideC, info, batch_count, splits_map, work)); - } - else - { - if(n > 1) + if(alg_mode == rocsolver_alg_mode_hybrid) + { + ROCBLAS_CHECK(rocsolver_bdsqr_host_batch_template( + handle, uplo, n, nv, nu, nc, D, strideD, E, strideE, V, shiftV, ldv, strideV, U, + shiftU, ldu, strideU, C, shiftC, ldc, strideC, info, batch_count, splits_map, work)); + } + else { // rotate to upper bidiagonal if necessary if(uplo == rocblas_fill_lower) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp index 180a4fced..56870cb44 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp @@ -33,803 +33,11 @@ #pragma once -#include -#include -#include -#include -#include - +#include "lapack_host_functions.hpp" #include "rocauxiliary_lasr.hpp" -#include "hip/hip_runtime.h" -#include "hip/hip_runtime_api.h" - ROCSOLVER_BEGIN_NAMESPACE -#ifndef CHECK_HIP -#define CHECK_HIP(fcn) \ - { \ - hipError_t const istat = (fcn); \ - assert(istat == hipSuccess); \ - } -#endif - -template 
-__global__ static void - rot_kernel(I const n, T* const x, I const incx, T* const y, I const incy, S const c, S const s) -{ - if(n <= 0) - return; - - I const i_start = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; - I const i_inc = hipBlockDim_x * hipGridDim_x; - - if((incx == 1) && (incy == 1)) - { - // ------------ - // special case - // ------------ - for(I i = i_start; i < n; i += i_inc) - { - auto const temp = c * x[i] + s * y[i]; - y[i] = c * y[i] - s * x[i]; - x[i] = temp; - } - } - else - { - // --------------------------- - // code for unequal increments - // --------------------------- - - for(auto i = i_start; i < n; i += i_inc) - { - auto const ix = 0 + i * static_cast(incx); - auto const iy = 0 + i * static_cast(incy); - auto const temp = c * x[ix] + s * y[iy]; - y[iy] = c * y[iy] - s * x[ix]; - x[ix] = temp; - } - } -} - -template -static void - rot_template(I const n, T* x, I const incx, T* y, I const incy, S const c, S const s, hipStream_t stream) -{ - auto nthreads = warpSize * 2; - auto nblocks = (n - 1) / nthreads + 1; - - hipLaunchKernelGGL((rot_kernel), dim3(nblocks, 1, 1), dim3(nthreads, 1, 1), 0, stream, - n, x, incx, y, incy, c, s); -} - -template -__global__ static void scal_kernel(I const n, S const da, T* const x, I const incx) -{ - if(n <= 0) - return; - - I const i_start = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; - I const i_inc = hipBlockDim_x * hipGridDim_x; - - S const zero = 0; - bool const is_da_zero = (da == zero); - if(incx == 1) - { - for(I i = i_start; i < n; i += i_inc) - { - x[i] = (is_da_zero) ? zero : da * x[i]; - } - } - else - { - // --------------------------- - // code for non-unit increments - // --------------------------- - - for(I i = i_start; i < n; i += i_inc) - { - auto const ix = 0 + i * static_cast(incx); - x[ix] = (is_da_zero) ? zero : da * x[ix]; - } - } -} - -template -static void scal_template(I const n, S const da, T* const x, I const incx, hipStream_t stream) -{ - auto nthreads = warpSize * 2; - auto nblocks = (n - 1) / nthreads + 1; - - hipLaunchKernelGGL((scal_kernel), dim3(nblocks, 1, 1), dim3(nthreads, 1, 1), 0, stream, - n, da, x, incx); -} - -template -__global__ static void swap_kernel(I const n, T* const x, I const incx, T* const y, I const incy) -{ - if(n <= 0) - return; - - I const i_start = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; - I const i_inc = hipBlockDim_x * hipGridDim_x; - - if((incx == 1) && (incy == 1)) - { - // ------------ - // special case - // ------------ - for(I i = i_start; i < n; i += i_inc) - { - auto const temp = y[i]; - y[i] = x[i]; - x[i] = temp; - } - } - else - { - // --------------------------- - // code for unequal increments - // --------------------------- - - for(I i = i_start; i < n; i += i_inc) - { - auto const ix = 0 + i * static_cast(incx); - auto const iy = 0 + i * static_cast(incy); - - auto const temp = y[iy]; - y[iy] = x[ix]; - x[ix] = temp; - } - } -} - -template -static void swap_template(I const n, T* x, I const incx, T* y, I const incy, hipStream_t stream) -{ - auto nthreads = warpSize * 2; - auto nblocks = (n - 1) / nthreads + 1; - - hipLaunchKernelGGL((swap_kernel), dim3(nblocks, 1, 1), dim3(nthreads, 1, 1), 0, stream, - n, x, incx, y, incy); -} - -static void call_lamch(char& cmach, double& eps) -{ - eps = ((cmach == 'E') || (cmach == 'e')) ? std::numeric_limits::epsilon() - : ((cmach == 'S') || (cmach == 's')) ? std::numeric_limits::min() - : ((cmach == 'B') || (cmach == 's')) ? 
FLT_RADIX - : std::numeric_limits::min(); -} - -static void call_lamch(char& cmach, float& eps) -{ - eps = ((cmach == 'E') || (cmach == 'e')) ? std::numeric_limits::epsilon() - : ((cmach == 'S') || (cmach == 's')) ? std::numeric_limits::min() - : ((cmach == 'B') || (cmach == 's')) ? FLT_RADIX - : std::numeric_limits::min(); -} - -template -static void call_swap(I& n, T& x_in, I& incx, T& y_in, I& incy) -{ - T* const x = &(x_in); - T* const y = &(y_in); - for(I i = 0; i < n; i++) - { - I const ix = i * incx; - I const iy = i * incy; - - T const temp = x[ix]; - x[ix] = y[iy]; - y[iy] = temp; - } -} - -template -static void call_las2(T& f, T& g, T& h, T& ssmin, T& ssmax) -{ - T const zero = 0; - T const one = 1; - T const two = 2; - - T as, at, au, c, fa, fhmn, fhmx, ga, ha; - - auto abs = [](auto x) { return (std::abs(x)); }; - auto min = [](auto x, auto y) { return ((x < y) ? x : y); }; - auto max = [](auto x, auto y) { return ((x > y) ? x : y); }; - auto sqrt = [](auto x) { return (std::sqrt(x)); }; - auto square = [](auto x) { return (x * x); }; - - fa = abs(f); - ga = abs(g); - ha = abs(h); - fhmn = min(fa, ha); - fhmx = max(fa, ha); - if(fhmn == zero) - { - ssmin = zero; - if(fhmx == zero) - { - ssmax = ga; - } - else - { - // ssmax = max( fhmx, ga )*sqrt( one+ ( min( fhmx, ga ) / max( fhmx, ga ) )**2 ); - ssmax = max(fhmx, ga) * sqrt(one + square(min(fhmx, ga) / max(fhmx, ga))); - } - } - else - { - if(ga < fhmx) - { - as = one + fhmn / fhmx; - at = (fhmx - fhmn) / fhmx; - au = square(ga / fhmx); - c = two / (sqrt(as * as + au) + sqrt(at * at + au)); - ssmin = fhmn * c; - ssmax = fhmx / c; - } - else - { - au = fhmx / ga; - if(au == zero) - { - // - // avoid possible harmful underflow if exponent range - // asymmetric (true ssmin may not underflow even if - // au underflows) - // - ssmin = (fhmn * fhmx) / ga; - ssmax = ga; - } - else - { - as = one + fhmn / fhmx; - at = (fhmx - fhmn) / fhmx; - // c = one / ( sqrt( one+( as*au )**2 )+ sqrt( one+( at*au )**2 ) ); - c = one / (sqrt(one + square(as * au)) + sqrt(one + square(at * au))); - ssmin = (fhmn * c) * au; - ssmin = ssmin + ssmin; - ssmax = ga / (c + c); - } - } - } -} - -static float real_part(float z) -{ - return (z); -}; -static float real_part(std::complex z) -{ - return (z.real()); -}; -static float real_part(rocblas_complex_num z) -{ - return (z.real()); -}; - -static double real_part(double z) -{ - return (z); -}; -static double real_part(std::complex z) -{ - return (z.real()); -}; -static double real_part(rocblas_complex_num z) -{ - return (z.real()); -}; - -static float imag_part(float z) -{ - return (0); -}; -static float imag_part(std::complex z) -{ - return (z.imag()); -}; -static float imag_part(rocblas_complex_num z) -{ - return (z.imag()); -}; - -static double imag_part(double z) -{ - return (0); -}; -static double imag_part(std::complex z) -{ - return (z.imag()); -}; -static double imag_part(rocblas_complex_num z) -{ - return (z.imag()); -}; - -template -static void call_lartg(T& f, T& g, S& cs, T& sn, T& r) -{ - // ------------------------------------------------------ - // lartg generates a plane rotation so that - // [ cs sn ] * [ f ] = [ r ] - // [ -sn cs ] [ g ] [ 0 ] - // - // where cs * cs + abs(sn)*abs(sn) == 1 - // if g == 0, then cs == 1, sn == 0 - // if f == 0, then cs = 0, sn is chosen so that r is real - // ------------------------------------------------------ - - auto Not = [](bool x) { return (!x); }; - auto abs = [](auto x) { return (std::abs(x)); }; - auto dble = [](auto z) { return 
(static_cast(real_part(z))); }; - auto dimag = [](auto z) { return (static_cast(imag_part(z))); }; - auto log = [](auto x) { return (std::log(x)); }; - auto sqrt = [](auto x) { return (std::sqrt(x)); }; - auto max = [](auto x, auto y) { return ((x > y) ? x : y); }; - auto disnan = [](auto x) -> bool { return (isnan(x)); }; - auto dcmplx = [](auto x, auto y) -> T { - bool constexpr is_complex_type - = !(std::is_same::value || std::is_same::value); - - if constexpr(is_complex_type) - { - return (T(x, y)); - } - else - { - return (T(x)); - }; - }; - auto dconjg = [&](auto z) { return (dcmplx(dble(z), -dimag(z))); }; - - auto square = [](auto x) { return (x * x); }; - - auto abs1 = [&](auto ff) { return (max(abs(dble(ff)), abs(dimag(ff)))); }; - auto abssq = [&](auto ff) { return (square(dble(ff)) + square(dimag(ff))); }; - - // ----------------------------------------- - // compute sqrt( x * x + y * y ) - // without unnecessary overflow or underflow - // ----------------------------------------- - auto dlapy2 = [&](auto x, auto y) { - auto const one = 1; - auto const zero = 0; - - auto ddlapy2 = x; - bool const x_is_nan = disnan(x); - bool const y_is_nan = disnan(y); - if(x_is_nan) - ddlapy2 = x; - if(y_is_nan) - ddlapy2 = y; - - if(Not(x_is_nan || y_is_nan)) - { - auto const xabs = abs(x); - auto const yabs = abs(y); - auto const w = max(xabs, yabs); - auto const z = min(xabs, yabs); - if(z == zero) - { - ddlapy2 = w; - } - else - { - ddlapy2 = w * sqrt(one + square(z / w)); - } - } - return (ddlapy2); - }; - - char cmach = 'E'; - S const zero = 0; - S const one = 1; - S const two = 2; - T const czero = 0; - - bool has_work; - bool first; - int count, i; - S d, di, dr, eps, f2, f2s, g2, g2s, safmin; - S safmn2, safmx2, scale; - T ff, fs, gs; - - // safmin = dlamch( 's' ) - cmach = 'S'; - call_lamch(cmach, safmin); - - // eps = dlamch( 'e' ) - cmach = 'E'; - call_lamch(cmach, eps); - - // safmn2 = dlamch( 'b' )**int( log( safmin / eps ) / log( dlamch( 'b' ) ) / two ) - cmach = 'B'; - S radix = 2; - call_lamch(cmach, radix); - - int const npow = (log(safmin / eps) / log(radix) / two); - safmn2 = std::pow(radix, npow); - safmx2 = one / safmn2; - scale = max(abs1(f), abs1(g)); - fs = f; - gs = g; - count = 0; - - if(scale >= safmx2) - { - L10: - do - { - count = count + 1; - fs = fs * safmn2; - gs = gs * safmn2; - scale = scale * safmn2; - // if( (scale >= safmx2) && (count < 20) ) go to L10 - has_work = ((scale >= safmx2) && (count < 20)); - } while(has_work); - } - else - { - if(scale <= safmn2) - { - if((g == czero) || disnan(abs(g))) - { - cs = one; - sn = czero; - r = f; - return; - } - L20: - do - { - count = count - 1; - fs = fs * safmx2; - gs = gs * safmx2; - scale = scale * safmx2; - // if( scale <= safmn2 ) goto L20; - has_work = (scale <= safmn2); - } while(has_work); - } - f2 = abssq(fs); - g2 = abssq(gs); - if(f2 <= max(g2, one) * safmin) - { - // - // this is a rare case: f is very small. - // - if(f == czero) - { - cs = zero; - r = dlapy2(dble(g), dimag(g)); - // do complex/real division explicitly with two real divisions - d = dlapy2(dble(gs), dimag(gs)); - sn = dcmplx(dble(gs) / d, -dimag(gs) / d); - return; - } - f2s = dlapy2(dble(fs), dimag(fs)); - // g2 and g2s are accurate - // g2 is at least safmin, and g2s is at least safmn2 - g2s = sqrt(g2); - // error in cs from underflow in f2s is at most - // unfl / safmn2 < sqrt(unfl*eps) .lt. 
eps - // if max(g2,one)=g2, then f2 < g2*safmin, - // and so cs < sqrt(safmin) - // if max(g2,one)=one, then f2 < safmin - // and so cs < sqrt(safmin)/safmn2 = sqrt(eps) - // therefore, cs = f2s/g2s / sqrt( 1 + (f2s/g2s)**2 ) = f2s/g2s - cs = f2s / g2s; - // make sure abs(ff) = 1 - // do complex/real division explicitly with 2 real divisions - if(abs1(f) > one) - { - d = dlapy2(dble(f), dimag(f)); - ff = dcmplx(dble(f) / d, dimag(f) / d); - } - else - { - dr = safmx2 * dble(f); - di = safmx2 * dimag(f); - d = dlapy2(dr, di); - ff = dcmplx(dr / d, di / d); - } - sn = ff * dcmplx(dble(gs) / g2s, -dimag(gs) / g2s); - r = cs * f + sn * g; - } - else - { - // - // this is the most common case. - // neither f2 nor f2/g2 are less than safmin - // f2s cannot overflow, and it is accurate - // - f2s = sqrt(one + g2 / f2); - // do the f2s(real)*fs(complex) multiply with two real multiplies - r = dcmplx(f2s * dble(fs), f2s * dimag(fs)); - cs = one / f2s; - d = f2 + g2; - // do complex/real division explicitly with two real divisions - sn = dcmplx(dble(r) / d, dimag(r) / d); - sn = sn * dconjg(gs); - if(count != 0) - { - if(count > 0) - { - for(i = 1; i <= count; i++) - { - r = r * safmx2; - }; - } - else - { - for(i = 1; i <= -count; i++) - { - r = r * safmn2; - } - } - } - } - } -} - -template -static void call_scal(I& n, S& a, T& x_in, I& incx) -{ - bool const is_zero = (a == 0); - T* const x = &x_in; - for(I i = 0; i < n; i++) - { - auto const ip = i * incx; - if(is_zero) - { - x[ip] = 0; - } - else - { - x[ip] *= a; - } - }; -} - -template -static void call_rot(I& n, T& x_in, I& incx, T& y_in, I& incy, S& c, S& s) -{ - T* const x = &(x_in); - T* const y = &(y_in); - - for(I i = 0; i < n; i++) - { - auto const ix = i * incx; - auto const iy = i * incy; - - auto const temp = c * x[ix] + s * y[iy]; - y[iy] = c * y[iy] - s * x[ix]; - x[ix] = temp; - } -} - -// -------------------------------------------------------- -// lasv2 computes the singular value decomposition of a 2 x 2 -// triangular matrix -// [ F G ] -// [ 0 H ] -// -// on return, -// abs(ssmax) is the larger singular value, -// abs(ssmin) is the smaller singular value, -// (csl,snl) and (csr,snr) are the left and right -// singular vectors for abs(ssmax) -// -// [ csl snl] [ F G ] [ csr -snr] = [ ssmax 0 ] -// [-snl csl] [ 0 H ] [ snr csr] [ 0 ssmin ] -// -------------------------------------------------------- -template -static void call_lasv2(T& f, T& g, T& h, T& ssmin, T& ssmax, T& snr, T& csr, T& snl, T& csl) -{ - T const zero = 0; - T const one = 1; - T const two = 2; - T const four = 4; - T const half = one / two; - - bool gasmal; - bool swap; - int pmax; - char cmach; - - T a, clt, crt, d, fa, ft, ga, gt, ha, ht, l, m; - T mm, r, s, slt, srt, t, temp, tsign, tt; - T macheps; - - auto abs = [](auto x) { return (std::abs(x)); }; - auto sqrt = [](auto x) { return (std::sqrt(x)); }; - auto sign = [](auto a, auto b) { - auto const abs_a = std::abs(a); - return ((b >= 0) ? 
abs_a : -abs_a); - }; - - ft = f; - fa = abs(ft); - ht = h; - ha = abs(h); - // - // pmax points to the maximum absolute element of matrix - // pmax = 1 if f largest in absolute values - // pmax = 2 if g largest in absolute values - // pmax = 3 if h largest in absolute values - // - pmax = 1; - swap = (ha > fa); - if(swap) - { - pmax = 3; - temp = ft; - ft = ht; - ht = temp; - temp = fa; - fa = ha; - ha = temp; - // - // now fa >= ha - // - } - gt = g; - ga = abs(gt); - if(ga == zero) - { - // - // diagonal matrix - // - ssmin = ha; - ssmax = fa; - clt = one; - crt = one; - slt = zero; - srt = zero; - } - else - { - gasmal = true; - if(ga > fa) - { - pmax = 2; - - cmach = 'E'; - call_lamch(cmach, macheps); - - if((fa / ga) < macheps) - { - // - // case of very large ga - // - gasmal = false; - ssmax = ga; - if(ha > one) - { - ssmin = fa / (ga / ha); - } - else - { - ssmin = (fa / ga) * ha; - } - clt = one; - slt = ht / gt; - srt = one; - crt = ft / gt; - } - } - if(gasmal) - { - // - // normal case - // - d = fa - ha; - if(d == fa) - { - // - // copes with infinite f or h - // - l = one; - } - else - { - l = d / fa; - } - // - // note that 0 <= l <= 1 - // - m = gt / ft; - // - // note that abs(m) <= 1/macheps - // - t = two - l; - // - // note that t >= 1 - // - mm = m * m; - tt = t * t; - s = sqrt(tt + mm); - // - // note that 1 <= s <= 1 + 1/macheps - // - if(l == zero) - { - r = abs(m); - } - else - { - r = sqrt(l * l + mm); - } - // - // note that 0 <= r .le. 1 + 1/macheps - // - a = half * (s + r); - // - // note that 1 <= a .le. 1 + abs(m) - // - ssmin = ha / a; - ssmax = fa * a; - if(mm == zero) - { - // - // note that m is very tiny - // - if(l == zero) - { - t = sign(two, ft) * sign(one, gt); - } - else - { - t = gt / sign(d, ft) + m / t; - } - } - else - { - t = (m / (s + t) + m / (r + l)) * (one + a); - } - l = sqrt(t * t + four); - crt = two / l; - srt = t / l; - clt = (crt + srt * m) / a; - slt = (ht / ft) * srt / a; - } - } - if(swap) - { - csl = srt; - snl = crt; - csr = slt; - snr = clt; - } - else - { - csl = clt; - snl = slt; - csr = crt; - snr = srt; - } - // - // correct signs of ssmax and ssmin - // - if(pmax == 1) - { - tsign = sign(one, csr) * sign(one, csl) * sign(one, f); - } - if(pmax == 2) - { - tsign = sign(one, snr) * sign(one, csl) * sign(one, g); - } - if(pmax == 3) - { - tsign = sign(one, snr) * sign(one, snl) * sign(one, h); - } - ssmax = sign(ssmax, tsign); - ssmin = sign(ssmin, tsign * sign(one, f) * sign(one, h)); -} - template static void call_lasr(rocblas_side& side, rocblas_pivot& pivot, @@ -845,7 +53,11 @@ static void call_lasr(rocblas_side& side, I const i_inc = 1; lasr_body(side, pivot, direct, m, n, &c, &s, &A, lda, tid, i_inc); -}; +} + +/************************************************************************************/ +/***************** Main template functions ******************************************/ +/************************************************************************************/ template static void bdsqr_single_template(rocblas_handle handle, @@ -854,74 +66,22 @@ static void bdsqr_single_template(rocblas_handle handle, I ncvt, I nru, I ncc, - S* d_, S* e_, - T* vt_, I ldvt, T* u_, I ldu, T* c_, I ldc, - S* work_, I& info, S* dwork_ = nullptr, hipStream_t stream = 0) { - bool const use_gpu = (dwork_ != nullptr); - - // ----------------------------------- - // Lapack code used O(n^2) algorithm for sorting - // Consider turning off this and rely on - // bdsqr_sort() to perform sorting - // ----------------------------------- - 
bool constexpr need_sort = false; - - S const zero = 0; - S const one = 1; - S negone = -1; - S const hndrd = 100; - S const hndrth = one / hndrd; - S const ten = 10; - S const eight = 8; - S const meight = -one / eight; - I const maxitr = 6; - I ione = 1; - - bool const lower = (uplo == 'L') || (uplo == 'l'); - bool const upper = (uplo == 'U') || (uplo == 'u'); - /* - * rotate is true if any singular vectors desired, false otherwise - */ - bool const rotate = (ncvt > 0) || (nru > 0) || (ncc > 0); - - I i = 0, idir = 0, isub = 0, iter = 0, iterdivn = 0, j = 0, ll = 0, lll = 0, m = 0, - maxitdivn = 0, nm1 = 0, nm12 = 0, nm13 = 0, oldll = 0, oldm = 0; - - I const nrc = n; // number of rows in C matrix - I const nrvt = n; // number of rows in VT matrix - I const ncu = n; // number of columns in U matrix - - S abse = 0, abss = 0, cosl = 0, cosr = 0, cs = 0, eps = 0, f = 0, g = 0, h = 0, mu = 0, - oldcs = 0, oldsn = 0, r = 0, shift = 0, sigmn = 0, sigmx = 0, sinl = 0, sinr = 0, sll = 0, - smax = 0, smin = 0, sminl = 0, sminoa = 0, sn = 0, thresh = 0, tol = 0, tolmul = 0, unfl = 0; - - /* .. - * .. external functions .. - logical lsame - double precision dlamch - external lsame, dlamch - * .. - * .. external subroutines .. - external dlartg, dlas2, dlasq1, dlasr, dlasv2, drot, - $ dscal, dswap, xerbla - * .. - * .. intrinsic functions .. - intrinsic abs, dble, max, min, sign, sqrt - */ - + // ------------------------------------- + // Lambda expressions used as helpers + // ------------------------------------- auto call_swap_gpu = [=](I n, T& x, I incx, T& y, I incy) { swap_template(n, &x, incx, &y, incy, stream); }; @@ -962,8 +122,6 @@ static void bdsqr_single_template(rocblas_handle handle, CHECK_HIP(hipStreamSynchronize(stream)); }; - auto abs = [](auto x) { return (std::abs(x)); }; - auto indx2f = [](auto i, auto j, auto ld) -> int64_t { assert((1 <= i) && (i <= ld)); assert((1 <= j)); @@ -979,7 +137,9 @@ static void bdsqr_single_template(rocblas_handle handle, assert((1 <= i) && (i <= (n - 1))); return (e_[i - 1]); }; + auto work = [=](auto i) -> S& { return (work_[i - 1]); }; + auto dwork = [=](auto i) -> S& { return (dwork_[i - 1]); }; auto c = [=](auto i, auto j) -> T& { @@ -1000,73 +160,73 @@ static void bdsqr_single_template(rocblas_handle handle, return (vt_[indx2f(i, j, ldvt)]); }; - // --------------------------- - // emulate Fortran intrinsics - // --------------------------- auto sign = [](auto a, auto b) { auto const abs_a = std::abs(a); return ((b >= 0) ? abs_a : -abs_a); }; auto dble = [](auto x) { return (static_cast(x)); }; + // ------------------------------- - auto max = [](auto a, auto b) { return ((a > b) ? a : b); }; + // ---------------- + // Initialization + // ---------------- + bool const use_gpu = (dwork_ != nullptr); - auto min = [](auto a, auto b) { return ((a < b) ? a : b); }; + // Lapack code used O(n^2) algorithm for sorting + // Consider turning off this and rely on + // bdsqr_sort() to perform sorting + bool constexpr need_sort = false; - auto sqrt = [](auto x) { return (std::sqrt(x)); }; + S const zero = 0; + S const one = 1; + S negone = -1; + S const hndrd = 100; + S const hndrth = one / hndrd; + S const ten = 10; + S const eight = 8; + S const meight = -one / eight; + I const maxitr = 6; + I ione = 1; - /* .. - * .. executable statements .. - * - * test the input parameters. 
- * - */ + bool const lower = (uplo == 'L') || (uplo == 'l'); + bool const upper = (uplo == 'U') || (uplo == 'u'); + + //rotate is true if any singular vectors desired, false otherwise + bool const rotate = (ncvt > 0) || (nru > 0) || (ncc > 0); - info = (!upper) && (!lower) ? -1 - : (n < 0) ? -2 - : (ncvt < 0) ? -3 - : (nru < 0) ? -4 - : (ncc < 0) ? -5 - : ((ncvt == 0) && (ldvt < 1)) || ((ncvt > 0) && (ldvt < max(1, n)) ? -9 : (ldu < max(1, nru))) - ? -11 - : ((ncc == 0) && (ldc < 1)) || ((ncc > 0) && (ldc < max(1, n))) ? -13 - : 0; + I i = 0, idir = 0, isub = 0, iter = 0, iterdivn = 0, j = 0, ll = 0, lll = 0, m = 0, + maxitdivn = 0, nm1 = 0, nm12 = 0, nm13 = 0, oldll = 0, oldm = 0; - if(info != 0) - return; + I const nrc = n; // number of rows in C matrix + I const nrvt = n; // number of rows in VT matrix + I const ncu = n; // number of columns in U matrix - if(n == 0) - return; + S abse = 0, abss = 0, cosl = 0, cosr = 0, cs = 0, eps = 0, f = 0, g = 0, h = 0, mu = 0, + oldcs = 0, oldsn = 0, r = 0, shift = 0, sigmn = 0, sigmx = 0, sinl = 0, sinr = 0, sll = 0, + smax = 0, smin = 0, sminl = 0, sminoa = 0, sn = 0, thresh = 0, tol = 0, tolmul = 0, unfl = 0; bool const need_update_singular_vectors = (nru > 0) || (ncc > 0); bool constexpr use_lasr_gpu_nocopy = false; - if(n == 1) - goto L160; - nm1 = n - 1; nm12 = nm1 + nm1; nm13 = nm12 + nm1; idir = 0; - /* - * get machine constants - * - */ + + //get machine constants { char cmach_eps = 'E'; char cmach_unfl = 'S'; call_lamch(cmach_eps, eps); call_lamch(cmach_unfl, unfl); } - /* - * if matrix lower bidiagonal, rotate to be upper bidiagonal - * by applying givens rotations on the left - */ + // ----------------------------------------- + // rotate to upper bidiagonal if necesarry + // ----------------------------------------- if(lower) { - // do 10 i = 1, n - 1 for(i = 1; i <= (n - 1); i++) { call_lartg(d(i), e(i), cs, sn, r); @@ -1076,21 +236,15 @@ static void bdsqr_single_template(rocblas_handle handle, work(i) = cs; work(nm1 + i) = sn; } - L10: - - // ---------------------------------- - // update singular vectors if desired - // ---------------------------------- + // update singular vectors if desired if(use_lasr_gpu_nocopy) { CHECK_HIP(hipStreamSynchronize(stream)); if(need_update_singular_vectors) { - // -------------- // copy rotations - // -------------- size_t const nbytes = sizeof(S) * (n - 1); hipMemcpyKind const kind = hipMemcpyHostToDevice; @@ -1158,96 +312,68 @@ static void bdsqr_single_template(rocblas_handle handle, } } } - /* - * compute singular values to relative accuracy tol - * (by setting tol to be negative, algorithm will compute - * singular values to absolute accuracy abs(tol)*norm(input matrix)) - */ - tolmul = max(ten, min(hndrd, pow(eps, meight))); + // ------------------------------------------------------------- + // Compute singular values and vector to relative accuracy tol + // ------------------------------------------------------------- + tolmul = std::max(ten, std::min(hndrd, pow(eps, meight))); tol = tolmul * eps; - /* - * compute approximate maximum, minimum singular values - */ - - /* - smax = zero - do 20 i = 1, n - smax = max( smax, abs( d( i ) ) ) - L20: - do 30 i = 1, n - 1 - smax = max( smax, abs( e( i ) ) ) - L30: - */ + // compute approximate maximum, minimum singular values smax = zero; - // do 20 i = 1, n for(i = 1; i <= n; i++) { - smax = max(smax, abs(d(i))); + smax = std::max(smax, std::abs(d(i))); } L20: - // do 30 i = 1, n - 1 for(i = 1; i <= (n - 1); i++) { - smax = max(smax, abs(e(i))); + 
smax = std::max(smax, std::abs(e(i))); } -L30: + // compute tolerance +L30: sminl = zero; if(tol >= zero) { - /* - * relative accuracy desired - */ - - sminoa = abs(d(1)); + // relative accuracy desired + sminoa = std::abs(d(1)); if(sminoa == zero) goto L50; mu = sminoa; - // do 40 i = 2, n for(i = 2; i <= n; i++) { - mu = abs(d(i)) * (mu / (mu + abs(e(i - 1)))); - sminoa = min(sminoa, mu); + mu = std::abs(d(i)) * (mu / (mu + std::abs(e(i - 1)))); + sminoa = std::min(sminoa, mu); if(sminoa == zero) goto L50; } L40: L50: - - sminoa = sminoa / sqrt(dble(n)); - thresh = max(tol * sminoa, ((unfl * n) * n) * maxitr); + sminoa = sminoa / std::sqrt(dble(n)); + thresh = std::max(tol * sminoa, ((unfl * n) * n) * maxitr); } else { - /* - * absolute accuracy desired - */ - - thresh = max(abs(tol) * smax, ((unfl * n) * n) * maxitr); + //absolute accuracy desired + thresh = std::max(std::abs(tol) * smax, ((unfl * n) * n) * maxitr); } - /* - * prepare for main iteration loop for the singular values - * (maxit is the maximum number of passes through the inner - * loop permitted before nonconvergence signalled.) - */ + + /** prepare for main iteration loop for the singular values + (maxit is the maximum number of passes through the inner + loop permitted before nonconvergence signalled.) **/ maxitdivn = maxitr * n; iterdivn = 0; iter = -1; oldll = -1; oldm = -1; - /* - * m points to last element of unconverged part of matrix - */ m = n; - /* - * begin main iteration loop - */ + + /////////////////////////// + /// MAIN ITERATION LOOP /// + /////////////////////////// L60: - /* - * check for convergence or exceeding iteration count - */ + // check for convergence or exceeding iteration count if(m <= 1) goto L160; @@ -1258,60 +384,52 @@ static void bdsqr_single_template(rocblas_handle handle, if(iterdivn >= maxitdivn) goto L200; } - /* - * find diagonal block of matrix to work on - */ - if(tol < zero && abs(d(m)) <= thresh) + + // find diagonal block of matrix to work on + if(tol < zero && std::abs(d(m)) <= thresh) d(m) = zero; - smax = abs(d(m)); + smax = std::abs(d(m)); smin = smax; - // do 70 lll = 1, m - 1 for(lll = 1; lll <= (m - 1); lll++) { ll = m - lll; - abss = abs(d(ll)); - abse = abs(e(ll)); + abss = std::abs(d(ll)); + abse = std::abs(e(ll)); if(tol < zero && abss <= thresh) d(ll) = zero; if(abse <= thresh) goto L80; - smin = min(smin, abss); - smax = max(smax, max(abss, abse)); + smin = std::min(smin, abss); + smax = std::max(smax, std::max(abss, abse)); } + L70: ll = 0; goto L90; + L80: e(ll) = zero; - /* - * matrix splits since e(ll) = 0 - */ + // matrix splits since e(ll) = 0 if(ll == m - 1) { - /* - * convergence of bottom singular value, return to top of loop - */ + // convergence of bottom singular value, return to top of loop m = m - 1; goto L60; } + L90: ll = ll + 1; - /* - * e(ll) through e(m-1) are nonzero, e(ll-1) is zero - */ + // e(ll) through e(m-1) are nonzero, e(ll-1) is zero if(ll == m - 1) { - /* - * 2 by 2 block, handle separately - */ + // 2 by 2 block, handle separately call_lasv2(d(m - 1), e(m - 1), d(m), sigmn, sigmx, sinr, cosr, sinl, cosl); d(m - 1) = sigmx; e(m - 1) = zero; d(m) = sigmn; - /* - * compute singular vectors, if desired - */ + + // compute singular vectors, if desired if(ncvt > 0) { if(use_gpu) @@ -1348,37 +466,30 @@ static void bdsqr_single_template(rocblas_handle handle, m = m - 2; goto L60; } - /* - * if working on new submatrix, choose shift direction - * (from larger end diagonal element towards smaller) - */ + + /** if working on new submatrix, choose 
shift direction + (from larger end diagonal element towards smaller) **/ if(ll > oldm || m < oldll) { - if(abs(d(ll)) >= abs(d(m))) + if(std::abs(d(ll)) >= std::abs(d(m))) { - /* - * chase bulge from top (big end) to bottom (small end) - */ + // chase bulge from top (big end) to bottom (small end) idir = 1; } else { - /* - * chase bulge from bottom (big end) to top (small end) - */ + // chase bulge from bottom (big end) to top (small end) idir = 2; } } - /* - * apply convergence tests - */ + + // apply convergence test if(idir == 1) { - /* - * run convergence test in forward direction - * first apply standard test to bottom of matrix - */ - if(abs(e(m - 1)) <= abs(tol) * abs(d(m)) || (tol < zero && abs(e(m - 1)) <= thresh)) + // run convergence test in forward direction + // first apply standard test to bottom of matrix + if(std::abs(e(m - 1)) <= std::abs(tol) * std::abs(d(m)) + || (tol < zero && std::abs(e(m - 1)) <= thresh)) { e(m - 1) = zero; goto L60; @@ -1386,33 +497,28 @@ static void bdsqr_single_template(rocblas_handle handle, if(tol >= zero) { - /* - * if relative accuracy desired, - * apply convergence criterion forward - */ - mu = abs(d(ll)); + // if relative accuracy desired, + // apply convergence criterion forward + mu = std::abs(d(ll)); sminl = mu; - // do 100 lll = ll, m - 1 for(lll = ll; lll <= (m - 1); lll++) { - if(abs(e(lll)) <= tol * mu) + if(std::abs(e(lll)) <= tol * mu) { e(lll) = zero; goto L60; } - mu = abs(d(lll + 1)) * (mu / (mu + abs(e(lll)))); - sminl = min(sminl, mu); + mu = std::abs(d(lll + 1)) * (mu / (mu + std::abs(e(lll)))); + sminl = std::min(sminl, mu); } - // L100: } } else { - /* - * run convergence test in backward direction - * first apply standard test to top of matrix - */ - if(abs(e(ll)) <= abs(tol) * abs(d(ll)) || (tol < zero && abs(e(ll)) <= thresh)) + // run convergence test in backward direction + // first apply standard test to top of matrix + if(std::abs(e(ll)) <= std::abs(tol) * std::abs(d(ll)) + || (tol < zero && std::abs(e(ll)) <= thresh)) { e(ll) = zero; goto L60; @@ -1420,81 +526,65 @@ static void bdsqr_single_template(rocblas_handle handle, if(tol >= zero) { - /* - * if relative accuracy desired, - * apply convergence criterion backward - */ - mu = abs(d(m)); + // if relative accuracy desired, + // apply convergence criterion backward + mu = std::abs(d(m)); sminl = mu; - // do 110 lll = m - 1, ll, -1 for(lll = (m - 1); lll >= ll; lll--) { - if(abs(e(lll)) <= tol * mu) + if(std::abs(e(lll)) <= tol * mu) { e(lll) = zero; goto L60; } - mu = abs(d(lll)) * (mu / (mu + abs(e(lll)))); - sminl = min(sminl, mu); + mu = std::abs(d(lll)) * (mu / (mu + std::abs(e(lll)))); + sminl = std::min(sminl, mu); } - // L110: } } + + /** compute shift. first, test if shifting would ruin relative + accuracy, and if so set the shift to zero **/ oldll = ll; oldm = m; - /* - * compute shift. first, test if shifting would ruin relative - * accuracy, and if so set the shift to zero. 
- */ - if(tol >= zero && n * tol * (sminl / smax) <= max(eps, hndrth * tol)) + if(tol >= zero && n * tol * (sminl / smax) <= std::max(eps, hndrth * tol)) { - /* - * use a zero shift to avoid loss of relative accuracy - */ + //use a zero shift to avoid loss of relative accuracy shift = zero; } else { - /* - * compute the shift from 2-by-2 block at end of matrix - */ + // compute the shift from 2-by-2 block at end of matrix if(idir == 1) { - sll = abs(d(ll)); + sll = std::abs(d(ll)); call_las2(d(m - 1), e(m - 1), d(m), shift, r); } else { - sll = abs(d(m)); + sll = std::abs(d(m)); call_las2(d(ll), e(ll), d(ll + 1), shift, r); } - /* - * test if shift negligible, and if so set to zero - */ + // test if shift negligible, and if so set to zero if(sll > zero) { if((shift / sll) * (shift / sll) < eps) shift = zero; } } - /* - * increment iteration count - */ + + // increment iteration count iter = iter + m - ll; - /* - * if shift = 0, do simplified qr iteration - */ + + // if shift = 0, do simplified qr iteration if(shift == zero) { if(idir == 1) { - /* - * chase bulge from top to bottom - * save cosines and sines for later singular vector updates - */ + // chase bulge from top to bottom + // save cosines and sines for later singular vector updates cs = one; oldcs = one; - // do 120 i = ll, m - 1 for(i = ll; i <= (m - 1); i++) { auto di_cs = d(i) * cs; @@ -1509,24 +599,20 @@ static void bdsqr_single_template(rocblas_handle handle, work(i - ll + 1 + nm12) = oldcs; work(i - ll + 1 + nm13) = oldsn; } + L120: h = d(m) * cs; d(m) = h * oldcs; e(m - 1) = h * oldsn; - // - // ----------------------- - // update singular vectors - // ----------------------- + // update singular vectors if(use_lasr_gpu_nocopy) { CHECK_HIP(hipStreamSynchronize(stream)); if(rotate) { - // -------------- // copy rotations - // -------------- size_t const nbytes = sizeof(S) * (n - 1); hipMemcpyKind const kind = hipMemcpyHostToDevice; @@ -1566,8 +652,7 @@ static void bdsqr_single_template(rocblas_handle handle, if(ncvt > 0) { - // call_lasr( 'l', 'v', 'f', m-ll+1, ncvt, work( 1 ), work( n ), vt( - // ll, 1 ), ldvt ) + // call_lasr( 'l', 'v', 'f', m-ll+1, ncvt, work( 1 ), work( n ), vt(ll, 1 ), ldvt ) rocblas_side side = rocblas_side_left; rocblas_pivot pivot = rocblas_pivot_variable; rocblas_direct direct = rocblas_forward_direction; @@ -1592,8 +677,7 @@ static void bdsqr_single_template(rocblas_handle handle, } if(nru > 0) { - // call_lasr( 'r', 'v', 'f', nru, m-ll+1, work( nm12+1 ), work( nm13+1 - // ), u( 1, ll ), ldu ) + // call_lasr( 'r', 'v', 'f', nru, m-ll+1, work( nm12+1 ), work( nm13+1 ), u( 1, ll ), ldu ) rocblas_side side = rocblas_side_right; rocblas_pivot pivot = rocblas_pivot_variable; rocblas_direct direct = rocblas_forward_direction; @@ -1619,8 +703,7 @@ static void bdsqr_single_template(rocblas_handle handle, } if(ncc > 0) { - // call_lasr( 'l', 'v', 'f', m-ll+1, ncc, work( nm12+1 ), work( nm13+1 - // ), c( ll, 1 ), ldc ) + // call_lasr( 'l', 'v', 'f', m-ll+1, ncc, work( nm12+1 ), work( nm13+1 ), c( ll, 1 ), ldc ) rocblas_side side = rocblas_side_left; rocblas_pivot pivot = rocblas_pivot_variable; rocblas_direct direct = rocblas_forward_direction; @@ -1644,21 +727,17 @@ static void bdsqr_single_template(rocblas_handle handle, c(ll, 1), ldc); } } - // - // test convergence - // - if(abs(e(m - 1)) <= thresh) + + // test convergence + if(std::abs(e(m - 1)) <= thresh) e(m - 1) = zero; } else { - /* - * chase bulge from bottom to top - * save cosines and sines for later singular vector updates - */ + // chase bulge 
from bottom to top + // save cosines and sines for later singular vector updates cs = one; oldcs = one; - // do 130 i = m, ll + 1, -1 for(i = m; i >= (ll + 1); i--) { auto di_cs = d(i) * cs; @@ -1676,23 +755,20 @@ static void bdsqr_single_template(rocblas_handle handle, work(i - ll + nm12) = oldcs; work(i - ll + nm13) = -oldsn; } + L130: h = d(ll) * cs; d(ll) = h * oldcs; e(ll) = h * oldsn; - // - // update singular vectors - // + // update singular vectors if(use_lasr_gpu_nocopy) { CHECK_HIP(hipStreamSynchronize(stream)); if(rotate) { - // -------------- // copy rotations - // -------------- size_t const nbytes = sizeof(S) * (n - 1); hipMemcpyKind const kind = hipMemcpyHostToDevice; @@ -1732,9 +808,7 @@ static void bdsqr_single_template(rocblas_handle handle, if(ncvt > 0) { - // call_lasr( 'l', 'v', 'b', m-ll+1, ncvt, work( nm12+1 ), work( - // nm13+1 - // ), vt( ll, 1 ), ldvt ); + // call_lasr( 'l', 'v', 'b', m-ll+1, ncvt, work( nm12+1 ), work( nm13+1 ), vt( ll, 1 ), ldvt ); rocblas_side side = rocblas_side_left; rocblas_pivot pivot = rocblas_pivot_variable; rocblas_direct direct = rocblas_backward_direction; @@ -1760,9 +834,7 @@ static void bdsqr_single_template(rocblas_handle handle, } if(nru > 0) { - // call_lasr( 'r', 'v', 'b', nru, m-ll+1, work( 1 ), work( n ), u( 1, - // ll - // ), ldu ) + // call_lasr( 'r', 'v', 'b', nru, m-ll+1, work( 1 ), work( n ), u( 1, ll ), ldu ) rocblas_side side = rocblas_side_right; rocblas_pivot pivot = rocblas_pivot_variable; rocblas_direct direct = rocblas_backward_direction; @@ -1787,9 +859,7 @@ static void bdsqr_single_template(rocblas_handle handle, } if(ncc > 0) { - // call_lasr( 'l', 'v', 'b', m-ll+1, ncc, work( 1 ), work( n ), c( ll, - // 1 - // ), ldc ) + // call_lasr( 'l', 'v', 'b', m-ll+1, ncc, work( 1 ), work( n ), c( ll, 1 ), ldc ) rocblas_side side = rocblas_side_left; rocblas_pivot pivot = rocblas_pivot_variable; rocblas_direct direct = rocblas_backward_direction; @@ -1812,27 +882,22 @@ static void bdsqr_single_template(rocblas_handle handle, call_lasr(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), ldc); } } - // - // test convergence - // - if(abs(e(ll)) <= thresh) + + // test convergence + if(std::abs(e(ll)) <= thresh) e(ll) = zero; } } + + // otherwise use nonzero shift else { - // - // use nonzero shift - // if(idir == 1) { - // - // chase bulge from top to bottom - // save cosines and sines for later singular vector updates - // - f = (abs(d(ll)) - shift) * (sign(one, d(ll)) + shift / d(ll)); + // chase bulge from top to bottom + // save cosines and sines for later singular vector updates + f = (std::abs(d(ll)) - shift) * (sign(one, d(ll)) + shift / d(ll)); g = e(ll); - // do 140 i = ll, m - 1 for(i = ll; i <= (m - 1); i++) { call_lartg(f, g, cosr, sinr, r); @@ -1856,21 +921,18 @@ static void bdsqr_single_template(rocblas_handle handle, work(i - ll + 1 + nm12) = cosl; work(i - ll + 1 + nm13) = sinl; } + L140: e(m - 1) = f; - // - // update singular vectors - // + // update singular vectors if(use_lasr_gpu_nocopy) { CHECK_HIP(hipStreamSynchronize(stream)); if(rotate) { - // -------------- // copy rotations - // -------------- size_t const nbytes = sizeof(S) * (n - 1); hipMemcpyKind const kind = hipMemcpyHostToDevice; @@ -1910,8 +972,7 @@ static void bdsqr_single_template(rocblas_handle handle, if(ncvt > 0) { - // call_lasr( 'l', 'v', 'f', m-ll+1, ncvt, work( 1 ), work( n ), vt( - // ll, 1 ), ldvt ) + // call_lasr( 'l', 'v', 'f', m-ll+1, ncvt, work( 1 ), work( n ), vt(ll, 1 ), ldvt ) rocblas_side side = rocblas_side_left; 
rocblas_pivot pivot = rocblas_pivot_variable; rocblas_direct direct = rocblas_forward_direction; @@ -1937,8 +998,7 @@ static void bdsqr_single_template(rocblas_handle handle, if(nru > 0) { - // call_lasr( 'r', 'v', 'f', nru, m-ll+1, work( nm12+1 ), work( nm13+1 - // ), u( 1, ll ), ldu ) + // call_lasr( 'r', 'v', 'f', nru, m-ll+1, work( nm12+1 ), work( nm13+1 ), u( 1, ll ), ldu ) rocblas_side side = rocblas_side_right; rocblas_pivot pivot = rocblas_pivot_variable; rocblas_direct direct = rocblas_forward_direction; @@ -1964,8 +1024,7 @@ static void bdsqr_single_template(rocblas_handle handle, } if(ncc > 0) { - // call_lasr( 'l', 'v', 'f', m-ll+1, ncc, work( nm12+1 ), work( nm13+1 - // ), c( ll, 1 ), ldc ) + // call_lasr( 'l', 'v', 'f', m-ll+1, ncc, work( nm12+1 ), work( nm13+1 ), c( ll, 1 ), ldc ) rocblas_side side = rocblas_side_left; rocblas_pivot pivot = rocblas_pivot_variable; rocblas_direct direct = rocblas_forward_direction; @@ -1989,21 +1048,17 @@ static void bdsqr_single_template(rocblas_handle handle, c(ll, 1), ldc); } } - /* - * test convergence - */ - if(abs(e(m - 1)) <= thresh) + + // test convergence + if(std::abs(e(m - 1)) <= thresh) e(m - 1) = zero; } else { - /* - * chase bulge from bottom to top - * save cosines and sines for later singular vector updates - */ - f = (abs(d(m)) - shift) * (sign(one, d(m)) + shift / d(m)); + // chase bulge from bottom to top + // save cosines and sines for later singular vector updates + f = (std::abs(d(m)) - shift) * (sign(one, d(m)) + shift / d(m)); g = e(m - 1); - // do 150 i = m, ll + 1, -1 for(i = m; i >= (ll + 1); i--) { call_lartg(f, g, cosr, sinr, r); @@ -2027,26 +1082,22 @@ static void bdsqr_single_template(rocblas_handle handle, work(i - ll + nm12) = cosl; work(i - ll + nm13) = -sinl; } + L150: e(ll) = f; - // - // test convergence - // - if(abs(e(ll)) <= thresh) + + // test convergence + if(std::abs(e(ll)) <= thresh) e(ll) = zero; - // - // update singular vectors if desired - // + // update singular vectors if(use_lasr_gpu_nocopy) { CHECK_HIP(hipStreamSynchronize(stream)); if(rotate) { - // -------------- // copy rotations - // -------------- size_t const nbytes = sizeof(S) * (n - 1); hipMemcpyKind const kind = hipMemcpyHostToDevice; @@ -2086,9 +1137,7 @@ static void bdsqr_single_template(rocblas_handle handle, if(ncvt > 0) { - // call_lasr( 'l', 'v', 'b', m-ll+1, ncvt, work( nm12+1 ), work( - // nm13+1 - // ), vt( ll, 1 ), ldvt ) + // call_lasr( 'l', 'v', 'b', m-ll+1, ncvt, work( nm12+1 ), work(nm13+1), vt( ll, 1 ), ldvt ) rocblas_side side = rocblas_side_left; rocblas_pivot pivot = rocblas_pivot_variable; rocblas_direct direct = rocblas_backward_direction; @@ -2114,9 +1163,7 @@ static void bdsqr_single_template(rocblas_handle handle, } if(nru > 0) { - // call_lasr( 'r', 'v', 'b', nru, m-ll+1, work( 1 ), work( n ), u( 1, - // ll - // ), ldu ) + // call_lasr( 'r', 'v', 'b', nru, m-ll+1, work( 1 ), work( n ), u( 1, ll ), ldu ) rocblas_side side = rocblas_side_right; rocblas_pivot pivot = rocblas_pivot_variable; rocblas_direct direct = rocblas_backward_direction; @@ -2141,9 +1188,7 @@ static void bdsqr_single_template(rocblas_handle handle, } if(ncc > 0) { - // call_lasr( 'l', 'v', 'b', m-ll+1, ncc, work( 1 ), work( n ), c( ll, - // 1 - // ), ldc ) + // call_lasr( 'l', 'v', 'b', m-ll+1, ncc, work( 1 ), work( n ), c( ll, 1 ), ldc ) rocblas_side side = rocblas_side_left; rocblas_pivot pivot = rocblas_pivot_variable; rocblas_direct direct = rocblas_backward_direction; @@ -2169,24 +1214,17 @@ static void 
bdsqr_single_template(rocblas_handle handle, } } CHECK_HIP(hipStreamSynchronize(stream)); - /* - * qr iteration finished, go back and check convergence - */ + + // qr iteration finished, go back and check convergence goto L60; -/* - * all singular values converged, so make them positive - */ L160: - // do 170 i = 1, n + // all singular values converged, so make them positive for(i = 1; i <= n; i++) { if(d(i) < zero) { d(i) = -d(i); - // - // change sign of singular vectors, if desired - // if(ncvt > 0) { if(use_gpu) @@ -2200,22 +1238,17 @@ static void bdsqr_single_template(rocblas_handle handle, } } } + L170: - // - // sort the singular values into decreasing order (insertion sort on - // singular values, but only one transposition per singular vector) - // - // do 190 i = 1, n - 1 + // sort the singular values into decreasing order (insertion sort on + // singular values, but only one transposition per singular vector) if(need_sort) { for(i = 1; i <= (n - 1); i++) { - // - // scan for smallest d(i) - // + // scan for smallest d(i) isub = 1; smin = d(1); - // do 180 j = 2, n + 1 - i for(j = 2; j <= (n + 1 - i); j++) { if(d(j) <= smin) @@ -2227,9 +1260,7 @@ static void bdsqr_single_template(rocblas_handle handle, L180: if(isub != n + 1 - i) { - // - // swap singular values and vectors - // + // swap singular values and vectors d(isub) = d(n + 1 - i); d(n + 1 - i) = smin; if(ncvt > 0) @@ -2268,25 +1299,22 @@ static void bdsqr_single_template(rocblas_handle handle, } } } + L190: goto L220; -// -// maximum number of iterations exceeded, failure to converge -// + +// maximum number of iterations exceeded, failure to converge L200: info = 0; - // do 210 i = 1, n - 1 for(i = 1; i <= (n - 1); i++) { if(e(i) != zero) info = info + 1; } + L210: L220: return; - // - // end of dbdsqr - // } template @@ -2317,17 +1345,9 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, I* splits_map, S* work) { - // ------------------------- - // copy D into hD, E into hE - // ------------------------- - - hipStream_t stream; - rocblas_get_stream(handle, &stream); - - W1 V = V_arg; - W2 U = U_arg; - W3 C = C_arg; - + // ------------------------------- + // lambda expression as helper + // ------------------------------- auto is_device_pointer = [](void* ptr) -> bool { hipPointerAttribute_t dev_attributes; if(ptr == nullptr) @@ -2345,9 +1365,17 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, return (dev_attributes.type == hipMemoryTypeDevice); }; - // --------------------------------------------------- + // ------------------------- + // copy D into hD, E into hE + // ------------------------- + hipStream_t stream; + rocblas_get_stream(handle, &stream); + + W1 V = V_arg; + W2 U = U_arg; + W3 C = C_arg; + // handle batch case with array of pointers on device - // --------------------------------------------------- std::vector Vp_array(batch_count); std::vector Up_array(batch_count); std::vector Cp_array(batch_count); @@ -2357,9 +1385,7 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, bool const is_device_V_arg = is_device_pointer((void*)V_arg); if(is_device_V_arg) { - // ------------------------------------------------------------ // note "T *" and "T * const" may be considered different types - // ------------------------------------------------------------ bool constexpr is_array_of_device_pointers = !(std::is_same::value || std::is_same::value); bool constexpr need_copy_W1 = is_array_of_device_pointers; @@ -2442,7 +1468,6 @@ 
rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, // ------------------------------------------------- // transfer arrays D(:) and E(:) from Device to Host // ------------------------------------------------- - bool const use_single_copy_for_D = (batch_count == 1) || (strideD == n); if(use_single_copy_for_D) { @@ -2476,6 +1501,9 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, } } + // ------------------------------------------------- + // Execute for each instance in the batch + // ------------------------------------------------- HIP_CHECK(hipStreamSynchronize(stream)); S* dwork_ = nullptr; @@ -2486,9 +1514,7 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, if(linfo_array[bid] != 0) { continue; - }; - - // std::vector hwork(4 * n); + } char uplo = (uplo_in == rocblas_fill_lower) ? 'L' : 'U'; S* d_ = &(hD[bid * n]); @@ -2498,19 +1524,15 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, T* u_ = (nu > 0) ? load_ptr_batch(U, bid, shiftU, strideU) : nullptr; T* c_ = (nc > 0) ? load_ptr_batch(C, bid, shiftC, strideC) : nullptr; S* work_ = &(hwork[0]); - // S* dwork = &(work[bid * (4 * n)]); I info = 0; I nru = nu; I ncc = nc; - // ------------------------------------------------------- // NOTE: lapack dbdsqr() accepts "VT" and "ldvt" for transpose of V - // as input variable - // However, rocsolver bdsqr() accepts variable called "V" and "ldv" - // but may be actually holding "VT" - // ------------------------------------------------------- + // as input variable however, rocsolver bdsqr() accepts variable called "V" and "ldv" + // but it is actually holding "VT" T* vt_ = v_; I ldvt = ldv; @@ -2523,10 +1545,8 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, if(info == 0) { - // ---------------------------- // explicitly zero out "E" array // to be compatible with rocsolver bdsqr - // ---------------------------- S const zero = S(0); for(I i = 0; i < (n - 1); i++) { @@ -2543,7 +1563,6 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, // ------------------------------------------------- // transfer arrays D(:) and E(:) from host to device // ------------------------------------------------- - if(use_single_copy_for_D) { void* const src = (void*)&(hD[0]); @@ -2576,11 +1595,10 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, } } + // ------------------------------------------------------ + // copy linfo_array[] from host to info_array[] on device + // ------------------------------------------------------ { - // ------------------------------------------------------ - // copy linfo_array[] from host to info_array[] on device - // ------------------------------------------------------ - void* const src = (void*)&(linfo_array[0]); void* const dst = (void*)&(info_array[0]); size_t const nbytes = sizeof(I) * batch_count; @@ -2594,19 +1612,14 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, // ---------------------- // free allocated storage // ---------------------- - HIP_CHECK(hipHostFree(hwork)); hwork = nullptr; - HIP_CHECK(hipHostFree(hD)); hD = nullptr; - HIP_CHECK(hipHostFree(hE)); hE = nullptr; - HIP_CHECK(hipFree(dwork_)); dwork_ = nullptr; - HIP_CHECK(hipHostFree(linfo_array)); linfo_array = nullptr; diff --git a/library/src/include/lapack_device_functions.hpp b/library/src/include/lapack_device_functions.hpp index fe0979246..ca388ff11 100644 --- 
a/library/src/include/lapack_device_functions.hpp +++ b/library/src/include/lapack_device_functions.hpp @@ -1111,4 +1111,96 @@ ROCSOLVER_KERNEL void axpy_kernel(const rocblas_int n, } } +template +__global__ static void + rot_kernel(I const n, T* const x, I const incx, T* const y, I const incy, S const c, S const s) +{ + if(n <= 0) + return; + + I const i_start = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + I const i_inc = hipBlockDim_x * hipGridDim_x; + + if((incx == 1) && (incy == 1)) + { + // ------------ + // special case + // ------------ + for(I i = i_start; i < n; i += i_inc) + { + auto const temp = c * x[i] + s * y[i]; + y[i] = c * y[i] - s * x[i]; + x[i] = temp; + } + } + else + { + // --------------------------- + // code for unequal increments + // --------------------------- + + for(auto i = i_start; i < n; i += i_inc) + { + auto const ix = 0 + i * static_cast(incx); + auto const iy = 0 + i * static_cast(incy); + auto const temp = c * x[ix] + s * y[iy]; + y[iy] = c * y[iy] - s * x[ix]; + x[ix] = temp; + } + } +} + +template +static void + rot_template(I const n, T* x, I const incx, T* y, I const incy, S const c, S const s, hipStream_t stream) +{ + auto nthreads = warpSize * 2; + auto nblocks = (n - 1) / nthreads + 1; + + hipLaunchKernelGGL((rot_kernel), dim3(nblocks, 1, 1), dim3(nthreads, 1, 1), 0, stream, + n, x, incx, y, incy, c, s); +} + +template +__global__ static void scal_kernel(I const n, S const da, T* const x, I const incx) +{ + if(n <= 0) + return; + + I const i_start = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + I const i_inc = hipBlockDim_x * hipGridDim_x; + + S const zero = 0; + bool const is_da_zero = (da == zero); + if(incx == 1) + { + for(I i = i_start; i < n; i += i_inc) + { + x[i] = (is_da_zero) ? zero : da * x[i]; + } + } + else + { + // --------------------------- + // code for non-unit increments + // --------------------------- + + for(I i = i_start; i < n; i += i_inc) + { + auto const ix = 0 + i * static_cast(incx); + x[ix] = (is_da_zero) ? zero : da * x[ix]; + } + } +} + +template +static void scal_template(I const n, S const da, T* const x, I const incx, hipStream_t stream) +{ + auto nthreads = warpSize * 2; + auto nblocks = (n - 1) / nthreads + 1; + + hipLaunchKernelGGL((scal_kernel), dim3(nblocks, 1, 1), dim3(nthreads, 1, 1), 0, stream, + n, da, x, incx); +} + ROCSOLVER_END_NAMESPACE diff --git a/library/src/include/lapack_host_functions.hpp b/library/src/include/lapack_host_functions.hpp new file mode 100644 index 000000000..cb10a233e --- /dev/null +++ b/library/src/include/lapack_host_functions.hpp @@ -0,0 +1,596 @@ +/* ************************************************************************** + * Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * *************************************************************************/ + +#pragma once + +#include "lib_host_helpers.hpp" +#include "lib_macros.hpp" +#include "rocsolver/rocsolver.h" + +ROCSOLVER_BEGIN_NAMESPACE + +/* + * =========================================================================== + * common location for host functions that reproduce LAPACK + * and BLAS functionality. + * =========================================================================== + */ + +static void call_lamch(char& cmach, double& eps) +{ + eps = ((cmach == 'E') || (cmach == 'e')) ? std::numeric_limits::epsilon() + : ((cmach == 'S') || (cmach == 's')) ? std::numeric_limits::min() + : ((cmach == 'B') || (cmach == 'b')) ? FLT_RADIX + : std::numeric_limits::min(); +} + +static void call_lamch(char& cmach, float& eps) +{ + eps = ((cmach == 'E') || (cmach == 'e')) ? std::numeric_limits::epsilon() + : ((cmach == 'S') || (cmach == 's')) ? std::numeric_limits::min() + : ((cmach == 'B') || (cmach == 'b')) ? FLT_RADIX + : std::numeric_limits::min(); +} + +template +static void call_las2(T& f, T& g, T& h, T& ssmin, T& ssmax) +{ + T const zero = 0; + T const one = 1; + T const two = 2; + + T as, at, au, c, fa, fhmn, fhmx, ga, ha; + + auto square = [](auto x) { return (x * x); }; + + fa = std::abs(f); + ga = std::abs(g); + ha = std::abs(h); + fhmn = std::min(fa, ha); + fhmx = std::max(fa, ha); + if(fhmn == zero) + { + ssmin = zero; + if(fhmx == zero) + { + ssmax = ga; + } + else + { + // ssmax = max( fhmx, ga )*sqrt( one+ ( min( fhmx, ga ) / max( fhmx, ga ) )**2 ); + ssmax = std::max(fhmx, ga) + * std::sqrt(one + square(std::min(fhmx, ga) / std::max(fhmx, ga))); + } + } + else + { + if(ga < fhmx) + { + as = one + fhmn / fhmx; + at = (fhmx - fhmn) / fhmx; + au = square(ga / fhmx); + c = two / (std::sqrt(as * as + au) + std::sqrt(at * at + au)); + ssmin = fhmn * c; + ssmax = fhmx / c; + } + else + { + au = fhmx / ga; + if(au == zero) + { + // + // avoid possible harmful underflow if exponent range + // asymmetric (true ssmin may not underflow even if + // au underflows) + // + ssmin = (fhmn * fhmx) / ga; + ssmax = ga; + } + else + { + as = one + fhmn / fhmx; + at = (fhmx - fhmn) / fhmx; + // c = one / ( sqrt( one+( as*au )**2 )+ sqrt( one+( at*au )**2 ) ); + c = one / (std::sqrt(one + square(as * au)) + std::sqrt(one + square(at * au))); + ssmin = (fhmn * c) * au; + ssmin = ssmin + ssmin; + ssmax = ga / (c + c); + } + } + } +} + +template +static void call_lartg(T& f, T& g, S& cs, T& sn, T& r) +{ + // ------------------------------------------------------ + // lartg generates a plane rotation so that + // [ cs sn ] * [ f ] = [ r ] + // [ -sn cs ] [ g ] [ 0 ] + // + // where cs * cs + abs(sn)*abs(sn) == 1 + // if g == 0, then cs == 1, sn == 0 + // if f == 0, then cs = 0, sn is chosen so that r is real + // ------------------------------------------------------ + + auto dble = [](auto z) { return (static_cast(real_part(z))); }; + auto dimag = [](auto z) { return (static_cast(imag_part(z))); }; 
+ auto disnan = [](auto x) -> bool { return (isnan(x)); }; + auto dcmplx = [](auto x, auto y) -> T { + bool constexpr is_complex_type + = !(std::is_same::value || std::is_same::value); + + if constexpr(is_complex_type) + { + return (T(x, y)); + } + else + { + return (T(x)); + }; + }; + auto dconjg = [&](auto z) { return (dcmplx(dble(z), -dimag(z))); }; + + auto square = [](auto x) { return (x * x); }; + + auto abs1 = [&](auto ff) { return (std::max(std::abs(dble(ff)), std::abs(dimag(ff)))); }; + auto abssq = [&](auto ff) { return (square(dble(ff)) + square(dimag(ff))); }; + + // ----------------------------------------- + // compute sqrt( x * x + y * y ) + // without unnecessary overflow or underflow + // ----------------------------------------- + auto dlapy2 = [&](auto x, auto y) { + auto const one = 1; + auto const zero = 0; + + auto ddlapy2 = x; + bool const x_is_nan = disnan(x); + bool const y_is_nan = disnan(y); + if(x_is_nan) + ddlapy2 = x; + if(y_is_nan) + ddlapy2 = y; + + if(!(x_is_nan || y_is_nan)) + { + auto const xabs = std::abs(x); + auto const yabs = std::abs(y); + auto const w = std::max(xabs, yabs); + auto const z = std::min(xabs, yabs); + if(z == zero) + { + ddlapy2 = w; + } + else + { + ddlapy2 = w * std::sqrt(one + square(z / w)); + } + } + return (ddlapy2); + }; + + char cmach = 'E'; + S const zero = 0; + S const one = 1; + S const two = 2; + T const czero = 0; + + bool has_work; + bool first; + int count, i; + S d, di, dr, eps, f2, f2s, g2, g2s, safmin; + S safmn2, safmx2, scale; + T ff, fs, gs; + + // safmin = dlamch( 's' ) + cmach = 'S'; + call_lamch(cmach, safmin); + + // eps = dlamch( 'e' ) + cmach = 'E'; + call_lamch(cmach, eps); + + // safmn2 = dlamch( 'b' )**int( log( safmin / eps ) / log( dlamch( 'b' ) ) / two ) + cmach = 'B'; + S radix = 2; + call_lamch(cmach, radix); + + int const npow = (std::log(safmin / eps) / std::log(radix) / two); + safmn2 = std::pow(radix, npow); + safmx2 = one / safmn2; + scale = std::max(abs1(f), abs1(g)); + fs = f; + gs = g; + count = 0; + + if(scale >= safmx2) + { + do + { + count = count + 1; + fs = fs * safmn2; + gs = gs * safmn2; + scale = scale * safmn2; + has_work = ((scale >= safmx2) && (count < 20)); + } while(has_work); + } + else + { + if(scale <= safmn2) + { + if((g == czero) || disnan(std::abs(g))) + { + cs = one; + sn = czero; + r = f; + return; + } + do + { + count = count - 1; + fs = fs * safmx2; + gs = gs * safmx2; + scale = scale * safmx2; + has_work = (scale <= safmn2); + } while(has_work); + } + f2 = abssq(fs); + g2 = abssq(gs); + if(f2 <= std::max(g2, one) * safmin) + { + // + // this is a rare case: f is very small. + // + if(f == czero) + { + cs = zero; + r = dlapy2(dble(g), dimag(g)); + // do complex/real division explicitly with two real divisions + d = dlapy2(dble(gs), dimag(gs)); + sn = dcmplx(dble(gs) / d, -dimag(gs) / d); + return; + } + f2s = dlapy2(dble(fs), dimag(fs)); + // g2 and g2s are accurate + // g2 is at least safmin, and g2s is at least safmn2 + g2s = std::sqrt(g2); + // error in cs from underflow in f2s is at most + // unfl / safmn2 < sqrt(unfl*eps) .lt. 
eps + // if max(g2,one)=g2, then f2 < g2*safmin, + // and so cs < sqrt(safmin) + // if max(g2,one)=one, then f2 < safmin + // and so cs < sqrt(safmin)/safmn2 = sqrt(eps) + // therefore, cs = f2s/g2s / sqrt( 1 + (f2s/g2s)**2 ) = f2s/g2s + cs = f2s / g2s; + // make sure abs(ff) = 1 + // do complex/real division explicitly with 2 real divisions + if(abs1(f) > one) + { + d = dlapy2(dble(f), dimag(f)); + ff = dcmplx(dble(f) / d, dimag(f) / d); + } + else + { + dr = safmx2 * dble(f); + di = safmx2 * dimag(f); + d = dlapy2(dr, di); + ff = dcmplx(dr / d, di / d); + } + sn = ff * dcmplx(dble(gs) / g2s, -dimag(gs) / g2s); + r = cs * f + sn * g; + } + else + { + // + // this is the most common case. + // neither f2 nor f2/g2 are less than safmin + // f2s cannot overflow, and it is accurate + // + f2s = std::sqrt(one + g2 / f2); + // do the f2s(real)*fs(complex) multiply with two real multiplies + r = dcmplx(f2s * dble(fs), f2s * dimag(fs)); + cs = one / f2s; + d = f2 + g2; + // do complex/real division explicitly with two real divisions + sn = dcmplx(dble(r) / d, dimag(r) / d); + sn = sn * dconjg(gs); + if(count != 0) + { + if(count > 0) + { + for(i = 1; i <= count; i++) + { + r = r * safmx2; + }; + } + else + { + for(i = 1; i <= -count; i++) + { + r = r * safmn2; + } + } + } + } + } +} + +template +static void call_scal(I& n, S& a, T& x_in, I& incx) +{ + bool const is_zero = (a == 0); + T* const x = &x_in; + for(I i = 0; i < n; i++) + { + auto const ip = i * incx; + if(is_zero) + { + x[ip] = 0; + } + else + { + x[ip] *= a; + } + }; +} + +template +static void call_rot(I& n, T& x_in, I& incx, T& y_in, I& incy, S& c, S& s) +{ + T* const x = &(x_in); + T* const y = &(y_in); + + for(I i = 0; i < n; i++) + { + auto const ix = i * incx; + auto const iy = i * incy; + + auto const temp = c * x[ix] + s * y[iy]; + y[iy] = c * y[iy] - s * x[ix]; + x[ix] = temp; + } +} + +// -------------------------------------------------------- +// lasv2 computes the singular value decomposition of a 2 x 2 +// triangular matrix +// [ F G ] +// [ 0 H ] +// +// on return, +// abs(ssmax) is the larger singular value, +// abs(ssmin) is the smaller singular value, +// (csl,snl) and (csr,snr) are the left and right +// singular vectors for abs(ssmax) +// +// [ csl snl] [ F G ] [ csr -snr] = [ ssmax 0 ] +// [-snl csl] [ 0 H ] [ snr csr] [ 0 ssmin ] +// -------------------------------------------------------- +template +static void call_lasv2(T& f, T& g, T& h, T& ssmin, T& ssmax, T& snr, T& csr, T& snl, T& csl) +{ + T const zero = 0; + T const one = 1; + T const two = 2; + T const four = 4; + T const half = one / two; + + bool gasmal; + bool swap; + int pmax; + char cmach; + + T a, clt, crt, d, fa, ft, ga, gt, ha, ht, l, m; + T mm, r, s, slt, srt, t, temp, tsign, tt; + T macheps; + + auto sign = [](auto a, auto b) { + auto const abs_a = std::abs(a); + return ((b >= 0) ? 
abs_a : -abs_a); + }; + + ft = f; + fa = std::abs(ft); + ht = h; + ha = std::abs(h); + // + // pmax points to the maximum absolute element of matrix + // pmax = 1 if f largest in absolute values + // pmax = 2 if g largest in absolute values + // pmax = 3 if h largest in absolute values + // + pmax = 1; + swap = (ha > fa); + if(swap) + { + pmax = 3; + temp = ft; + ft = ht; + ht = temp; + temp = fa; + fa = ha; + ha = temp; + // + // now fa >= ha + // + } + gt = g; + ga = std::abs(gt); + if(ga == zero) + { + // + // diagonal matrix + // + ssmin = ha; + ssmax = fa; + clt = one; + crt = one; + slt = zero; + srt = zero; + } + else + { + gasmal = true; + if(ga > fa) + { + pmax = 2; + + cmach = 'E'; + call_lamch(cmach, macheps); + + if((fa / ga) < macheps) + { + // + // case of very large ga + // + gasmal = false; + ssmax = ga; + if(ha > one) + { + ssmin = fa / (ga / ha); + } + else + { + ssmin = (fa / ga) * ha; + } + clt = one; + slt = ht / gt; + srt = one; + crt = ft / gt; + } + } + if(gasmal) + { + // + // normal case + // + d = fa - ha; + if(d == fa) + { + // + // copes with infinite f or h + // + l = one; + } + else + { + l = d / fa; + } + // + // note that 0 <= l <= 1 + // + m = gt / ft; + // + // note that abs(m) <= 1/macheps + // + t = two - l; + // + // note that t >= 1 + // + mm = m * m; + tt = t * t; + s = std::sqrt(tt + mm); + // + // note that 1 <= s <= 1 + 1/macheps + // + if(l == zero) + { + r = std::abs(m); + } + else + { + r = std::sqrt(l * l + mm); + } + // + // note that 0 <= r .le. 1 + 1/macheps + // + a = half * (s + r); + // + // note that 1 <= a .le. 1 + abs(m) + // + ssmin = ha / a; + ssmax = fa * a; + if(mm == zero) + { + // + // note that m is very tiny + // + if(l == zero) + { + t = sign(two, ft) * sign(one, gt); + } + else + { + t = gt / sign(d, ft) + m / t; + } + } + else + { + t = (m / (s + t) + m / (r + l)) * (one + a); + } + l = std::sqrt(t * t + four); + crt = two / l; + srt = t / l; + clt = (crt + srt * m) / a; + slt = (ht / ft) * srt / a; + } + } + if(swap) + { + csl = srt; + snl = crt; + csr = slt; + snr = clt; + } + else + { + csl = clt; + snl = slt; + csr = crt; + snr = srt; + } + // + // correct signs of ssmax and ssmin + // + if(pmax == 1) + { + tsign = sign(one, csr) * sign(one, csl) * sign(one, f); + } + if(pmax == 2) + { + tsign = sign(one, snr) * sign(one, csl) * sign(one, g); + } + if(pmax == 3) + { + tsign = sign(one, snr) * sign(one, snl) * sign(one, h); + } + ssmax = sign(ssmax, tsign); + ssmin = sign(ssmin, tsign * sign(one, f) * sign(one, h)); +} + +ROCSOLVER_END_NAMESPACE diff --git a/library/src/include/lib_device_helpers.hpp b/library/src/include/lib_device_helpers.hpp index 74f82a15f..609097549 100644 --- a/library/src/include/lib_device_helpers.hpp +++ b/library/src/include/lib_device_helpers.hpp @@ -1237,4 +1237,53 @@ __device__ static void permute_swap(const I n, T* C, I ldc, I* map, const I nev #endif } +template +__global__ static void swap_kernel(I const n, T* const x, I const incx, T* const y, I const incy) +{ + if(n <= 0) + return; + + I const i_start = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + I const i_inc = hipBlockDim_x * hipGridDim_x; + + if((incx == 1) && (incy == 1)) + { + // ------------ + // special case + // ------------ + for(I i = i_start; i < n; i += i_inc) + { + auto const temp = y[i]; + y[i] = x[i]; + x[i] = temp; + } + } + else + { + // --------------------------- + // code for unequal increments + // --------------------------- + + for(I i = i_start; i < n; i += i_inc) + { + auto const ix = 0 + i * 
static_cast(incx); + auto const iy = 0 + i * static_cast(incy); + + auto const temp = y[iy]; + y[iy] = x[ix]; + x[ix] = temp; + } + } +} + +template +static void swap_template(I const n, T* x, I const incx, T* y, I const incy, hipStream_t stream) +{ + auto nthreads = warpSize * 2; + auto nblocks = (n - 1) / nthreads + 1; + + hipLaunchKernelGGL((swap_kernel), dim3(nblocks, 1, 1), dim3(nthreads, 1, 1), 0, stream, + n, x, incx, y, incy); +} + ROCSOLVER_END_NAMESPACE diff --git a/library/src/include/lib_host_helpers.hpp b/library/src/include/lib_host_helpers.hpp index bca4aae41..a11bf3bc2 100644 --- a/library/src/include/lib_host_helpers.hpp +++ b/library/src/include/lib_host_helpers.hpp @@ -92,6 +92,74 @@ I get_index(I* intervals, I max, I dim) return i; } +template +static void call_swap(I& n, T& x_in, I& incx, T& y_in, I& incy) +{ + T* const x = &(x_in); + T* const y = &(y_in); + for(I i = 0; i < n; i++) + { + I const ix = i * incx; + I const iy = i * incy; + + T const temp = x[ix]; + x[ix] = y[iy]; + y[iy] = temp; + } +} + +static float real_part(float z) +{ + return (z); +}; +static float real_part(std::complex z) +{ + return (z.real()); +}; +static float real_part(rocblas_complex_num z) +{ + return (z.real()); +}; + +static double real_part(double z) +{ + return (z); +}; +static double real_part(std::complex z) +{ + return (z.real()); +}; +static double real_part(rocblas_complex_num z) +{ + return (z.real()); +}; + +static float imag_part(float z) +{ + return (0); +}; +static float imag_part(std::complex z) +{ + return (z.imag()); +}; +static float imag_part(rocblas_complex_num z) +{ + return (z.imag()); +}; + +static double imag_part(double z) +{ + return (0); +}; +static double imag_part(std::complex z) +{ + return (z.imag()); +}; +static double imag_part(rocblas_complex_num z) +{ + return (z.imag()); +}; + #ifdef ROCSOLVER_VERIFY_ASSUMPTIONS // Ensure __assert_fail is declared. 
#if !__is_identifier(__assert_fail) @@ -153,4 +221,12 @@ extern "C" [[noreturn]] void __assert_fail(const char* assertion, #define ROCSOLVER_ASSUME_X(invariant, msg) __builtin_assume(invariant) #endif +#ifndef CHECK_HIP +#define CHECK_HIP(fcn) \ + { \ + hipError_t const istat = (fcn); \ + assert(istat == hipSuccess); \ + } +#endif + ROCSOLVER_END_NAMESPACE From ae7688dad135c45487a3f7dedc670406fa5ac257 Mon Sep 17 00:00:00 2001 From: Troy Alderson <58866654+tfalders@users.noreply.github.com> Date: Fri, 15 Nov 2024 10:56:10 -0700 Subject: [PATCH 32/35] Use ROCSOLVER_KERNEL keyword --- library/src/include/lapack_device_functions.hpp | 4 ++-- library/src/include/lib_device_helpers.hpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/library/src/include/lapack_device_functions.hpp b/library/src/include/lapack_device_functions.hpp index ca388ff11..693581293 100644 --- a/library/src/include/lapack_device_functions.hpp +++ b/library/src/include/lapack_device_functions.hpp @@ -1112,7 +1112,7 @@ ROCSOLVER_KERNEL void axpy_kernel(const rocblas_int n, } template -__global__ static void +ROCSOLVER_KERNEL void rot_kernel(I const n, T* const x, I const incx, T* const y, I const incy, S const c, S const s) { if(n <= 0) @@ -1162,7 +1162,7 @@ static void } template -__global__ static void scal_kernel(I const n, S const da, T* const x, I const incx) +ROCSOLVER_KERNEL void scal_kernel(I const n, S const da, T* const x, I const incx) { if(n <= 0) return; diff --git a/library/src/include/lib_device_helpers.hpp b/library/src/include/lib_device_helpers.hpp index 609097549..83622b8c6 100644 --- a/library/src/include/lib_device_helpers.hpp +++ b/library/src/include/lib_device_helpers.hpp @@ -1238,7 +1238,7 @@ __device__ static void permute_swap(const I n, T* C, I ldc, I* map, const I nev } template -__global__ static void swap_kernel(I const n, T* const x, I const incx, T* const y, I const incy) +ROCSOLVER_KERNEL void swap_kernel(I const n, T* const x, I const incx, T* const y, I const incy) { if(n <= 0) return; From 859bb2f7028660219b72d8630ca24c5fe837a62a Mon Sep 17 00:00:00 2001 From: jzuniga-amd Date: Fri, 15 Nov 2024 13:26:51 -0700 Subject: [PATCH 33/35] move kernel launchers --- .../auxiliary/rocauxiliary_bdsqr_hybrid.hpp | 37 +++++++++++++++++++ .../src/include/lapack_device_functions.hpp | 27 +++----------- library/src/include/lib_device_helpers.hpp | 13 ++----- 3 files changed, 46 insertions(+), 31 deletions(-) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp index 56870cb44..77c86542d 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp @@ -38,6 +38,43 @@ ROCSOLVER_BEGIN_NAMESPACE +/************************************************************************************/ +/***************** Kernel launchers *************************************************/ +/************************************************************************************/ + +template +static void swap_template(I const n, T* x, I const incx, T* y, I const incy, hipStream_t stream) +{ + auto nthreads = warpSize * 2; + auto nblocks = (n - 1) / nthreads + 1; + + hipLaunchKernelGGL((swap_kernel), dim3(nblocks, 1, 1), dim3(nthreads, 1, 1), 0, stream, + n, x, incx, y, incy); +} + +template +static void + rot_template(I const n, T* x, I const incx, T* y, I const incy, S const c, S const s, hipStream_t stream) +{ + auto nthreads = warpSize * 2; + auto nblocks = (n - 1) / nthreads + 1; + + 
hipLaunchKernelGGL((rot_kernel), dim3(nblocks, 1, 1), dim3(nthreads, 1, 1), 0, stream, + n, x, incx, y, incy, c, s); +} + +template +static void scal_template(I const n, S const da, T* const x, I const incx, hipStream_t stream) +{ + auto nthreads = warpSize * 2; + auto nblocks = (n - 1) / nthreads + 1; + + hipLaunchKernelGGL((scal_kernel), dim3(nblocks, 1, 1), dim3(nthreads, 1, 1), 0, stream, + n, da, x, incx); +} + +/** Call to lasr functionality. + lasr_body can be executed as a host or device function **/ template static void call_lasr(rocblas_side& side, rocblas_pivot& pivot, diff --git a/library/src/include/lapack_device_functions.hpp b/library/src/include/lapack_device_functions.hpp index 693581293..d10f68fa6 100644 --- a/library/src/include/lapack_device_functions.hpp +++ b/library/src/include/lapack_device_functions.hpp @@ -1111,6 +1111,9 @@ ROCSOLVER_KERNEL void axpy_kernel(const rocblas_int n, } } +/** ROT applies a Givens rotation between to vector x y of dimension n. + Launch this kernel with a desired number of threads organized in + NG groups in the x direction with NT threads in the x direction. **/ template ROCSOLVER_KERNEL void rot_kernel(I const n, T* const x, I const incx, T* const y, I const incy, S const c, S const s) @@ -1150,17 +1153,9 @@ ROCSOLVER_KERNEL void } } -template -static void - rot_template(I const n, T* x, I const incx, T* y, I const incy, S const c, S const s, hipStream_t stream) -{ - auto nthreads = warpSize * 2; - auto nblocks = (n - 1) / nthreads + 1; - - hipLaunchKernelGGL((rot_kernel), dim3(nblocks, 1, 1), dim3(nthreads, 1, 1), 0, stream, - n, x, incx, y, incy, c, s); -} - +/** SCAL scales a vector x of dimension n by a factor da. + Launch this kernel with a desired number of threads organized in + NG groups in the x direction with NT threads in the x direction. **/ template ROCSOLVER_KERNEL void scal_kernel(I const n, S const da, T* const x, I const incx) { @@ -1193,14 +1188,4 @@ ROCSOLVER_KERNEL void scal_kernel(I const n, S const da, T* const x, I const inc } } -template -static void scal_template(I const n, S const da, T* const x, I const incx, hipStream_t stream) -{ - auto nthreads = warpSize * 2; - auto nblocks = (n - 1) / nthreads + 1; - - hipLaunchKernelGGL((scal_kernel), dim3(nblocks, 1, 1), dim3(nthreads, 1, 1), 0, stream, - n, da, x, incx); -} - ROCSOLVER_END_NAMESPACE diff --git a/library/src/include/lib_device_helpers.hpp b/library/src/include/lib_device_helpers.hpp index 83622b8c6..dc011f03a 100644 --- a/library/src/include/lib_device_helpers.hpp +++ b/library/src/include/lib_device_helpers.hpp @@ -1237,6 +1237,9 @@ __device__ static void permute_swap(const I n, T* C, I ldc, I* map, const I nev #endif } +/** SWAP swaps the values of vectors x and y of dimension n. + Launch this kernel with a desired number of threads organized in + NG groups in the x direction with NT threads in the x direction. 
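+    For example, the swap_template launcher in rocauxiliary_bdsqr_hybrid.hpp uses
+    NT = 2 * warpSize and NG = (n - 1) / NT + 1.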
**/ template ROCSOLVER_KERNEL void swap_kernel(I const n, T* const x, I const incx, T* const y, I const incy) { @@ -1276,14 +1279,4 @@ ROCSOLVER_KERNEL void swap_kernel(I const n, T* const x, I const incx, T* const } } -template -static void swap_template(I const n, T* x, I const incx, T* y, I const incy, hipStream_t stream) -{ - auto nthreads = warpSize * 2; - auto nblocks = (n - 1) / nthreads + 1; - - hipLaunchKernelGGL((swap_kernel), dim3(nblocks, 1, 1), dim3(nthreads, 1, 1), 0, stream, - n, x, incx, y, incy); -} - ROCSOLVER_END_NAMESPACE From 30696b173bcd1f2dadb9c54a83a3492723e7b769 Mon Sep 17 00:00:00 2001 From: Troy Alderson <58866654+tfalders@users.noreply.github.com> Date: Mon, 18 Nov 2024 16:02:49 -0700 Subject: [PATCH 34/35] Addressed review comments --- .../include/rocsolver/rocsolver-functions.h | 16 +++---- .../src/include/lapack_device_functions.hpp | 10 ++-- library/src/include/lapack_host_functions.hpp | 46 ++++++++++--------- library/src/include/lib_host_helpers.hpp | 28 +++++------ 4 files changed, 52 insertions(+), 48 deletions(-) diff --git a/library/include/rocsolver/rocsolver-functions.h b/library/include/rocsolver/rocsolver-functions.h index 3f340ea70..bd9a0592d 100644 --- a/library/include/rocsolver/rocsolver-functions.h +++ b/library/include/rocsolver/rocsolver-functions.h @@ -3871,8 +3871,8 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_zunmtr(rocblas_handle handle, rocblas_handle. \note - A hybrid (CPU+GPU) approach is available for BDSQR. Use \ref rocsolver_set_alg_mode - to enable it. + A hybrid (CPU+GPU) approach is available for BDSQR, primarily intended for homogeneous architectures. + Use \ref rocsolver_set_alg_mode to enable it. @param[in] handle rocblas_handle. @@ -12613,8 +12613,8 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_zpotri_strided_batched(rocblas_handle within the rocblas_handle. \note - A hybrid (CPU+GPU) approach is available for GESVD. Use \ref rocsolver_set_alg_mode - to enable it. + A hybrid (CPU+GPU) approach is available for GESVD, primarily intended for homogeneous architectures. + Use \ref rocsolver_set_alg_mode to enable it. @param[in] handle rocblas_handle. @@ -12794,8 +12794,8 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_zgesvd(rocblas_handle handle, within the rocblas_handle. \note - A hybrid (CPU+GPU) approach is available for GESVD_BATCHED. Use \ref rocsolver_set_alg_mode - to enable it. + A hybrid (CPU+GPU) approach is available for GESVD_BATCHED, primarily intended for + homogeneous architectures. Use \ref rocsolver_set_alg_mode to enable it. @param[in] handle rocblas_handle. @@ -13016,8 +13016,8 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_zgesvd_batched(rocblas_handle handle, within the rocblas_handle. \note - A hybrid (CPU+GPU) approach is available for GESVD_STRIDED_BATCHED. Use \ref rocsolver_set_alg_mode - to enable it. + A hybrid (CPU+GPU) approach is available for GESVD_STRIDED_BATCHED, primarily intended + for homogeneous architectures. Use \ref rocsolver_set_alg_mode to enable it. @param[in] handle rocblas_handle. 
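The hybrid-mode notes above all defer to rocsolver_set_alg_mode. A minimal sketch of enabling the hybrid BDSQR path on a handle follows; only the function name is confirmed by the documentation above, while the enum spellings rocsolver_function_bdsqr and rocsolver_alg_mode_hybrid are assumptions, so check the rocSOLVER headers for the exact identifiers.

    #include <rocsolver/rocsolver.h>

    // Illustrative sketch: request the CPU+GPU (hybrid) BDSQR algorithm on this handle
    // before calling rocsolver_<type>bdsqr. The enum names below are assumed, not confirmed.
    rocblas_status enable_hybrid_bdsqr(rocblas_handle handle)
    {
        return rocsolver_set_alg_mode(handle, rocsolver_function_bdsqr, rocsolver_alg_mode_hybrid);
    }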
diff --git a/library/src/include/lapack_device_functions.hpp b/library/src/include/lapack_device_functions.hpp index d10f68fa6..6e4760c44 100644 --- a/library/src/include/lapack_device_functions.hpp +++ b/library/src/include/lapack_device_functions.hpp @@ -1144,8 +1144,8 @@ ROCSOLVER_KERNEL void for(auto i = i_start; i < n; i += i_inc) { - auto const ix = 0 + i * static_cast(incx); - auto const iy = 0 + i * static_cast(incy); + auto const ix = i * static_cast(incx); + auto const iy = i * static_cast(incy); auto const temp = c * x[ix] + s * y[iy]; y[iy] = c * y[iy] - s * x[ix]; x[ix] = temp; @@ -1171,7 +1171,7 @@ ROCSOLVER_KERNEL void scal_kernel(I const n, S const da, T* const x, I const inc { for(I i = i_start; i < n; i += i_inc) { - x[i] = (is_da_zero) ? zero : da * x[i]; + x[i] = da * x[i]; } } else @@ -1182,8 +1182,8 @@ ROCSOLVER_KERNEL void scal_kernel(I const n, S const da, T* const x, I const inc for(I i = i_start; i < n; i += i_inc) { - auto const ix = 0 + i * static_cast(incx); - x[ix] = (is_da_zero) ? zero : da * x[ix]; + auto const ix = i * static_cast(incx); + x[ix] = da * x[ix]; } } } diff --git a/library/src/include/lapack_host_functions.hpp b/library/src/include/lapack_host_functions.hpp index cb10a233e..c120151ce 100644 --- a/library/src/include/lapack_host_functions.hpp +++ b/library/src/include/lapack_host_functions.hpp @@ -40,20 +40,32 @@ ROCSOLVER_BEGIN_NAMESPACE * =========================================================================== */ -static void call_lamch(char& cmach, double& eps) +static void call_lamch(char& cmach, float& eps) { - eps = ((cmach == 'E') || (cmach == 'e')) ? std::numeric_limits::epsilon() - : ((cmach == 'S') || (cmach == 's')) ? std::numeric_limits::min() - : ((cmach == 'B') || (cmach == 'b')) ? FLT_RADIX - : std::numeric_limits::min(); + switch(cmach) + { + case 'E': + case 'e': eps = std::numeric_limits::epsilon(); return; + case 'S': + case 's': eps = std::numeric_limits::min(); return; + case 'B': + case 'b': eps = FLT_RADIX; return; + default: eps = std::numeric_limits::min(); + } } -static void call_lamch(char& cmach, float& eps) +static void call_lamch(char& cmach, double& eps) { - eps = ((cmach == 'E') || (cmach == 'e')) ? std::numeric_limits::epsilon() - : ((cmach == 'S') || (cmach == 's')) ? std::numeric_limits::min() - : ((cmach == 'B') || (cmach == 'b')) ? 
FLT_RADIX - : std::numeric_limits::min(); + switch(cmach) + { + case 'E': + case 'e': eps = std::numeric_limits::epsilon(); return; + case 'S': + case 's': eps = std::numeric_limits::min(); return; + case 'B': + case 'b': eps = FLT_RADIX; return; + default: eps = std::numeric_limits::min(); + } } template @@ -141,8 +153,7 @@ static void call_lartg(T& f, T& g, S& cs, T& sn, T& r) auto dimag = [](auto z) { return (static_cast(imag_part(z))); }; auto disnan = [](auto x) -> bool { return (isnan(x)); }; auto dcmplx = [](auto x, auto y) -> T { - bool constexpr is_complex_type - = !(std::is_same::value || std::is_same::value); + bool constexpr is_complex_type = rocblas_is_complex; if constexpr(is_complex_type) { @@ -348,15 +359,8 @@ static void call_scal(I& n, S& a, T& x_in, I& incx) for(I i = 0; i < n; i++) { auto const ip = i * incx; - if(is_zero) - { - x[ip] = 0; - } - else - { - x[ip] *= a; - } - }; + x[ip] *= a; + } } template diff --git a/library/src/include/lib_host_helpers.hpp b/library/src/include/lib_host_helpers.hpp index a11bf3bc2..8dfad5014 100644 --- a/library/src/include/lib_host_helpers.hpp +++ b/library/src/include/lib_host_helpers.hpp @@ -99,8 +99,8 @@ static void call_swap(I& n, T& x_in, I& incx, T& y_in, I& incy) T* const y = &(y_in); for(I i = 0; i < n; i++) { - I const ix = i * incx; - I const iy = i * incy; + auto const ix = i * static_cast(incx); + auto const iy = i * static_cast(incy); T const temp = x[ix]; x[ix] = y[iy]; @@ -111,54 +111,54 @@ static void call_swap(I& n, T& x_in, I& incx, T& y_in, I& incy) static float real_part(float z) { return (z); -}; +} static float real_part(std::complex z) { return (z.real()); -}; +} static float real_part(rocblas_complex_num z) { return (z.real()); -}; +} static double real_part(double z) { return (z); -}; +} static double real_part(std::complex z) { return (z.real()); -}; +} static double real_part(rocblas_complex_num z) { return (z.real()); -}; +} static float imag_part(float z) { return (0); -}; +} static float imag_part(std::complex z) { return (z.imag()); -}; +} static float imag_part(rocblas_complex_num z) { return (z.imag()); -}; +} static double imag_part(double z) { return (0); -}; +} static double imag_part(std::complex z) { return (z.imag()); -}; +} static double imag_part(rocblas_complex_num z) { return (z.imag()); -}; +} #ifdef ROCSOLVER_VERIFY_ASSUMPTIONS // Ensure __assert_fail is declared. 
From a08ca4f5c6924c6af263b92312478ae521c74b83 Mon Sep 17 00:00:00 2001 From: Troy Alderson <58866654+tfalders@users.noreply.github.com> Date: Mon, 18 Nov 2024 16:12:18 -0700 Subject: [PATCH 35/35] Addressed review comment --- library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp index 77c86542d..69584cc01 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp @@ -159,12 +159,6 @@ static void bdsqr_single_template(rocblas_handle handle, CHECK_HIP(hipStreamSynchronize(stream)); }; - auto indx2f = [](auto i, auto j, auto ld) -> int64_t { - assert((1 <= i) && (i <= ld)); - assert((1 <= j)); - return ((i - 1) + (j - 1) * int64_t(ld)); - }; - auto d = [=](auto i) -> S& { assert((1 <= i) && (i <= n)); return (d_[i - 1]); @@ -182,19 +176,19 @@ static void bdsqr_single_template(rocblas_handle handle, auto c = [=](auto i, auto j) -> T& { assert((1 <= i) && (i <= nrc) && (nrc <= ldc)); assert((1 <= j) && (j <= ncc)); - return (c_[indx2f(i, j, ldc)]); + return (c_[idx2D(i - 1, j - 1, ldc)]); }; auto u = [=](auto i, auto j) -> T& { assert((1 <= i) && (i <= nru) && (nru <= ldu)); assert((1 <= j) && (j <= ncu)); - return (u_[indx2f(i, j, ldu)]); + return (u_[idx2D(i - 1, j - 1, ldu)]); }; auto vt = [=](auto i, auto j) -> T& { assert((1 <= i) && (i <= nrvt) && (nrvt <= ldvt)); assert((1 <= j) && (j <= ncvt)); - return (vt_[indx2f(i, j, ldvt)]); + return (vt_[idx2D(i - 1, j - 1, ldvt)]); }; auto sign = [](auto a, auto b) {