diff --git a/CHANGELOG.md b/CHANGELOG.md index 45187e998..c919d5722 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,11 @@ Full documentation for rocSOLVER is available at the [rocSOLVER documentation](h * Application of a sequence of plane rotations to a given matrix - LASR +* Algorithm selection APIs for hybrid computation +* SVD of bidiagonal matrices routine: + - BDSQR now supports hybrid computation +* SVD of general matrices routine: + - GESVD now supports hybrid computation ### Changed ### Removed diff --git a/CMakeLists.txt b/CMakeLists.txt index 91a80175c..d8e77749f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -204,7 +204,7 @@ find_package(hip REQUIRED CONFIG PATHS ${ROCM_PATH} /opt/rocm) find_package(rocblas REQUIRED CONFIG PATHS ${ROCM_PATH}) get_imported_target_location(location roc::rocblas) message(STATUS "Found rocBLAS: ${location}") -set(rocblas_minimum 4.1) +set(rocblas_minimum 4.4) rocm_package_add_dependencies(SHARED_DEPENDS "rocblas >= ${rocblas_minimum}") rocm_package_add_rpm_dependencies(STATIC_DEPENDS "rocblas-static-devel >= ${rocblas_minimum}") rocm_package_add_deb_dependencies(STATIC_DEPENDS "rocblas-static-dev >= ${rocblas_minimum}") diff --git a/clients/benchmarks/client.cpp b/clients/benchmarks/client.cpp index 82669d341..16a244199 100644 --- a/clients/benchmarks/client.cpp +++ b/clients/benchmarks/client.cpp @@ -125,6 +125,13 @@ try " Reported time will be the average.\n" " ") + ("alg_mode", + value(&argus.alg_mode)->default_value(0), + "0 = GPU-only, 1 = Hybrid\n" + " This will change how the algorithm operates.\n" + " Only applicable to functions with hybrid support.\n" + " ") + ("mem_query", value(&argus.mem_query)->default_value(0), "Calculate the required amount of device workspace memory? 
0 = No, 1 = Yes.\n" diff --git a/clients/common/auxiliary/testing_bdsqr.hpp b/clients/common/auxiliary/testing_bdsqr.hpp index 2f2a984f4..058a923b4 100644 --- a/clients/common/auxiliary/testing_bdsqr.hpp +++ b/clients/common/auxiliary/testing_bdsqr.hpp @@ -478,6 +478,19 @@ void testing_bdsqr(Arguments& argus) rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int hot_calls = argus.iters; + if(argus.alg_mode) + { + EXPECT_ROCBLAS_STATUS( + rocsolver_set_alg_mode(handle, rocsolver_function_bdsqr, rocsolver_alg_mode_hybrid), + rocblas_status_success); + + rocsolver_alg_mode alg_mode; + EXPECT_ROCBLAS_STATUS(rocsolver_get_alg_mode(handle, rocsolver_function_bdsqr, &alg_mode), + rocblas_status_success); + + EXPECT_EQ(alg_mode, rocsolver_alg_mode_hybrid); + } + // check non-supported values if(uplo != rocblas_fill_upper && uplo != rocblas_fill_lower) { diff --git a/clients/common/lapack/testing_gesvd.hpp b/clients/common/lapack/testing_gesvd.hpp index 2fbf49dcd..de7fa10d3 100644 --- a/clients/common/lapack/testing_gesvd.hpp +++ b/clients/common/lapack/testing_gesvd.hpp @@ -548,6 +548,19 @@ void testing_gesvd(Arguments& argus) rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; + if(argus.alg_mode) + { + EXPECT_ROCBLAS_STATUS( + rocsolver_set_alg_mode(handle, rocsolver_function_gesvd, rocsolver_alg_mode_hybrid), + rocblas_status_success); + + rocsolver_alg_mode alg_mode; + EXPECT_ROCBLAS_STATUS(rocsolver_get_alg_mode(handle, rocsolver_function_gesvd, &alg_mode), + rocblas_status_success); + + EXPECT_EQ(alg_mode, rocsolver_alg_mode_hybrid); + } + // check non-supported values if(rightv == rocblas_svect_overwrite && leftv == rocblas_svect_overwrite) { diff --git a/clients/common/misc/rocsolver_arguments.hpp b/clients/common/misc/rocsolver_arguments.hpp index cfcb887eb..a8ffd9e73 100644 --- a/clients/common/misc/rocsolver_arguments.hpp +++ b/clients/common/misc/rocsolver_arguments.hpp @@ -54,6 +54,7 @@ class Arguments : private std::map rocblas_int 
perf = 0; rocblas_int singular = 0; rocblas_int iters = 5; + rocblas_int alg_mode = 0; rocblas_int mem_query = 0; rocblas_int profile = 0; rocblas_int profile_kernels = 0; @@ -112,6 +113,7 @@ class Arguments : private std::map to_consume.erase("batch_count"); to_consume.erase("verify"); to_consume.erase("iters"); + to_consume.erase("alg_mode"); to_consume.erase("mem_query"); to_consume.erase("profile"); to_consume.erase("profile_kernels"); diff --git a/clients/gtest/auxiliary/bdsqr_gtest.cpp b/clients/gtest/auxiliary/bdsqr_gtest.cpp index b963f6029..4e78b0bea 100644 --- a/clients/gtest/auxiliary/bdsqr_gtest.cpp +++ b/clients/gtest/auxiliary/bdsqr_gtest.cpp @@ -112,7 +112,8 @@ Arguments bdsqr_setup_arguments(bdsqr_tuple tup) return arg; } -class BDSQR : public ::TestWithParam +template +class BDSQR_BASE : public ::TestWithParam { protected: void TearDown() override @@ -124,6 +125,7 @@ class BDSQR : public ::TestWithParam void run_tests() { Arguments arg = bdsqr_setup_arguments(GetParam()); + arg.alg_mode = MODE; if(arg.peek("n") == 0 && arg.peek("uplo") == 'L') testing_bdsqr_bad_arg(); @@ -132,6 +134,14 @@ class BDSQR : public ::TestWithParam } }; +class BDSQR : public BDSQR_BASE<0> +{ +}; + +class BDSQR_HYBRID : public BDSQR_BASE<1> +{ +}; + // non-batch tests TEST_P(BDSQR, __float) @@ -154,8 +164,36 @@ TEST_P(BDSQR, __double_complex) run_tests(); } +TEST_P(BDSQR_HYBRID, __float) +{ + run_tests(); +} + +TEST_P(BDSQR_HYBRID, __double) +{ + run_tests(); +} + +TEST_P(BDSQR_HYBRID, __float_complex) +{ + run_tests(); +} + +TEST_P(BDSQR_HYBRID, __double_complex) +{ + run_tests(); +} + INSTANTIATE_TEST_SUITE_P(daily_lapack, BDSQR, Combine(ValuesIn(large_size_range), ValuesIn(large_opt_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, BDSQR, Combine(ValuesIn(size_range), ValuesIn(opt_range))); + +INSTANTIATE_TEST_SUITE_P(daily_lapack, + BDSQR_HYBRID, + Combine(ValuesIn(large_size_range), ValuesIn(large_opt_range))); + +INSTANTIATE_TEST_SUITE_P(checkin_lapack, + 
BDSQR_HYBRID, + Combine(ValuesIn(size_range), ValuesIn(opt_range))); diff --git a/clients/gtest/lapack/gesvd_gtest.cpp b/clients/gtest/lapack/gesvd_gtest.cpp index 51acceda3..3ee770c1d 100644 --- a/clients/gtest/lapack/gesvd_gtest.cpp +++ b/clients/gtest/lapack/gesvd_gtest.cpp @@ -158,7 +158,8 @@ Arguments gesvd_setup_arguments(gesvd_tuple tup) return arg; } -class GESVD : public ::TestWithParam +template +class GESVD_BASE : public ::TestWithParam { protected: void TearDown() override @@ -170,6 +171,7 @@ class GESVD : public ::TestWithParam void run_tests() { Arguments arg = gesvd_setup_arguments(GetParam()); + arg.alg_mode = MODE; if(arg.peek("m") == 0 && arg.peek("n") == 0 && arg.peek("left_svect") == 'N' && arg.peek("right_svect") == 'N') @@ -180,6 +182,14 @@ class GESVD : public ::TestWithParam } }; +class GESVD : public GESVD_BASE<0> +{ +}; + +class GESVD_HYBRID : public GESVD_BASE<1> +{ +}; + // non-batch tests TEST_P(GESVD, __float) @@ -202,6 +212,26 @@ TEST_P(GESVD, __double_complex) run_tests(); } +TEST_P(GESVD_HYBRID, __float) +{ + run_tests(); +} + +TEST_P(GESVD_HYBRID, __double) +{ + run_tests(); +} + +TEST_P(GESVD_HYBRID, __float_complex) +{ + run_tests(); +} + +TEST_P(GESVD_HYBRID, __double_complex) +{ + run_tests(); +} + // batched tests TEST_P(GESVD, batched__float) @@ -224,6 +254,26 @@ TEST_P(GESVD, batched__double_complex) run_tests(); } +TEST_P(GESVD_HYBRID, batched__float) +{ + run_tests(); +} + +TEST_P(GESVD_HYBRID, batched__double) +{ + run_tests(); +} + +TEST_P(GESVD_HYBRID, batched__float_complex) +{ + run_tests(); +} + +TEST_P(GESVD_HYBRID, batched__double_complex) +{ + run_tests(); +} + // strided_batched tests TEST_P(GESVD, strided_batched__float) @@ -246,11 +296,36 @@ TEST_P(GESVD, strided_batched__double_complex) run_tests(); } -// daily_lapack tests normal execution with medium to large sizes +TEST_P(GESVD_HYBRID, strided_batched__float) +{ + run_tests(); +} + +TEST_P(GESVD_HYBRID, strided_batched__double) +{ + run_tests(); +} + 
+TEST_P(GESVD_HYBRID, strided_batched__float_complex) +{ + run_tests(); +} + +TEST_P(GESVD_HYBRID, strided_batched__double_complex) +{ + run_tests(); +} + INSTANTIATE_TEST_SUITE_P(daily_lapack, GESVD, Combine(ValuesIn(large_size_range), ValuesIn(large_opt_range))); -// checkin_lapack tests normal execution with small sizes, invalid sizes, -// quick returns, and corner cases INSTANTIATE_TEST_SUITE_P(checkin_lapack, GESVD, Combine(ValuesIn(size_range), ValuesIn(opt_range))); + +INSTANTIATE_TEST_SUITE_P(daily_lapack, + GESVD_HYBRID, + Combine(ValuesIn(large_size_range), ValuesIn(large_opt_range))); + +INSTANTIATE_TEST_SUITE_P(checkin_lapack, + GESVD_HYBRID, + Combine(ValuesIn(size_range), ValuesIn(opt_range))); diff --git a/docs/index.rst b/docs/index.rst index d7725421c..e1c39a266 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -8,7 +8,7 @@ rocSOLVER documentation ******************************************************************** -rocSOLVER is an implementation of `LAPACK routines `_ +rocSOLVER is an implementation of `LAPACK routines `_ on top of the :doc:`AMD ROCm platform `. rocSOLVER is implemented in the :doc:`HIP programming language ` and optimized for AMD GPUs. @@ -39,7 +39,7 @@ The rocSOLVER documentation is structured as follows: * :ref:`lapackfunc` * :ref:`lapack-like` * :ref:`refactor` - * :ref:`api_logging` + * :ref:`helpers` * :ref:`tuning_label` * :ref:`deprecated` diff --git a/docs/reference/auxiliary.rst b/docs/reference/auxiliary.rst index 83503e7e2..8bebbf116 100644 --- a/docs/reference/auxiliary.rst +++ b/docs/reference/auxiliary.rst @@ -58,7 +58,7 @@ rocsolver_lacgv() .. doxygenfunction:: rocsolver_zlacgv_64 :outline: .. doxygenfunction:: rocsolver_clacgv_64 - :outline + :outline: .. doxygenfunction:: rocsolver_zlacgv :outline: .. 
doxygenfunction:: rocsolver_clacgv diff --git a/docs/reference/logging.rst b/docs/reference/helpers.rst similarity index 64% rename from docs/reference/logging.rst rename to docs/reference/helpers.rst index 26c02e42f..f8051f03f 100644 --- a/docs/reference/logging.rst +++ b/docs/reference/helpers.rst @@ -2,16 +2,63 @@ :description: rocSOLVER documentation and API reference library :keywords: rocSOLVER, ROCm, API, documentation -.. _api_logging: +.. _helpers: ***************************************************** -rocSOLVER Logging Functions and Library Information +rocSOLVER Library and Logging Functions ***************************************************** -Logging functions +These are helper functions that retrieve information and control some functions of the library. +The helper functions are divided into the following categories: + +* :ref:`lib_info`. These functions return information about the library version. +* :ref:`algo_select`. Functions to select different algorithm modes of certain APIs. +* :ref:`api_logging`. These functions control the :ref:`logging-label` capabilities. + + + +.. _lib_info: + +Library information +=============================== + +.. contents:: List of library information functions + :local: + :backlinks: top + +rocsolver_get_version_string() +------------------------------------ +.. doxygenfunction:: rocsolver_get_version_string + +rocsolver_get_version_string_size() +------------------------------------ +.. doxygenfunction:: rocsolver_get_version_string_size + + + +.. _algo_select: + +Algorithm selection =============================== -These functions control rocSOLVER's :ref:`logging-label` capabilities. +.. contents:: List of algorithm selection functions + :local: + :backlinks: top + +rocsolver_set_alg_mode() +------------------------------------ +.. doxygenfunction:: rocsolver_set_alg_mode + +rocsolver_get_alg_mode() +------------------------------------ +.. doxygenfunction:: rocsolver_get_alg_mode + + + +.. 
_api_logging: + +Logging functions +=============================== .. contents:: List of logging functions :local: @@ -45,22 +92,3 @@ rocsolver_log_flush_profile() --------------------------------- .. doxygenfunction:: rocsolver_log_flush_profile - - -.. _libraryinfo: - -Library information -=============================== - -.. contents:: List of library information functions - :local: - :backlinks: top - -rocsolver_get_version_string() ------------------------------------- -.. doxygenfunction:: rocsolver_get_version_string - -rocsolver_get_version_string_size() ------------------------------------- -.. doxygenfunction:: rocsolver_get_version_string_size - diff --git a/docs/reference/index.rst b/docs/reference/index.rst index b82b12dc2..18a2fb338 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -8,15 +8,15 @@ Reference ######################################## -This section provides technical descriptions and important information about -the different rocSOLVER APIs and library components. +This section provides technical descriptions and important information about +the different rocSOLVER APIs and library components. * :ref:`intro` * :ref:`rocsolver-types` -* :ref:`rocsolver_auxiliary_functions` +* :ref:`rocsolver_auxiliary_functions` * :ref:`lapackfunc` * :ref:`lapack-like` * :ref:`refactor` -* :ref:`api_logging` +* :ref:`helpers` * :ref:`tuning_label` * :ref:`deprecated` diff --git a/docs/reference/lapacklike.rst b/docs/reference/lapacklike.rst index b6b2823c5..68b6881f3 100644 --- a/docs/reference/lapacklike.rst +++ b/docs/reference/lapacklike.rst @@ -547,7 +547,7 @@ rocsolver_syevdx_strided_batched() rocsolver_heevdx() --------------------------------------------------- .. doxygenfunction:: rocsolver_zheevdx - :outline + :outline: .. 
doxygenfunction:: rocsolver_cheevdx rocsolver_heevdx_batched() diff --git a/docs/reference/tuning.rst b/docs/reference/tuning.rst index 82cba73d2..8b59e356a 100644 --- a/docs/reference/tuning.rst +++ b/docs/reference/tuning.rst @@ -180,14 +180,18 @@ GEBRD_GEBD2_SWITCHSIZE bdsqr function ================== -The Singular Value Decomposition of a bidiagonal matrix could be sped up by splitting the matrix into diagonal blocks -and processing those blocks in parallel. +The Singular Value Decomposition of a bidiagonal matrix can be executed with one or multiple thread blocks, and it is +a blocking API that requires synchronization with the host. -BDSQR_SPLIT_GROUPS +BDSQR_SWITCH_SIZE ------------------- -.. doxygendefine:: BDSQR_SPLIT_GROUPS +.. doxygendefine:: BDSQR_SWITCH_SIZE -(As of the current rocSOLVER release, this constant has not been tuned for any specific cases.) +BDSQR_ITERS_PER_SYNC +---------------------- +.. doxygendefine:: BDSQR_ITERS_PER_SYNC + +(As of the current rocSOLVER release, these constants have not been tuned for any specific cases.) diff --git a/docs/reference/types.rst b/docs/reference/types.rst index 6706f0e23..0fb8d716c 100644 --- a/docs/reference/types.rst +++ b/docs/reference/types.rst @@ -70,3 +70,11 @@ rocsolver_rfinfo rocsolver_rfinfo_mode ------------------------ .. doxygenenum:: rocsolver_rfinfo_mode + +rocsolver_alg_mode +------------------------ +.. doxygenenum:: rocsolver_alg_mode + +rocsolver_function +------------------------ +.. 
doxygenenum:: rocsolver_function diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index e5d3606ce..6ad32dfe3 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -23,7 +23,7 @@ subtrees: - file: reference/lapack.rst - file: reference/lapacklike.rst - file: reference/refact.rst - - file: reference/logging.rst + - file: reference/helpers.rst - file: reference/tuning.rst - file: reference/deprecated.rst - file: license.rst diff --git a/library/include/rocsolver/rocsolver-aliases.h b/library/include/rocsolver/rocsolver-aliases.h index fec1ea401..de938bf22 100644 --- a/library/include/rocsolver/rocsolver-aliases.h +++ b/library/include/rocsolver/rocsolver-aliases.h @@ -1,5 +1,5 @@ /* ************************************************************************** - * Copyright (C) 2019-2023 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (C) 2019-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -91,7 +91,7 @@ ROCSOLVER_DEPRECATED_X("use rocblas_fill") typedef rocblas_fill rocsolver_fill; */ ROCSOLVER_DEPRECATED_X("use rocblas_diagonal") typedef rocblas_diagonal rocsolver_diagonal; -/*! \deprecated Use \c rocblas_stide. +/*! \deprecated Use \c rocblas_side. */ ROCSOLVER_DEPRECATED_X("use rocblas_side") typedef rocblas_side rocsolver_side; diff --git a/library/include/rocsolver/rocsolver-extra-types.h b/library/include/rocsolver/rocsolver-extra-types.h index a22d62ce9..3335f8f3d 100644 --- a/library/include/rocsolver/rocsolver-extra-types.h +++ b/library/include/rocsolver/rocsolver-extra-types.h @@ -173,4 +173,21 @@ typedef enum rocblas_pivot_ rocblas_pivot_bottom = 283, /**< The i-th rotation is applied on plane (i,m) or (i,n). */ } rocblas_pivot; +/*! \brief Used by specific functions to specify the algorithm mode. 
+ ********************************************************************************/ +typedef enum rocsolver_alg_mode_ +{ + rocsolver_alg_mode_gpu + = 291, /**< Computations are all performed on the GPU. This is the default mode. */ + rocsolver_alg_mode_hybrid = 292, /**< Computations are performed on the CPU and GPU. */ +} rocsolver_alg_mode; + +/*! \brief Used to specify a function with multiple supported algorithm modes. + ********************************************************************************/ +typedef enum rocsolver_function_ +{ + rocsolver_function_bdsqr = 401, + rocsolver_function_gesvd = 402, +} rocsolver_function; + #endif /* ROCSOLVER_EXTRA_TYPES_H */ diff --git a/library/include/rocsolver/rocsolver-functions.h b/library/include/rocsolver/rocsolver-functions.h index 1964166d8..bd9a0592d 100644 --- a/library/include/rocsolver/rocsolver-functions.h +++ b/library/include/rocsolver/rocsolver-functions.h @@ -141,6 +141,45 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_log_write_profile(void); ROCSOLVER_EXPORT rocblas_status rocsolver_log_flush_profile(void); +/* + * =========================================================================== + * Hybrid algorithm enablement + * =========================================================================== + */ + +/*! \brief SET_ALG_MODE sets the algorithm mode to be used by the specified function. + + @param[in] + handle rocblas_handle. + @param[in] + func #rocsolver_function. + The function that will use the selected algorithm mode. + @param[in] + mode #rocsolver_alg_mode. + The algorithm mode that will be used by the specified function. + *************************************************************************/ + +ROCSOLVER_EXPORT rocblas_status rocsolver_set_alg_mode(rocblas_handle handle, + const rocsolver_function func, + const rocsolver_alg_mode mode); + +/*! \brief GET_ALG_MODE gets the algorithm mode being used by the specified function. + + @param[in] + handle rocblas_handle. 
+ @param[in] + func #rocsolver_function. + The specified function. + @param[out] + mode pointer to #rocsolver_alg_mode. + On exit, the value is overwritten by the algorithm mode used + by the specified function. + *************************************************************************/ + +ROCSOLVER_EXPORT rocblas_status rocsolver_get_alg_mode(rocblas_handle handle, + const rocsolver_function func, + rocsolver_alg_mode* mode); + /* * =========================================================================== * Auxiliary functions @@ -3831,6 +3870,10 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_zunmtr(rocblas_handle handle, In order to carry out calculations, this method may synchronize the stream contained within the rocblas_handle. + \note + A hybrid (CPU+GPU) approach is available for BDSQR, primarily intended for homogeneous architectures. + Use \ref rocsolver_set_alg_mode to enable it. + @param[in] handle rocblas_handle. @param[in] @@ -12566,8 +12609,12 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_zpotri_strided_batched(rocblas_handle the "Tuning rocSOLVER performance" and "Memory model" sections of the documentation. \note - In order to carry out calculations, this method may synchronize the stream contained within the - rocblas_handle. + In order to carry out calculations, this method may synchronize the stream contained + within the rocblas_handle. + + \note + A hybrid (CPU+GPU) approach is available for GESVD, primarily intended for homogeneous architectures. + Use \ref rocsolver_set_alg_mode to enable it. @param[in] handle rocblas_handle. @@ -12743,8 +12790,12 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_zgesvd(rocblas_handle handle, and "Memory model" sections of the documentation. \note - In order to carry out calculations, this method may synchronize the stream contained within the - rocblas_handle. + In order to carry out calculations, this method may synchronize the stream contained + within the rocblas_handle. 
+ + \note + A hybrid (CPU+GPU) approach is available for GESVD_BATCHED, primarily intended for + homogeneous architectures. Use \ref rocsolver_set_alg_mode to enable it. @param[in] handle rocblas_handle. @@ -12961,8 +13012,12 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_zgesvd_batched(rocblas_handle handle, and "Memory model" sections of the documentation. \note - In order to carry out calculations, this method may synchronize the stream contained within the - rocblas_handle. + In order to carry out calculations, this method may synchronize the stream contained + within the rocblas_handle. + + \note + A hybrid (CPU+GPU) approach is available for GESVD_STRIDED_BATCHED, primarily intended + for homogeneous architectures. Use \ref rocsolver_set_alg_mode to enable it. @param[in] handle rocblas_handle. diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt index 5a6203457..e4033ea06 100755 --- a/library/src/CMakeLists.txt +++ b/library/src/CMakeLists.txt @@ -345,6 +345,7 @@ endif() set(auxiliaries common/buildinfo.cpp + common/rocsolver_handle.cpp common/rocsolver_logger.cpp common/rocsparse.cpp ) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr.hpp b/library/src/auxiliary/rocauxiliary_bdsqr.hpp index b89fd297d..c19ae4f9b 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr.hpp @@ -38,6 +38,8 @@ #include +#include "rocauxiliary_bdsqr_hybrid.hpp" + ROCSOLVER_BEGIN_NAMESPACE /******************** Device functions *************************/ @@ -1207,6 +1209,9 @@ rocblas_status rocsolver_bdsqr_template(rocblas_handle handle, hipStream_t stream; rocblas_get_stream(handle, &stream); + rocsolver_alg_mode alg_mode; + ROCBLAS_CHECK(rocsolver_get_alg_mode(handle, rocsolver_function_bdsqr, &alg_mode)); + // set tolerance and max number of iterations: // machine precision (considering rounding strategy) S eps = get_epsilon() / 2; @@ -1251,70 +1256,81 @@ rocblas_status rocsolver_bdsqr_template(rocblas_handle 
handle, if(n > 1) { - // rotate to upper bidiagonal if necessary - if(uplo == rocblas_fill_lower) + if(alg_mode == rocsolver_alg_mode_hybrid) { - ROCSOLVER_LAUNCH_KERNEL((bdsqr_lower2upper), gridBasic, threadsUC, 0, stream, n, nu, - nc, D, strideD, E, strideE, U, shiftU, ldu, strideU, C, shiftC, - ldc, strideC, info, work, strideW, completed); + ROCBLAS_CHECK(rocsolver_bdsqr_host_batch_template( + handle, uplo, n, nv, nu, nc, D, strideD, E, strideE, V, shiftV, ldv, strideV, U, + shiftU, ldu, strideU, C, shiftC, ldc, strideC, info, batch_count, splits_map, work)); } - - rocblas_int h_iter = 0; - struct + else { - rocblas_int completed; - rocblas_int num_splits; - } h_params; + // rotate to upper bidiagonal if necessary + if(uplo == rocblas_fill_lower) + { + ROCSOLVER_LAUNCH_KERNEL((bdsqr_lower2upper), gridBasic, threadsUC, 0, stream, n, + nu, nc, D, strideD, E, strideE, U, shiftU, ldu, strideU, C, + shiftC, ldc, strideC, info, work, strideW, completed); + } - while(h_iter < maxiter) - { - // if all instances in the batch have finished, exit the loop - HIP_CHECK(hipMemcpyAsync(&h_params, completed, sizeof(h_params), hipMemcpyDeviceToHost, - stream)); - HIP_CHECK(hipStreamSynchronize(stream)); + rocblas_int h_iter = 0; + struct + { + rocblas_int completed; + rocblas_int num_splits; + } h_params; - if(h_params.completed == batch_count) - break; + while(h_iter < maxiter) + { + // if all instances in the batch have finished, exit the loop + HIP_CHECK(hipMemcpyAsync(&h_params, completed, sizeof(h_params), + hipMemcpyDeviceToHost, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); - dim3 gridSplits(1, h_params.num_splits, batch_count); - dim3 gridVUC((nvuc_max - 1) / BS1 + 1, h_params.num_splits, batch_count); + if(h_params.completed == batch_count) + break; - for(rocblas_int inner_iters = 0; inner_iters < BDSQR_ITERS_PER_SYNC; inner_iters++) - { - if(nvuc_max <= BDSQR_SWITCH_SIZE) - { - // main computation of SVD - ROCSOLVER_LAUNCH_KERNEL((bdsqr_compute), gridSplits, 
threadsBS1, 0, - stream, n, nv, nu, nc, D, strideD, E, strideE, V, - shiftV, ldv, strideV, U, shiftU, ldu, strideU, C, - shiftC, ldc, strideC, maxiter, eps, sfm, tol, minshift, - splits_map, work, incW, strideW, completed); - } - else + dim3 gridSplits(1, h_params.num_splits, batch_count); + dim3 gridVUC((nvuc_max - 1) / BS1 + 1, h_params.num_splits, batch_count); + + for(rocblas_int inner_iters = 0; inner_iters < BDSQR_ITERS_PER_SYNC; inner_iters++) { - // main computation of SVD - ROCSOLVER_LAUNCH_KERNEL( - (bdsqr_compute), gridSplits, threadsBS1, 0, stream, n, nv, nu, nc, - D, strideD, E, strideE, (W1) nullptr, shiftV, ldv, strideV, (W2) nullptr, - shiftU, ldu, strideU, (W3) nullptr, shiftC, ldc, strideC, maxiter, eps, sfm, - tol, minshift, splits_map, work, incW, strideW, completed); - - // update singular vectors - ROCSOLVER_LAUNCH_KERNEL((bdsqr_rotate), gridVUC, threadsVUC, 0, stream, n, - nv, nu, nc, V, shiftV, ldv, strideV, U, shiftU, ldu, - strideU, C, shiftC, ldc, strideC, maxiter, splits_map, - work, incW, strideW, completed); + if(nvuc_max <= BDSQR_SWITCH_SIZE) + { + // main computation of SVD + ROCSOLVER_LAUNCH_KERNEL((bdsqr_compute), gridSplits, threadsBS1, 0, + stream, n, nv, nu, nc, D, strideD, E, strideE, V, + shiftV, ldv, strideV, U, shiftU, ldu, strideU, C, + shiftC, ldc, strideC, maxiter, eps, sfm, tol, + minshift, splits_map, work, incW, strideW, completed); + } + else + { + // main computation of SVD + ROCSOLVER_LAUNCH_KERNEL((bdsqr_compute), gridSplits, threadsBS1, 0, + stream, n, nv, nu, nc, D, strideD, E, strideE, + (W1) nullptr, shiftV, ldv, strideV, (W2) nullptr, + shiftU, ldu, strideU, (W3) nullptr, shiftC, ldc, + strideC, maxiter, eps, sfm, tol, minshift, + splits_map, work, incW, strideW, completed); + + // update singular vectors + ROCSOLVER_LAUNCH_KERNEL((bdsqr_rotate), gridVUC, threadsVUC, 0, stream, + n, nv, nu, nc, V, shiftV, ldv, strideV, U, shiftU, + ldu, strideU, C, shiftC, ldc, strideC, maxiter, + splits_map, work, incW, 
strideW, completed); + } + + // update split block endpoints + ROCSOLVER_LAUNCH_KERNEL((bdsqr_update_endpoints), gridSplits, threadsBasic, + 0, stream, n, E, strideE, splits_map, work, strideW, + completed); } - // update split block endpoints - ROCSOLVER_LAUNCH_KERNEL((bdsqr_update_endpoints), gridSplits, threadsBasic, 0, - stream, n, E, strideE, splits_map, work, strideW, completed); + // check for completion + h_iter += BDSQR_ITERS_PER_SYNC; + ROCSOLVER_LAUNCH_KERNEL((bdsqr_chk_completed), gridBasic, threadsBasic, 0, + stream, n, maxiter, splits_map, work, strideW, completed); } - - // check for completion - h_iter += BDSQR_ITERS_PER_SYNC; - ROCSOLVER_LAUNCH_KERNEL((bdsqr_chk_completed), gridBasic, threadsBasic, 0, stream, n, - maxiter, splits_map, work, strideW, completed); } } diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp new file mode 100644 index 000000000..69584cc01 --- /dev/null +++ b/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp @@ -0,0 +1,1662 @@ + +/**************************************************************************** + * Derived from the BSD3-licensed + * LAPACK routine (version 3.7.1) -- + * Univ. of Tennessee, Univ. of California Berkeley, + * Univ. of Colorado Denver and NAG Ltd.. + * June 2017 + * Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * *************************************************************************/ + +#pragma once + +#include "lapack_host_functions.hpp" +#include "rocauxiliary_lasr.hpp" + +ROCSOLVER_BEGIN_NAMESPACE + +/************************************************************************************/ +/***************** Kernel launchers *************************************************/ +/************************************************************************************/ + +template +static void swap_template(I const n, T* x, I const incx, T* y, I const incy, hipStream_t stream) +{ + auto nthreads = warpSize * 2; + auto nblocks = (n - 1) / nthreads + 1; + + hipLaunchKernelGGL((swap_kernel), dim3(nblocks, 1, 1), dim3(nthreads, 1, 1), 0, stream, + n, x, incx, y, incy); +} + +template +static void + rot_template(I const n, T* x, I const incx, T* y, I const incy, S const c, S const s, hipStream_t stream) +{ + auto nthreads = warpSize * 2; + auto nblocks = (n - 1) / nthreads + 1; + + hipLaunchKernelGGL((rot_kernel), dim3(nblocks, 1, 1), dim3(nthreads, 1, 1), 0, stream, + n, x, incx, y, incy, c, s); +} + +template +static void scal_template(I const n, S const da, T* const x, I const incx, hipStream_t stream) +{ + 
auto nthreads = warpSize * 2; + auto nblocks = (n - 1) / nthreads + 1; + + hipLaunchKernelGGL((scal_kernel), dim3(nblocks, 1, 1), dim3(nthreads, 1, 1), 0, stream, + n, da, x, incx); +} + +/** Call to lasr functionality. + lasr_body can be executed as a host or device function **/ +template +static void call_lasr(rocblas_side& side, + rocblas_pivot& pivot, + rocblas_direct& direct, + I& m, + I& n, + S& c, + S& s, + T& A, + I& lda) +{ + I const tid = 0; + I const i_inc = 1; + + lasr_body(side, pivot, direct, m, n, &c, &s, &A, lda, tid, i_inc); +} + +/************************************************************************************/ +/***************** Main template functions ******************************************/ +/************************************************************************************/ + +template +static void bdsqr_single_template(rocblas_handle handle, + char uplo, + I n, + I ncvt, + I nru, + I ncc, + S* d_, + S* e_, + T* vt_, + I ldvt, + T* u_, + I ldu, + T* c_, + I ldc, + S* work_, + I& info, + S* dwork_ = nullptr, + hipStream_t stream = 0) +{ + // ------------------------------------- + // Lambda expressions used as helpers + // ------------------------------------- + auto call_swap_gpu = [=](I n, T& x, I incx, T& y, I incy) { + swap_template(n, &x, incx, &y, incy, stream); + }; + + auto call_rot_gpu = [=](I n, T& x, I incx, T& y, I incy, S cosl, S sinl) { + rot_template(n, &x, incx, &y, incy, cosl, sinl, stream); + }; + + auto call_scal_gpu + = [=](I n, auto da, T& x, I incx) { scal_template(n, da, &x, incx, stream); }; + + auto call_lasr_gpu_nocopy + = [=](rocblas_side const side, rocblas_pivot const pivot, rocblas_direct const direct, + I const m, I const n, S& dc, S& ds, T& A, I const lda, hipStream_t stream) { + bool const is_left_side = (side == rocblas_side_left); + auto const mn = (is_left_side) ? 
m : n; + auto const mn_m1 = (mn - 1); + + rocsolver_lasr_template(handle, side, pivot, direct, m, n, &dc, 0, &ds, 0, &A, + 0, lda, 0, I(1)); + }; + + auto call_lasr_gpu = [=](rocblas_side const side, rocblas_pivot const pivot, + rocblas_direct const direct, I const m, I const n, S& c, S& s, T& A, + I const lda, S* const dwork_, hipStream_t stream) { + bool const is_left_side = (side == rocblas_side_left); + auto const mn = (is_left_side) ? m : n; + auto const mn_m1 = (mn - 1); + S* const dc = dwork_; + S* const ds = dwork_ + mn_m1; + CHECK_HIP(hipStreamSynchronize(stream)); + + CHECK_HIP(hipMemcpyAsync(dc, &c, sizeof(S) * mn_m1, hipMemcpyHostToDevice, stream)); + CHECK_HIP(hipMemcpyAsync(ds, &s, sizeof(S) * mn_m1, hipMemcpyHostToDevice, stream)); + + rocsolver_lasr_template(handle, side, pivot, direct, m, n, dc, 0, ds, 0, &A, 0, lda, + 0, I(1)); + CHECK_HIP(hipStreamSynchronize(stream)); + }; + + auto d = [=](auto i) -> S& { + assert((1 <= i) && (i <= n)); + return (d_[i - 1]); + }; + + auto e = [=](auto i) -> S& { + assert((1 <= i) && (i <= (n - 1))); + return (e_[i - 1]); + }; + + auto work = [=](auto i) -> S& { return (work_[i - 1]); }; + + auto dwork = [=](auto i) -> S& { return (dwork_[i - 1]); }; + + auto c = [=](auto i, auto j) -> T& { + assert((1 <= i) && (i <= nrc) && (nrc <= ldc)); + assert((1 <= j) && (j <= ncc)); + return (c_[idx2D(i - 1, j - 1, ldc)]); + }; + + auto u = [=](auto i, auto j) -> T& { + assert((1 <= i) && (i <= nru) && (nru <= ldu)); + assert((1 <= j) && (j <= ncu)); + return (u_[idx2D(i - 1, j - 1, ldu)]); + }; + + auto vt = [=](auto i, auto j) -> T& { + assert((1 <= i) && (i <= nrvt) && (nrvt <= ldvt)); + assert((1 <= j) && (j <= ncvt)); + return (vt_[idx2D(i - 1, j - 1, ldvt)]); + }; + + auto sign = [](auto a, auto b) { + auto const abs_a = std::abs(a); + return ((b >= 0) ? 
abs_a : -abs_a); + }; + + auto dble = [](auto x) { return (static_cast(x)); }; + // ------------------------------- + + // ---------------- + // Initialization + // ---------------- + bool const use_gpu = (dwork_ != nullptr); + + // Lapack code used O(n^2) algorithm for sorting + // Consider turning off this and rely on + // bdsqr_sort() to perform sorting + bool constexpr need_sort = false; + + S const zero = 0; + S const one = 1; + S negone = -1; + S const hndrd = 100; + S const hndrth = one / hndrd; + S const ten = 10; + S const eight = 8; + S const meight = -one / eight; + I const maxitr = 6; + I ione = 1; + + bool const lower = (uplo == 'L') || (uplo == 'l'); + bool const upper = (uplo == 'U') || (uplo == 'u'); + + //rotate is true if any singular vectors desired, false otherwise + bool const rotate = (ncvt > 0) || (nru > 0) || (ncc > 0); + + I i = 0, idir = 0, isub = 0, iter = 0, iterdivn = 0, j = 0, ll = 0, lll = 0, m = 0, + maxitdivn = 0, nm1 = 0, nm12 = 0, nm13 = 0, oldll = 0, oldm = 0; + + I const nrc = n; // number of rows in C matrix + I const nrvt = n; // number of rows in VT matrix + I const ncu = n; // number of columns in U matrix + + S abse = 0, abss = 0, cosl = 0, cosr = 0, cs = 0, eps = 0, f = 0, g = 0, h = 0, mu = 0, + oldcs = 0, oldsn = 0, r = 0, shift = 0, sigmn = 0, sigmx = 0, sinl = 0, sinr = 0, sll = 0, + smax = 0, smin = 0, sminl = 0, sminoa = 0, sn = 0, thresh = 0, tol = 0, tolmul = 0, unfl = 0; + + bool const need_update_singular_vectors = (nru > 0) || (ncc > 0); + bool constexpr use_lasr_gpu_nocopy = false; + + nm1 = n - 1; + nm12 = nm1 + nm1; + nm13 = nm12 + nm1; + idir = 0; + + //get machine constants + { + char cmach_eps = 'E'; + char cmach_unfl = 'S'; + call_lamch(cmach_eps, eps); + call_lamch(cmach_unfl, unfl); + } + + // ----------------------------------------- + // rotate to upper bidiagonal if necesarry + // ----------------------------------------- + if(lower) + { + for(i = 1; i <= (n - 1); i++) + { + call_lartg(d(i), e(i), 
cs, sn, r); + d(i) = r; + e(i) = sn * d(i + 1); + d(i + 1) = cs * d(i + 1); + work(i) = cs; + work(nm1 + i) = sn; + } + + // update singular vectors if desired + if(use_lasr_gpu_nocopy) + { + CHECK_HIP(hipStreamSynchronize(stream)); + + if(need_update_singular_vectors) + { + // copy rotations + size_t const nbytes = sizeof(S) * (n - 1); + hipMemcpyKind const kind = hipMemcpyHostToDevice; + + { + void* const src = (void*)&(work(1)); + void* const dst = (void*)&(dwork(1)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(n)); + void* const dst = (void*)&(dwork(n)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + } + CHECK_HIP(hipStreamSynchronize(stream)); + } + + if(nru > 0) + { + // call_lasr( 'r', 'v', 'f', nru, n, work( 1 ), work( n ), u, ldu ); + rocblas_side side = rocblas_side_right; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_forward_direction; + if(use_gpu) + { + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, nru, n, dwork(1), dwork(n), u(1, 1), + ldu, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, nru, n, work(1), work(n), u(1, 1), ldu, + dwork_, stream); + } + } + else + { + call_lasr(side, pivot, direct, nru, n, work(1), work(n), u(1, 1), ldu); + } + } + if(ncc > 0) + { + // call_lasr( 'l', 'v', 'f', n, ncc, work( 1 ), work( n ), c, ldc ); + rocblas_side side = rocblas_side_left; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_forward_direction; + if(use_gpu) + { + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, n, ncc, dwork(1), dwork(n), c(1, 1), + ldc, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, n, ncc, work(1), work(n), c(1, 1), ldc, + dwork_, stream); + } + } + else + { + call_lasr(side, pivot, direct, n, ncc, work(1), work(n), c(1, 1), ldc); + } + } + } + + // ------------------------------------------------------------- + // 
Compute singular values and vector to relative accuracy tol + // ------------------------------------------------------------- + tolmul = std::max(ten, std::min(hndrd, pow(eps, meight))); + tol = tolmul * eps; + + // compute approximate maximum, minimum singular values + smax = zero; + for(i = 1; i <= n; i++) + { + smax = std::max(smax, std::abs(d(i))); + } +L20: + for(i = 1; i <= (n - 1); i++) + { + smax = std::max(smax, std::abs(e(i))); + } + + // compute tolerance +L30: + sminl = zero; + if(tol >= zero) + { + // relative accuracy desired + sminoa = std::abs(d(1)); + if(sminoa == zero) + goto L50; + mu = sminoa; + for(i = 2; i <= n; i++) + { + mu = std::abs(d(i)) * (mu / (mu + std::abs(e(i - 1)))); + sminoa = std::min(sminoa, mu); + if(sminoa == zero) + goto L50; + } + L40: + L50: + sminoa = sminoa / std::sqrt(dble(n)); + thresh = std::max(tol * sminoa, ((unfl * n) * n) * maxitr); + } + else + { + //absolute accuracy desired + thresh = std::max(std::abs(tol) * smax, ((unfl * n) * n) * maxitr); + } + + /** prepare for main iteration loop for the singular values + (maxit is the maximum number of passes through the inner + loop permitted before nonconvergence signalled.) 
**/ + maxitdivn = maxitr * n; + iterdivn = 0; + iter = -1; + oldll = -1; + oldm = -1; + m = n; + + /////////////////////////// + /// MAIN ITERATION LOOP /// + /////////////////////////// +L60: + // check for convergence or exceeding iteration count + if(m <= 1) + goto L160; + + if(iter >= n) + { + iter = iter - n; + iterdivn = iterdivn + 1; + if(iterdivn >= maxitdivn) + goto L200; + } + + // find diagonal block of matrix to work on + if(tol < zero && std::abs(d(m)) <= thresh) + d(m) = zero; + + smax = std::abs(d(m)); + smin = smax; + for(lll = 1; lll <= (m - 1); lll++) + { + ll = m - lll; + abss = std::abs(d(ll)); + abse = std::abs(e(ll)); + if(tol < zero && abss <= thresh) + d(ll) = zero; + if(abse <= thresh) + goto L80; + smin = std::min(smin, abss); + smax = std::max(smax, std::max(abss, abse)); + } + +L70: + ll = 0; + goto L90; + +L80: + e(ll) = zero; + // matrix splits since e(ll) = 0 + if(ll == m - 1) + { + // convergence of bottom singular value, return to top of loop + m = m - 1; + goto L60; + } + +L90: + ll = ll + 1; + // e(ll) through e(m-1) are nonzero, e(ll-1) is zero + if(ll == m - 1) + { + // 2 by 2 block, handle separately + call_lasv2(d(m - 1), e(m - 1), d(m), sigmn, sigmx, sinr, cosr, sinl, cosl); + d(m - 1) = sigmx; + e(m - 1) = zero; + d(m) = sigmn; + + // compute singular vectors, if desired + if(ncvt > 0) + { + if(use_gpu) + { + call_rot_gpu(ncvt, vt(m - 1, 1), ldvt, vt(m, 1), ldvt, cosr, sinr); + } + else + { + call_rot(ncvt, vt(m - 1, 1), ldvt, vt(m, 1), ldvt, cosr, sinr); + } + } + if(nru > 0) + { + if(use_gpu) + { + call_rot_gpu(nru, u(1, m - 1), ione, u(1, m), ione, cosl, sinl); + } + else + { + call_rot(nru, u(1, m - 1), ione, u(1, m), ione, cosl, sinl); + } + } + if(ncc > 0) + { + if(use_gpu) + { + call_rot_gpu(ncc, c(m - 1, 1), ldc, c(m, 1), ldc, cosl, sinl); + } + else + { + call_rot(ncc, c(m - 1, 1), ldc, c(m, 1), ldc, cosl, sinl); + } + } + m = m - 2; + goto L60; + } + + /** if working on new submatrix, choose shift direction + (from 
larger end diagonal element towards smaller) **/ + if(ll > oldm || m < oldll) + { + if(std::abs(d(ll)) >= std::abs(d(m))) + { + // chase bulge from top (big end) to bottom (small end) + idir = 1; + } + else + { + // chase bulge from bottom (big end) to top (small end) + idir = 2; + } + } + + // apply convergence test + if(idir == 1) + { + // run convergence test in forward direction + // first apply standard test to bottom of matrix + if(std::abs(e(m - 1)) <= std::abs(tol) * std::abs(d(m)) + || (tol < zero && std::abs(e(m - 1)) <= thresh)) + { + e(m - 1) = zero; + goto L60; + } + + if(tol >= zero) + { + // if relative accuracy desired, + // apply convergence criterion forward + mu = std::abs(d(ll)); + sminl = mu; + for(lll = ll; lll <= (m - 1); lll++) + { + if(std::abs(e(lll)) <= tol * mu) + { + e(lll) = zero; + goto L60; + } + mu = std::abs(d(lll + 1)) * (mu / (mu + std::abs(e(lll)))); + sminl = std::min(sminl, mu); + } + } + } + else + { + // run convergence test in backward direction + // first apply standard test to top of matrix + if(std::abs(e(ll)) <= std::abs(tol) * std::abs(d(ll)) + || (tol < zero && std::abs(e(ll)) <= thresh)) + { + e(ll) = zero; + goto L60; + } + + if(tol >= zero) + { + // if relative accuracy desired, + // apply convergence criterion backward + mu = std::abs(d(m)); + sminl = mu; + for(lll = (m - 1); lll >= ll; lll--) + { + if(std::abs(e(lll)) <= tol * mu) + { + e(lll) = zero; + goto L60; + } + mu = std::abs(d(lll)) * (mu / (mu + std::abs(e(lll)))); + sminl = std::min(sminl, mu); + } + } + } + + /** compute shift. 
first, test if shifting would ruin relative + accuracy, and if so set the shift to zero **/ + oldll = ll; + oldm = m; + if(tol >= zero && n * tol * (sminl / smax) <= std::max(eps, hndrth * tol)) + { + //use a zero shift to avoid loss of relative accuracy + shift = zero; + } + else + { + // compute the shift from 2-by-2 block at end of matrix + if(idir == 1) + { + sll = std::abs(d(ll)); + call_las2(d(m - 1), e(m - 1), d(m), shift, r); + } + else + { + sll = std::abs(d(m)); + call_las2(d(ll), e(ll), d(ll + 1), shift, r); + } + // test if shift negligible, and if so set to zero + if(sll > zero) + { + if((shift / sll) * (shift / sll) < eps) + shift = zero; + } + } + + // increment iteration count + iter = iter + m - ll; + + // if shift = 0, do simplified qr iteration + if(shift == zero) + { + if(idir == 1) + { + // chase bulge from top to bottom + // save cosines and sines for later singular vector updates + cs = one; + oldcs = one; + for(i = ll; i <= (m - 1); i++) + { + auto di_cs = d(i) * cs; + call_lartg(di_cs, e(i), cs, sn, r); + if(i > ll) + e(i - 1) = oldsn * r; + auto oldcs_r = oldcs * r; + auto dip1_sn = d(i + 1) * sn; + call_lartg(oldcs_r, dip1_sn, oldcs, oldsn, d(i)); + work(i - ll + 1) = cs; + work(i - ll + 1 + nm1) = sn; + work(i - ll + 1 + nm12) = oldcs; + work(i - ll + 1 + nm13) = oldsn; + } + + L120: + h = d(m) * cs; + d(m) = h * oldcs; + e(m - 1) = h * oldsn; + + // update singular vectors + if(use_lasr_gpu_nocopy) + { + CHECK_HIP(hipStreamSynchronize(stream)); + + if(rotate) + { + // copy rotations + size_t const nbytes = sizeof(S) * (n - 1); + hipMemcpyKind const kind = hipMemcpyHostToDevice; + + if(ncvt > 0) + { + { + void* const src = (void*)&(work(1)); + void* const dst = (void*)&(dwork(1)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(n)); + void* const dst = (void*)&(dwork(n)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + } + + if((nru > 0) || (ncc > 0)) + { + { + 
void* const src = (void*)&(work(nm12)); + void* const dst = (void*)&(dwork(nm12)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(nm13)); + void* const dst = (void*)&(dwork(nm13)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + } + } + + CHECK_HIP(hipStreamSynchronize(stream)); + } + + if(ncvt > 0) + { + // call_lasr( 'l', 'v', 'f', m-ll+1, ncvt, work( 1 ), work( n ), vt(ll, 1 ), ldvt ) + rocblas_side side = rocblas_side_left; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_forward_direction; + auto mm = m - ll + 1; + if(use_gpu) + { + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, mm, ncvt, dwork(1), dwork(n), + vt(ll, 1), ldvt, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, mm, ncvt, work(1), work(n), vt(ll, 1), + ldvt, dwork_, stream); + } + } + else + { + call_lasr(side, pivot, direct, mm, ncvt, work(1), work(n), vt(ll, 1), ldvt); + } + } + if(nru > 0) + { + // call_lasr( 'r', 'v', 'f', nru, m-ll+1, work( nm12+1 ), work( nm13+1 ), u( 1, ll ), ldu ) + rocblas_side side = rocblas_side_right; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_forward_direction; + auto mm = m - ll + 1; + if(use_gpu) + { + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, nru, mm, dwork(nm12 + 1), + dwork(nm13 + 1), u(1, ll), ldu, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, nru, mm, work(nm12 + 1), work(nm13 + 1), + u(1, ll), ldu, dwork_, stream); + } + } + else + { + call_lasr(side, pivot, direct, nru, mm, work(nm12 + 1), work(nm13 + 1), + u(1, ll), ldu); + } + } + if(ncc > 0) + { + // call_lasr( 'l', 'v', 'f', m-ll+1, ncc, work( nm12+1 ), work( nm13+1 ), c( ll, 1 ), ldc ) + rocblas_side side = rocblas_side_left; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_forward_direction; + auto mm = m - ll + 1; + if(use_gpu) + { + 
if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, mm, ncc, dwork(nm12 + 1), + dwork(nm13 + 1), c(ll, 1), ldc, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, mm, ncc, work(nm12 + 1), work(nm13 + 1), + c(ll, 1), ldc, dwork_, stream); + } + } + else + { + call_lasr(side, pivot, direct, mm, ncc, work(nm12 + 1), work(nm13 + 1), + c(ll, 1), ldc); + } + } + + // test convergence + if(std::abs(e(m - 1)) <= thresh) + e(m - 1) = zero; + } + else + { + // chase bulge from bottom to top + // save cosines and sines for later singular vector updates + cs = one; + oldcs = one; + for(i = m; i >= (ll + 1); i--) + { + auto di_cs = d(i) * cs; + call_lartg(di_cs, e(i - 1), cs, sn, r); + + if(i < m) + e(i) = oldsn * r; + + auto oldcs_r = oldcs * r; + auto dim1_sn = d(i - 1) * sn; + call_lartg(oldcs_r, dim1_sn, oldcs, oldsn, d(i)); + + work(i - ll) = cs; + work(i - ll + nm1) = -sn; + work(i - ll + nm12) = oldcs; + work(i - ll + nm13) = -oldsn; + } + + L130: + h = d(ll) * cs; + d(ll) = h * oldcs; + e(ll) = h * oldsn; + + // update singular vectors + if(use_lasr_gpu_nocopy) + { + CHECK_HIP(hipStreamSynchronize(stream)); + + if(rotate) + { + // copy rotations + size_t const nbytes = sizeof(S) * (n - 1); + hipMemcpyKind const kind = hipMemcpyHostToDevice; + + if((nru > 0) || (ncc > 0)) + { + { + void* const src = (void*)&(work(1)); + void* const dst = (void*)&(dwork(1)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(n)); + void* const dst = (void*)&(dwork(n)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + } + + if(ncvt > 0) + { + { + void* const src = (void*)&(work(nm12)); + void* const dst = (void*)&(dwork(nm12)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(nm13)); + void* const dst = (void*)&(dwork(nm13)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + } + } + + 
CHECK_HIP(hipStreamSynchronize(stream)); + } + + if(ncvt > 0) + { + // call_lasr( 'l', 'v', 'b', m-ll+1, ncvt, work( nm12+1 ), work( nm13+1 ), vt( ll, 1 ), ldvt ); + rocblas_side side = rocblas_side_left; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_backward_direction; + auto mm = m - ll + 1; + if(use_gpu) + { + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, mm, ncvt, dwork(nm12 + 1), + dwork(nm13 + 1), vt(ll, 1), ldvt, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, mm, ncvt, work(nm12 + 1), work(nm13 + 1), + vt(ll, 1), ldvt, dwork_, stream); + } + } + else + { + call_lasr(side, pivot, direct, mm, ncvt, work(nm12 + 1), work(nm13 + 1), + vt(ll, 1), ldvt); + } + } + if(nru > 0) + { + // call_lasr( 'r', 'v', 'b', nru, m-ll+1, work( 1 ), work( n ), u( 1, ll ), ldu ) + rocblas_side side = rocblas_side_right; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_backward_direction; + auto mm = m - ll + 1; + if(use_gpu) + { + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, nru, mm, dwork(1), dwork(n), + u(1, ll), ldu, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, nru, mm, work(1), work(n), u(1, ll), ldu, + dwork_, stream); + } + } + else + { + call_lasr(side, pivot, direct, nru, mm, work(1), work(n), u(1, ll), ldu); + } + } + if(ncc > 0) + { + // call_lasr( 'l', 'v', 'b', m-ll+1, ncc, work( 1 ), work( n ), c( ll, 1 ), ldc ) + rocblas_side side = rocblas_side_left; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_backward_direction; + auto mm = m - ll + 1; + if(use_gpu) + { + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, mm, ncc, dwork(1), dwork(n), + c(ll, 1), ldc, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), ldc, + dwork_, stream); + } + } + else + { + call_lasr(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), 
ldc); + } + } + + // test convergence + if(std::abs(e(ll)) <= thresh) + e(ll) = zero; + } + } + + // otherwise use nonzero shift + else + { + if(idir == 1) + { + // chase bulge from top to bottom + // save cosines and sines for later singular vector updates + f = (std::abs(d(ll)) - shift) * (sign(one, d(ll)) + shift / d(ll)); + g = e(ll); + for(i = ll; i <= (m - 1); i++) + { + call_lartg(f, g, cosr, sinr, r); + if(i > ll) + e(i - 1) = r; + f = cosr * d(i) + sinr * e(i); + e(i) = cosr * e(i) - sinr * d(i); + g = sinr * d(i + 1); + d(i + 1) = cosr * d(i + 1); + call_lartg(f, g, cosl, sinl, r); + d(i) = r; + f = cosl * e(i) + sinl * d(i + 1); + d(i + 1) = cosl * d(i + 1) - sinl * e(i); + if(i < m - 1) + { + g = sinl * e(i + 1); + e(i + 1) = cosl * e(i + 1); + } + work(i - ll + 1) = cosr; + work(i - ll + 1 + nm1) = sinr; + work(i - ll + 1 + nm12) = cosl; + work(i - ll + 1 + nm13) = sinl; + } + + L140: + e(m - 1) = f; + + // update singular vectors + if(use_lasr_gpu_nocopy) + { + CHECK_HIP(hipStreamSynchronize(stream)); + + if(rotate) + { + // copy rotations + size_t const nbytes = sizeof(S) * (n - 1); + hipMemcpyKind const kind = hipMemcpyHostToDevice; + + if(ncvt > 0) + { + { + void* const src = (void*)&(work(1)); + void* const dst = (void*)&(dwork(1)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(n)); + void* const dst = (void*)&(dwork(n)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + } + + if((nru > 0) || (ncc > 0)) + { + { + void* const src = (void*)&(work(nm12)); + void* const dst = (void*)&(dwork(nm12)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(nm13)); + void* const dst = (void*)&(dwork(nm13)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + } + } + + CHECK_HIP(hipStreamSynchronize(stream)); + } + + if(ncvt > 0) + { + // call_lasr( 'l', 'v', 'f', m-ll+1, ncvt, work( 1 ), work( n ), vt(ll, 1 ), ldvt ) 
+ rocblas_side side = rocblas_side_left; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_forward_direction; + auto mm = m - ll + 1; + if(use_gpu) + { + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, mm, ncvt, dwork(1), dwork(n), + vt(ll, 1), ldvt, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, mm, ncvt, work(1), work(n), vt(ll, 1), + ldvt, dwork_, stream); + } + } + else + { + call_lasr(side, pivot, direct, mm, ncvt, work(1), work(n), vt(ll, 1), ldvt); + } + } + + if(nru > 0) + { + // call_lasr( 'r', 'v', 'f', nru, m-ll+1, work( nm12+1 ), work( nm13+1 ), u( 1, ll ), ldu ) + rocblas_side side = rocblas_side_right; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_forward_direction; + auto mm = m - ll + 1; + if(use_gpu) + { + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, nru, mm, dwork(nm12 + 1), + dwork(nm13 + 1), u(1, ll), ldu, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, nru, mm, work(nm12 + 1), work(nm13 + 1), + u(1, ll), ldu, dwork_, stream); + } + } + else + { + call_lasr(side, pivot, direct, nru, mm, work(nm12 + 1), work(nm13 + 1), + u(1, ll), ldu); + } + } + if(ncc > 0) + { + // call_lasr( 'l', 'v', 'f', m-ll+1, ncc, work( nm12+1 ), work( nm13+1 ), c( ll, 1 ), ldc ) + rocblas_side side = rocblas_side_left; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_forward_direction; + auto mm = m - ll + 1; + if(use_gpu) + { + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, mm, ncc, dwork(nm12 + 1), + dwork(nm13 + 1), c(ll, 1), ldc, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, mm, ncc, work(nm12 + 1), work(nm13 + 1), + c(ll, 1), ldc, dwork_, stream); + } + } + else + { + call_lasr(side, pivot, direct, mm, ncc, work(nm12 + 1), work(nm13 + 1), + c(ll, 1), ldc); + } + } + + // test convergence + if(std::abs(e(m - 1)) <= thresh) + e(m - 1) = zero; + } + else 
+ { + // chase bulge from bottom to top + // save cosines and sines for later singular vector updates + f = (std::abs(d(m)) - shift) * (sign(one, d(m)) + shift / d(m)); + g = e(m - 1); + for(i = m; i >= (ll + 1); i--) + { + call_lartg(f, g, cosr, sinr, r); + if(i < m) + e(i) = r; + f = cosr * d(i) + sinr * e(i - 1); + e(i - 1) = cosr * e(i - 1) - sinr * d(i); + g = sinr * d(i - 1); + d(i - 1) = cosr * d(i - 1); + call_lartg(f, g, cosl, sinl, r); + d(i) = r; + f = cosl * e(i - 1) + sinl * d(i - 1); + d(i - 1) = cosl * d(i - 1) - sinl * e(i - 1); + if(i > ll + 1) + { + g = sinl * e(i - 2); + e(i - 2) = cosl * e(i - 2); + } + work(i - ll) = cosr; + work(i - ll + nm1) = -sinr; + work(i - ll + nm12) = cosl; + work(i - ll + nm13) = -sinl; + } + + L150: + e(ll) = f; + + // test convergence + if(std::abs(e(ll)) <= thresh) + e(ll) = zero; + + // update singular vectors + if(use_lasr_gpu_nocopy) + { + CHECK_HIP(hipStreamSynchronize(stream)); + + if(rotate) + { + // copy rotations + size_t const nbytes = sizeof(S) * (n - 1); + hipMemcpyKind const kind = hipMemcpyHostToDevice; + + if((nru > 0) || (ncc > 0)) + { + { + void* const src = (void*)&(work(1)); + void* const dst = (void*)&(dwork(1)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(n)); + void* const dst = (void*)&(dwork(n)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + } + + if(ncvt > 0) + { + { + void* const src = (void*)&(work(nm12)); + void* const dst = (void*)&(dwork(nm12)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(nm13)); + void* const dst = (void*)&(dwork(nm13)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + } + } + + CHECK_HIP(hipStreamSynchronize(stream)); + } + + if(ncvt > 0) + { + // call_lasr( 'l', 'v', 'b', m-ll+1, ncvt, work( nm12+1 ), work(nm13+1), vt( ll, 1 ), ldvt ) + rocblas_side side = rocblas_side_left; + rocblas_pivot pivot = 
rocblas_pivot_variable; + rocblas_direct direct = rocblas_backward_direction; + auto mm = m - ll + 1; + if(use_gpu) + { + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, mm, ncvt, dwork(nm12 + 1), + dwork(nm13 + 1), vt(ll, 1), ldvt, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, mm, ncvt, work(nm12 + 1), work(nm13 + 1), + vt(ll, 1), ldvt, dwork_, stream); + } + } + else + { + call_lasr(side, pivot, direct, mm, ncvt, work(nm12 + 1), work(nm13 + 1), + vt(ll, 1), ldvt); + } + } + if(nru > 0) + { + // call_lasr( 'r', 'v', 'b', nru, m-ll+1, work( 1 ), work( n ), u( 1, ll ), ldu ) + rocblas_side side = rocblas_side_right; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_backward_direction; + auto mm = m - ll + 1; + if(use_gpu) + { + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, nru, mm, dwork(1), dwork(n), + u(1, ll), ldu, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, nru, mm, work(1), work(n), u(1, ll), ldu, + dwork_, stream); + } + } + else + { + call_lasr(side, pivot, direct, nru, mm, work(1), work(n), u(1, ll), ldu); + } + } + if(ncc > 0) + { + // call_lasr( 'l', 'v', 'b', m-ll+1, ncc, work( 1 ), work( n ), c( ll, 1 ), ldc ) + rocblas_side side = rocblas_side_left; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_backward_direction; + auto mm = m - ll + 1; + if(use_gpu) + { + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, mm, ncc, dwork(1), dwork(n), + c(ll, 1), ldc, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), ldc, + dwork_, stream); + } + } + else + { + call_lasr(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), ldc); + } + } + } + } + CHECK_HIP(hipStreamSynchronize(stream)); + + // qr iteration finished, go back and check convergence + goto L60; + +L160: + // all singular values converged, so make them positive + for(i = 1; i <= n; i++) 
+ { + if(d(i) < zero) + { + d(i) = -d(i); + if(ncvt > 0) + { + if(use_gpu) + { + call_scal_gpu(ncvt, negone, vt(i, 1), ldvt); + } + else + { + call_scal(ncvt, negone, vt(i, 1), ldvt); + } + } + } + } + +L170: + // sort the singular values into decreasing order (insertion sort on + // singular values, but only one transposition per singular vector) + if(need_sort) + { + for(i = 1; i <= (n - 1); i++) + { + // scan for smallest d(i) + isub = 1; + smin = d(1); + for(j = 2; j <= (n + 1 - i); j++) + { + if(d(j) <= smin) + { + isub = j; + smin = d(j); + } + } + L180: + if(isub != n + 1 - i) + { + // swap singular values and vectors + d(isub) = d(n + 1 - i); + d(n + 1 - i) = smin; + if(ncvt > 0) + { + if(use_gpu) + { + call_swap_gpu(ncvt, vt(isub, 1), ldvt, vt(n + 1 - i, 1), ldvt); + } + else + { + call_swap(ncvt, vt(isub, 1), ldvt, vt(n + 1 - i, 1), ldvt); + } + } + if(nru > 0) + { + if(use_gpu) + { + call_swap_gpu(nru, u(1, isub), ione, u(1, n + 1 - i), ione); + } + else + { + call_swap(nru, u(1, isub), ione, u(1, n + 1 - i), ione); + } + } + if(ncc > 0) + { + if(use_gpu) + { + call_swap_gpu(ncc, c(isub, 1), ldc, c(n + 1 - i, 1), ldc); + } + else + { + call_swap(ncc, c(isub, 1), ldc, c(n + 1 - i, 1), ldc); + } + } + } + } + } + +L190: + goto L220; + +// maximum number of iterations exceeded, failure to converge +L200: + info = 0; + for(i = 1; i <= (n - 1); i++) + { + if(e(i) != zero) + info = info + 1; + } + +L210: +L220: + return; +} + +template +rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, + const rocblas_fill uplo_in, + const I n, + const I nv, + const I nu, + const I nc, + S* D, + const rocblas_stride strideD, + S* E, + const rocblas_stride strideE, + W1 V_arg, + const I shiftV, + const I ldv, + const rocblas_stride strideV, + W2 U_arg, + const I shiftU, + const I ldu, + const rocblas_stride strideU, + W3 C_arg, + const I shiftC, + const I ldc, + const rocblas_stride strideC, + I* info_array, + const I batch_count, + I* splits_map, + S* 
work) +{ + // ------------------------------- + // lambda expression as helper + // ------------------------------- + auto is_device_pointer = [](void* ptr) -> bool { + hipPointerAttribute_t dev_attributes; + if(ptr == nullptr) + { + return (false); + } + + auto istat = hipPointerGetAttributes(&dev_attributes, ptr); + if(istat != hipSuccess) + { + std::cout << "is_device_pointer: istat = " << istat << " " << hipGetErrorName(istat) + << std::endl; + } + assert(istat == hipSuccess); + return (dev_attributes.type == hipMemoryTypeDevice); + }; + + // ------------------------- + // copy D into hD, E into hE + // ------------------------- + hipStream_t stream; + rocblas_get_stream(handle, &stream); + + W1 V = V_arg; + W2 U = U_arg; + W3 C = C_arg; + + // handle batch case with array of pointers on device + std::vector Vp_array(batch_count); + std::vector Up_array(batch_count); + std::vector Cp_array(batch_count); + + if(nv > 0) + { + bool const is_device_V_arg = is_device_pointer((void*)V_arg); + if(is_device_V_arg) + { + // note "T *" and "T * const" may be considered different types + bool constexpr is_array_of_device_pointers + = !(std::is_same::value || std::is_same::value); + bool constexpr need_copy_W1 = is_array_of_device_pointers; + if constexpr(need_copy_W1) + { + size_t const nbytes = sizeof(T*) * batch_count; + void* const dst = (void*)&(Vp_array[0]); + void* const src = (void*)V_arg; + HIP_CHECK(hipMemcpyAsync(dst, src, nbytes, hipMemcpyDefault, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + V = &(Vp_array[0]); + } + } + } + + if(nu > 0) + { + bool const is_device_U_arg = is_device_pointer((void*)U_arg); + if(is_device_U_arg) + { + bool constexpr is_array_of_device_pointers + = !(std::is_same::value || std::is_same::value); + bool constexpr need_copy_W2 = is_array_of_device_pointers; + if constexpr(need_copy_W2) + { + size_t const nbytes = sizeof(T*) * batch_count; + void* const dst = (void*)&(Up_array[0]); + void* const src = (void*)U_arg; + 
HIP_CHECK(hipMemcpyAsync(dst, src, nbytes, hipMemcpyDefault, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + U = &(Up_array[0]); + } + } + } + + if(nc > 0) + { + bool const is_device_C_arg = is_device_pointer((void*)C_arg); + if(is_device_C_arg) + { + bool constexpr is_array_of_device_pointers + = !(std::is_same::value || std::is_same::value); + bool constexpr need_copy_W3 = is_array_of_device_pointers; + if constexpr(need_copy_W3) + { + size_t const nbytes = sizeof(T*) * batch_count; + void* const dst = (void*)&(Cp_array[0]); + void* const src = (void*)C_arg; + HIP_CHECK(hipMemcpyAsync(dst, src, nbytes, hipMemcpyDefault, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + C = &(Cp_array[0]); + } + } + } + + S* hD = nullptr; + S* hE = nullptr; + + size_t const E_size = (n - 1); + HIP_CHECK(hipHostMalloc(&hD, (sizeof(S) * std::max(1, batch_count)) * n)); + HIP_CHECK(hipHostMalloc(&hE, (sizeof(S) * std::max(1, batch_count)) * E_size)); + + // ---------------------------------------------------- + // copy info_array[] on device to linfo_array[] on host + // ---------------------------------------------------- + I* linfo_array = nullptr; + HIP_CHECK(hipHostMalloc(&linfo_array, sizeof(I) * std::max(1, batch_count))); + { + void* const dst = &(linfo_array[0]); + void* const src = &(info_array[0]); + size_t const nbytes = sizeof(I) * batch_count; + hipMemcpyKind const kind = hipMemcpyDeviceToHost; + + HIP_CHECK(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + S* hwork = nullptr; + HIP_CHECK(hipHostMalloc(&hwork, sizeof(S) * (4 * n))); + + // ------------------------------------------------- + // transfer arrays D(:) and E(:) from Device to Host + // ------------------------------------------------- + bool const use_single_copy_for_D = (batch_count == 1) || (strideD == n); + if(use_single_copy_for_D) + { + void* const dst = (void*)&(hD[0]); + void* const src = (void*)&(D[0]); + size_t const sizeBytes = sizeof(S) * n * batch_count; + hipMemcpyKind const 
kind = hipMemcpyDeviceToHost; + + HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); + } + else + { + for(I bid = 0; bid < batch_count; bid++) + { + void* const dst = (void*)&(hD[bid * n]); + void* const src = (void*)&(D[bid * strideD]); + size_t const sizeBytes = sizeof(S) * n; + hipMemcpyKind const kind = hipMemcpyDeviceToHost; + HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); + } + } + + { + for(I bid = 0; bid < batch_count; bid++) + { + void* const dst = (void*)&(hE[bid * E_size]); + void* const src = (void*)&(E[bid * strideE]); + size_t const sizeBytes = sizeof(S) * E_size; + hipMemcpyKind const kind = hipMemcpyDeviceToHost; + HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); + } + } + + // ------------------------------------------------- + // Execute for each instance in the batch + // ------------------------------------------------- + HIP_CHECK(hipStreamSynchronize(stream)); + + S* dwork_ = nullptr; + HIP_CHECK(hipMalloc(&dwork_, sizeof(S) * (4 * n))); + + for(I bid = 0; bid < batch_count; bid++) + { + if(linfo_array[bid] != 0) + { + continue; + } + + char uplo = (uplo_in == rocblas_fill_lower) ? 'L' : 'U'; + S* d_ = &(hD[bid * n]); + S* e_ = &(hE[bid * E_size]); + + T* v_ = (nv > 0) ? load_ptr_batch(V, bid, shiftV, strideV) : nullptr; + T* u_ = (nu > 0) ? load_ptr_batch(U, bid, shiftU, strideU) : nullptr; + T* c_ = (nc > 0) ? 
load_ptr_batch(C, bid, shiftC, strideC) : nullptr; + S* work_ = &(hwork[0]); + + I info = 0; + + I nru = nu; + I ncc = nc; + + // NOTE: lapack dbdsqr() accepts "VT" and "ldvt" for transpose of V + // as input variable however, rocsolver bdsqr() accepts variable called "V" and "ldv" + // but it is actually holding "VT" + T* vt_ = v_; + I ldvt = ldv; + + I nrv = n; + I ncvt = nv; + bool const values_only = (ncvt == 0) && (nru == 0) && (ncc == 0); + + bdsqr_single_template(handle, uplo, n, ncvt, nru, ncc, d_, e_, vt_, ldvt, u_, ldu, + c_, ldc, work_, info, dwork_, stream); + + if(info == 0) + { + // explicitly zero out "E" array + // to be compatible with rocsolver bdsqr + S const zero = S(0); + for(I i = 0; i < (n - 1); i++) + { + e_[i] = zero; + } + } + + if(linfo_array[bid] == 0) + { + linfo_array[bid] = info; + } + } // end for bid + + // ------------------------------------------------- + // transfer arrays D(:) and E(:) from host to device + // ------------------------------------------------- + if(use_single_copy_for_D) + { + void* const src = (void*)&(hD[0]); + void* const dst = (void*)D; + size_t const sizeBytes = sizeof(S) * n * batch_count; + hipMemcpyKind const kind = hipMemcpyHostToDevice; + + HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); + } + else + { + for(I bid = 0; bid < batch_count; bid++) + { + void* const src = (void*)&(hD[bid * n]); + void* const dst = (void*)(D + bid * strideD); + size_t const sizeBytes = sizeof(S) * n; + hipMemcpyKind const kind = hipMemcpyHostToDevice; + HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); + } + } + + { + for(I bid = 0; bid < batch_count; bid++) + { + void* const src = (void*)&(hE[bid * E_size]); + void* const dst = (void*)&(E[bid * strideE]); + size_t const sizeBytes = sizeof(S) * E_size; + hipMemcpyKind const kind = hipMemcpyHostToDevice; + HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); + } + } + + // ------------------------------------------------------ + // copy 
linfo_array[] from host to info_array[] on device + // ------------------------------------------------------ + { + void* const src = (void*)&(linfo_array[0]); + void* const dst = (void*)&(info_array[0]); + size_t const nbytes = sizeof(I) * batch_count; + hipMemcpyKind const kind = hipMemcpyHostToDevice; + + HIP_CHECK(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + HIP_CHECK(hipStreamSynchronize(stream)); + + // ---------------------- + // free allocated storage + // ---------------------- + HIP_CHECK(hipHostFree(hwork)); + hwork = nullptr; + HIP_CHECK(hipHostFree(hD)); + hD = nullptr; + HIP_CHECK(hipHostFree(hE)); + hE = nullptr; + HIP_CHECK(hipFree(dwork_)); + dwork_ = nullptr; + HIP_CHECK(hipHostFree(linfo_array)); + linfo_array = nullptr; + + return (rocblas_status_success); +} + +ROCSOLVER_END_NAMESPACE +#undef LASR_MAX_NTHREADS +#undef CHECK_HIP diff --git a/library/src/common/rocsolver_handle.cpp b/library/src/common/rocsolver_handle.cpp new file mode 100644 index 000000000..949d60fd2 --- /dev/null +++ b/library/src/common/rocsolver_handle.cpp @@ -0,0 +1,132 @@ +/* ************************************************************************** + * Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * *************************************************************************/ + +#include "rocsolver_handle.hpp" +#include "rocblas.hpp" + +#include + +ROCSOLVER_BEGIN_NAMESPACE + +rocblas_status rocsolver_set_alg_mode_impl(rocblas_handle handle, + const rocsolver_function func, + const rocsolver_alg_mode mode) +{ + if(!handle) + return rocblas_status_invalid_handle; + + std::shared_ptr handle_ptr; + ROCBLAS_CHECK(rocblas_internal_get_data_ptr(handle, handle_ptr)); + rocsolver_handle_data handle_data = (rocsolver_handle_data)handle_ptr.get(); + + if(handle_data == nullptr) + { + handle_ptr = std::make_shared(); + handle_data = (rocsolver_handle_data)handle_ptr.get(); + handle_data->checksum = sizeof(rocsolver_handle_data_); + + ROCBLAS_CHECK(rocblas_internal_set_data_ptr(handle, handle_ptr)); + } + else + { + if(handle_data->checksum != sizeof(rocsolver_handle_data_)) + return rocblas_status_internal_error; + } + + switch(func) + { + case rocsolver_function_gesvd: + case rocsolver_function_bdsqr: + if(mode == rocsolver_alg_mode_gpu || mode == rocsolver_alg_mode_hybrid) + { + handle_data->bdsqr_mode = mode; + return rocblas_status_success; + } + } + + return rocblas_status_invalid_value; +} + 
+rocblas_status rocsolver_get_alg_mode_impl(rocblas_handle handle, + const rocsolver_function func, + rocsolver_alg_mode* mode) +{ + if(!handle) + return rocblas_status_invalid_handle; + + std::shared_ptr handle_ptr; + ROCBLAS_CHECK(rocblas_internal_get_data_ptr(handle, handle_ptr)); + rocsolver_handle_data handle_data = (rocsolver_handle_data)handle_ptr.get(); + + if(handle_data == nullptr) + { + *mode = rocsolver_alg_mode_gpu; + } + else + { + if(handle_data->checksum != sizeof(rocsolver_handle_data_)) + return rocblas_status_internal_error; + + switch(func) + { + case rocsolver_function_gesvd: + case rocsolver_function_bdsqr: *mode = handle_data->bdsqr_mode; break; + default: return rocblas_status_invalid_value; + } + } + + return rocblas_status_success; +} + +ROCSOLVER_END_NAMESPACE + +extern "C" { + +rocblas_status rocsolver_set_alg_mode(rocblas_handle handle, + const rocsolver_function func, + const rocsolver_alg_mode mode) +try +{ + return rocsolver::rocsolver_set_alg_mode_impl(handle, func, mode); +} +catch(...) +{ + return rocsolver::exception_to_rocblas_status(); +} + +rocblas_status rocsolver_get_alg_mode(rocblas_handle handle, + const rocsolver_function func, + rocsolver_alg_mode* mode) +try +{ + return rocsolver::rocsolver_get_alg_mode_impl(handle, func, mode); +} +catch(...) +{ + return rocsolver::exception_to_rocblas_status(); +} +} diff --git a/library/src/include/lapack_device_functions.hpp b/library/src/include/lapack_device_functions.hpp index fe0979246..6e4760c44 100644 --- a/library/src/include/lapack_device_functions.hpp +++ b/library/src/include/lapack_device_functions.hpp @@ -1111,4 +1111,81 @@ ROCSOLVER_KERNEL void axpy_kernel(const rocblas_int n, } } +/** ROT applies a Givens rotation between to vector x y of dimension n. + Launch this kernel with a desired number of threads organized in + NG groups in the x direction with NT threads in the x direction. 
**/ +template +ROCSOLVER_KERNEL void + rot_kernel(I const n, T* const x, I const incx, T* const y, I const incy, S const c, S const s) +{ + if(n <= 0) + return; + + I const i_start = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + I const i_inc = hipBlockDim_x * hipGridDim_x; + + if((incx == 1) && (incy == 1)) + { + // ------------ + // special case + // ------------ + for(I i = i_start; i < n; i += i_inc) + { + auto const temp = c * x[i] + s * y[i]; + y[i] = c * y[i] - s * x[i]; + x[i] = temp; + } + } + else + { + // --------------------------- + // code for unequal increments + // --------------------------- + + for(auto i = i_start; i < n; i += i_inc) + { + auto const ix = i * static_cast(incx); + auto const iy = i * static_cast(incy); + auto const temp = c * x[ix] + s * y[iy]; + y[iy] = c * y[iy] - s * x[ix]; + x[ix] = temp; + } + } +} + +/** SCAL scales a vector x of dimension n by a factor da. + Launch this kernel with a desired number of threads organized in + NG groups in the x direction with NT threads in the x direction. 
**/ +template +ROCSOLVER_KERNEL void scal_kernel(I const n, S const da, T* const x, I const incx) +{ + if(n <= 0) + return; + + I const i_start = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + I const i_inc = hipBlockDim_x * hipGridDim_x; + + S const zero = 0; + bool const is_da_zero = (da == zero); + if(incx == 1) + { + for(I i = i_start; i < n; i += i_inc) + { + x[i] = da * x[i]; + } + } + else + { + // --------------------------- + // code for non-unit increments + // --------------------------- + + for(I i = i_start; i < n; i += i_inc) + { + auto const ix = i * static_cast(incx); + x[ix] = da * x[ix]; + } + } +} + ROCSOLVER_END_NAMESPACE diff --git a/library/src/include/lapack_host_functions.hpp b/library/src/include/lapack_host_functions.hpp new file mode 100644 index 000000000..c120151ce --- /dev/null +++ b/library/src/include/lapack_host_functions.hpp @@ -0,0 +1,600 @@ +/* ************************************************************************** + * Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * *************************************************************************/ + +#pragma once + +#include "lib_host_helpers.hpp" +#include "lib_macros.hpp" +#include "rocsolver/rocsolver.h" + +ROCSOLVER_BEGIN_NAMESPACE + +/* + * =========================================================================== + * common location for host functions that reproduce LAPACK + * and BLAS functionality. + * =========================================================================== + */ + +static void call_lamch(char& cmach, float& eps) +{ + switch(cmach) + { + case 'E': + case 'e': eps = std::numeric_limits::epsilon(); return; + case 'S': + case 's': eps = std::numeric_limits::min(); return; + case 'B': + case 'b': eps = FLT_RADIX; return; + default: eps = std::numeric_limits::min(); + } +} + +static void call_lamch(char& cmach, double& eps) +{ + switch(cmach) + { + case 'E': + case 'e': eps = std::numeric_limits::epsilon(); return; + case 'S': + case 's': eps = std::numeric_limits::min(); return; + case 'B': + case 'b': eps = FLT_RADIX; return; + default: eps = std::numeric_limits::min(); + } +} + +template +static void call_las2(T& f, T& g, T& h, T& ssmin, T& ssmax) +{ + T const zero = 0; + T const one = 1; + T const two = 2; + + T as, at, au, c, fa, fhmn, fhmx, ga, ha; + + auto square = [](auto x) { return (x * x); }; + + fa = std::abs(f); + ga = std::abs(g); + ha = std::abs(h); + fhmn = std::min(fa, ha); + fhmx = std::max(fa, ha); + if(fhmn == zero) 
+ { + ssmin = zero; + if(fhmx == zero) + { + ssmax = ga; + } + else + { + // ssmax = max( fhmx, ga )*sqrt( one+ ( min( fhmx, ga ) / max( fhmx, ga ) )**2 ); + ssmax = std::max(fhmx, ga) + * std::sqrt(one + square(std::min(fhmx, ga) / std::max(fhmx, ga))); + } + } + else + { + if(ga < fhmx) + { + as = one + fhmn / fhmx; + at = (fhmx - fhmn) / fhmx; + au = square(ga / fhmx); + c = two / (std::sqrt(as * as + au) + std::sqrt(at * at + au)); + ssmin = fhmn * c; + ssmax = fhmx / c; + } + else + { + au = fhmx / ga; + if(au == zero) + { + // + // avoid possible harmful underflow if exponent range + // asymmetric (true ssmin may not underflow even if + // au underflows) + // + ssmin = (fhmn * fhmx) / ga; + ssmax = ga; + } + else + { + as = one + fhmn / fhmx; + at = (fhmx - fhmn) / fhmx; + // c = one / ( sqrt( one+( as*au )**2 )+ sqrt( one+( at*au )**2 ) ); + c = one / (std::sqrt(one + square(as * au)) + std::sqrt(one + square(at * au))); + ssmin = (fhmn * c) * au; + ssmin = ssmin + ssmin; + ssmax = ga / (c + c); + } + } + } +} + +template +static void call_lartg(T& f, T& g, S& cs, T& sn, T& r) +{ + // ------------------------------------------------------ + // lartg generates a plane rotation so that + // [ cs sn ] * [ f ] = [ r ] + // [ -sn cs ] [ g ] [ 0 ] + // + // where cs * cs + abs(sn)*abs(sn) == 1 + // if g == 0, then cs == 1, sn == 0 + // if f == 0, then cs = 0, sn is chosen so that r is real + // ------------------------------------------------------ + + auto dble = [](auto z) { return (static_cast(real_part(z))); }; + auto dimag = [](auto z) { return (static_cast(imag_part(z))); }; + auto disnan = [](auto x) -> bool { return (isnan(x)); }; + auto dcmplx = [](auto x, auto y) -> T { + bool constexpr is_complex_type = rocblas_is_complex; + + if constexpr(is_complex_type) + { + return (T(x, y)); + } + else + { + return (T(x)); + }; + }; + auto dconjg = [&](auto z) { return (dcmplx(dble(z), -dimag(z))); }; + + auto square = [](auto x) { return (x * x); }; + + auto abs1 
= [&](auto ff) { return (std::max(std::abs(dble(ff)), std::abs(dimag(ff)))); }; + auto abssq = [&](auto ff) { return (square(dble(ff)) + square(dimag(ff))); }; + + // ----------------------------------------- + // compute sqrt( x * x + y * y ) + // without unnecessary overflow or underflow + // ----------------------------------------- + auto dlapy2 = [&](auto x, auto y) { + auto const one = 1; + auto const zero = 0; + + auto ddlapy2 = x; + bool const x_is_nan = disnan(x); + bool const y_is_nan = disnan(y); + if(x_is_nan) + ddlapy2 = x; + if(y_is_nan) + ddlapy2 = y; + + if(!(x_is_nan || y_is_nan)) + { + auto const xabs = std::abs(x); + auto const yabs = std::abs(y); + auto const w = std::max(xabs, yabs); + auto const z = std::min(xabs, yabs); + if(z == zero) + { + ddlapy2 = w; + } + else + { + ddlapy2 = w * std::sqrt(one + square(z / w)); + } + } + return (ddlapy2); + }; + + char cmach = 'E'; + S const zero = 0; + S const one = 1; + S const two = 2; + T const czero = 0; + + bool has_work; + bool first; + int count, i; + S d, di, dr, eps, f2, f2s, g2, g2s, safmin; + S safmn2, safmx2, scale; + T ff, fs, gs; + + // safmin = dlamch( 's' ) + cmach = 'S'; + call_lamch(cmach, safmin); + + // eps = dlamch( 'e' ) + cmach = 'E'; + call_lamch(cmach, eps); + + // safmn2 = dlamch( 'b' )**int( log( safmin / eps ) / log( dlamch( 'b' ) ) / two ) + cmach = 'B'; + S radix = 2; + call_lamch(cmach, radix); + + int const npow = (std::log(safmin / eps) / std::log(radix) / two); + safmn2 = std::pow(radix, npow); + safmx2 = one / safmn2; + scale = std::max(abs1(f), abs1(g)); + fs = f; + gs = g; + count = 0; + + if(scale >= safmx2) + { + do + { + count = count + 1; + fs = fs * safmn2; + gs = gs * safmn2; + scale = scale * safmn2; + has_work = ((scale >= safmx2) && (count < 20)); + } while(has_work); + } + else + { + if(scale <= safmn2) + { + if((g == czero) || disnan(std::abs(g))) + { + cs = one; + sn = czero; + r = f; + return; + } + do + { + count = count - 1; + fs = fs * safmx2; + gs = 
gs * safmx2; + scale = scale * safmx2; + has_work = (scale <= safmn2); + } while(has_work); + } + f2 = abssq(fs); + g2 = abssq(gs); + if(f2 <= std::max(g2, one) * safmin) + { + // + // this is a rare case: f is very small. + // + if(f == czero) + { + cs = zero; + r = dlapy2(dble(g), dimag(g)); + // do complex/real division explicitly with two real divisions + d = dlapy2(dble(gs), dimag(gs)); + sn = dcmplx(dble(gs) / d, -dimag(gs) / d); + return; + } + f2s = dlapy2(dble(fs), dimag(fs)); + // g2 and g2s are accurate + // g2 is at least safmin, and g2s is at least safmn2 + g2s = std::sqrt(g2); + // error in cs from underflow in f2s is at most + // unfl / safmn2 < sqrt(unfl*eps) .lt. eps + // if max(g2,one)=g2, then f2 < g2*safmin, + // and so cs < sqrt(safmin) + // if max(g2,one)=one, then f2 < safmin + // and so cs < sqrt(safmin)/safmn2 = sqrt(eps) + // therefore, cs = f2s/g2s / sqrt( 1 + (f2s/g2s)**2 ) = f2s/g2s + cs = f2s / g2s; + // make sure abs(ff) = 1 + // do complex/real division explicitly with 2 real divisions + if(abs1(f) > one) + { + d = dlapy2(dble(f), dimag(f)); + ff = dcmplx(dble(f) / d, dimag(f) / d); + } + else + { + dr = safmx2 * dble(f); + di = safmx2 * dimag(f); + d = dlapy2(dr, di); + ff = dcmplx(dr / d, di / d); + } + sn = ff * dcmplx(dble(gs) / g2s, -dimag(gs) / g2s); + r = cs * f + sn * g; + } + else + { + // + // this is the most common case. 
+ // neither f2 nor f2/g2 are less than safmin + // f2s cannot overflow, and it is accurate + // + f2s = std::sqrt(one + g2 / f2); + // do the f2s(real)*fs(complex) multiply with two real multiplies + r = dcmplx(f2s * dble(fs), f2s * dimag(fs)); + cs = one / f2s; + d = f2 + g2; + // do complex/real division explicitly with two real divisions + sn = dcmplx(dble(r) / d, dimag(r) / d); + sn = sn * dconjg(gs); + if(count != 0) + { + if(count > 0) + { + for(i = 1; i <= count; i++) + { + r = r * safmx2; + }; + } + else + { + for(i = 1; i <= -count; i++) + { + r = r * safmn2; + } + } + } + } + } +} + +template +static void call_scal(I& n, S& a, T& x_in, I& incx) +{ + bool const is_zero = (a == 0); + T* const x = &x_in; + for(I i = 0; i < n; i++) + { + auto const ip = i * incx; + x[ip] *= a; + } +} + +template +static void call_rot(I& n, T& x_in, I& incx, T& y_in, I& incy, S& c, S& s) +{ + T* const x = &(x_in); + T* const y = &(y_in); + + for(I i = 0; i < n; i++) + { + auto const ix = i * incx; + auto const iy = i * incy; + + auto const temp = c * x[ix] + s * y[iy]; + y[iy] = c * y[iy] - s * x[ix]; + x[ix] = temp; + } +} + +// -------------------------------------------------------- +// lasv2 computes the singular value decomposition of a 2 x 2 +// triangular matrix +// [ F G ] +// [ 0 H ] +// +// on return, +// abs(ssmax) is the larger singular value, +// abs(ssmin) is the smaller singular value, +// (csl,snl) and (csr,snr) are the left and right +// singular vectors for abs(ssmax) +// +// [ csl snl] [ F G ] [ csr -snr] = [ ssmax 0 ] +// [-snl csl] [ 0 H ] [ snr csr] [ 0 ssmin ] +// -------------------------------------------------------- +template +static void call_lasv2(T& f, T& g, T& h, T& ssmin, T& ssmax, T& snr, T& csr, T& snl, T& csl) +{ + T const zero = 0; + T const one = 1; + T const two = 2; + T const four = 4; + T const half = one / two; + + bool gasmal; + bool swap; + int pmax; + char cmach; + + T a, clt, crt, d, fa, ft, ga, gt, ha, ht, l, m; + T mm, r, s, slt, 
srt, t, temp, tsign, tt; + T macheps; + + auto sign = [](auto a, auto b) { + auto const abs_a = std::abs(a); + return ((b >= 0) ? abs_a : -abs_a); + }; + + ft = f; + fa = std::abs(ft); + ht = h; + ha = std::abs(h); + // + // pmax points to the maximum absolute element of matrix + // pmax = 1 if f largest in absolute values + // pmax = 2 if g largest in absolute values + // pmax = 3 if h largest in absolute values + // + pmax = 1; + swap = (ha > fa); + if(swap) + { + pmax = 3; + temp = ft; + ft = ht; + ht = temp; + temp = fa; + fa = ha; + ha = temp; + // + // now fa >= ha + // + } + gt = g; + ga = std::abs(gt); + if(ga == zero) + { + // + // diagonal matrix + // + ssmin = ha; + ssmax = fa; + clt = one; + crt = one; + slt = zero; + srt = zero; + } + else + { + gasmal = true; + if(ga > fa) + { + pmax = 2; + + cmach = 'E'; + call_lamch(cmach, macheps); + + if((fa / ga) < macheps) + { + // + // case of very large ga + // + gasmal = false; + ssmax = ga; + if(ha > one) + { + ssmin = fa / (ga / ha); + } + else + { + ssmin = (fa / ga) * ha; + } + clt = one; + slt = ht / gt; + srt = one; + crt = ft / gt; + } + } + if(gasmal) + { + // + // normal case + // + d = fa - ha; + if(d == fa) + { + // + // copes with infinite f or h + // + l = one; + } + else + { + l = d / fa; + } + // + // note that 0 <= l <= 1 + // + m = gt / ft; + // + // note that abs(m) <= 1/macheps + // + t = two - l; + // + // note that t >= 1 + // + mm = m * m; + tt = t * t; + s = std::sqrt(tt + mm); + // + // note that 1 <= s <= 1 + 1/macheps + // + if(l == zero) + { + r = std::abs(m); + } + else + { + r = std::sqrt(l * l + mm); + } + // + // note that 0 <= r .le. 1 + 1/macheps + // + a = half * (s + r); + // + // note that 1 <= a .le. 
1 + abs(m) + // + ssmin = ha / a; + ssmax = fa * a; + if(mm == zero) + { + // + // note that m is very tiny + // + if(l == zero) + { + t = sign(two, ft) * sign(one, gt); + } + else + { + t = gt / sign(d, ft) + m / t; + } + } + else + { + t = (m / (s + t) + m / (r + l)) * (one + a); + } + l = std::sqrt(t * t + four); + crt = two / l; + srt = t / l; + clt = (crt + srt * m) / a; + slt = (ht / ft) * srt / a; + } + } + if(swap) + { + csl = srt; + snl = crt; + csr = slt; + snr = clt; + } + else + { + csl = clt; + snl = slt; + csr = crt; + snr = srt; + } + // + // correct signs of ssmax and ssmin + // + if(pmax == 1) + { + tsign = sign(one, csr) * sign(one, csl) * sign(one, f); + } + if(pmax == 2) + { + tsign = sign(one, snr) * sign(one, csl) * sign(one, g); + } + if(pmax == 3) + { + tsign = sign(one, snr) * sign(one, snl) * sign(one, h); + } + ssmax = sign(ssmax, tsign); + ssmin = sign(ssmin, tsign * sign(one, f) * sign(one, h)); +} + +ROCSOLVER_END_NAMESPACE diff --git a/library/src/include/lib_device_helpers.hpp b/library/src/include/lib_device_helpers.hpp index 74f82a15f..dc011f03a 100644 --- a/library/src/include/lib_device_helpers.hpp +++ b/library/src/include/lib_device_helpers.hpp @@ -1237,4 +1237,46 @@ __device__ static void permute_swap(const I n, T* C, I ldc, I* map, const I nev #endif } +/** SWAP swaps the values of vectors x and y of dimension n. + Launch this kernel with a desired number of threads organized in + NG groups in the x direction with NT threads in the x direction. 
**/ +template +ROCSOLVER_KERNEL void swap_kernel(I const n, T* const x, I const incx, T* const y, I const incy) +{ + if(n <= 0) + return; + + I const i_start = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + I const i_inc = hipBlockDim_x * hipGridDim_x; + + if((incx == 1) && (incy == 1)) + { + // ------------ + // special case + // ------------ + for(I i = i_start; i < n; i += i_inc) + { + auto const temp = y[i]; + y[i] = x[i]; + x[i] = temp; + } + } + else + { + // --------------------------- + // code for unequal increments + // --------------------------- + + for(I i = i_start; i < n; i += i_inc) + { + auto const ix = 0 + i * static_cast(incx); + auto const iy = 0 + i * static_cast(incy); + + auto const temp = y[iy]; + y[iy] = x[ix]; + x[ix] = temp; + } + } +} + ROCSOLVER_END_NAMESPACE diff --git a/library/src/include/lib_host_helpers.hpp b/library/src/include/lib_host_helpers.hpp index bca4aae41..8dfad5014 100644 --- a/library/src/include/lib_host_helpers.hpp +++ b/library/src/include/lib_host_helpers.hpp @@ -92,6 +92,74 @@ I get_index(I* intervals, I max, I dim) return i; } +template +static void call_swap(I& n, T& x_in, I& incx, T& y_in, I& incy) +{ + T* const x = &(x_in); + T* const y = &(y_in); + for(I i = 0; i < n; i++) + { + auto const ix = i * static_cast(incx); + auto const iy = i * static_cast(incy); + + T const temp = x[ix]; + x[ix] = y[iy]; + y[iy] = temp; + } +} + +static float real_part(float z) +{ + return (z); +} +static float real_part(std::complex z) +{ + return (z.real()); +} +static float real_part(rocblas_complex_num z) +{ + return (z.real()); +} + +static double real_part(double z) +{ + return (z); +} +static double real_part(std::complex z) +{ + return (z.real()); +} +static double real_part(rocblas_complex_num z) +{ + return (z.real()); +} + +static float imag_part(float z) +{ + return (0); +} +static float imag_part(std::complex z) +{ + return (z.imag()); +} +static float imag_part(rocblas_complex_num z) +{ + return (z.imag()); +} + 
+static double imag_part(double z) +{ + return (0); +} +static double imag_part(std::complex z) +{ + return (z.imag()); +} +static double imag_part(rocblas_complex_num z) +{ + return (z.imag()); +} + #ifdef ROCSOLVER_VERIFY_ASSUMPTIONS // Ensure __assert_fail is declared. #if !__is_identifier(__assert_fail) @@ -153,4 +221,12 @@ extern "C" [[noreturn]] void __assert_fail(const char* assertion, #define ROCSOLVER_ASSUME_X(invariant, msg) __builtin_assume(invariant) #endif +#ifndef CHECK_HIP +#define CHECK_HIP(fcn) \ + { \ + hipError_t const istat = (fcn); \ + assert(istat == hipSuccess); \ + } +#endif + ROCSOLVER_END_NAMESPACE diff --git a/library/src/include/rocsolver_handle.hpp b/library/src/include/rocsolver_handle.hpp new file mode 100644 index 000000000..2e4177556 --- /dev/null +++ b/library/src/include/rocsolver_handle.hpp @@ -0,0 +1,44 @@ +/* ************************************************************************** + * Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * *************************************************************************/ + +#pragma once + +#include "common_host_helpers.hpp" +#include "rocsolver/rocsolver.h" + +ROCSOLVER_BEGIN_NAMESPACE + +struct rocsolver_handle_data_ +{ + rocblas_int checksum; + + rocsolver_alg_mode bdsqr_mode = rocsolver_alg_mode_gpu; +}; + +typedef struct rocsolver_handle_data_* rocsolver_handle_data; + +ROCSOLVER_END_NAMESPACE