diff --git a/CHANGELOG.md b/CHANGELOG.md index 45187e998..c919d5722 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,11 @@ Full documentation for rocSOLVER is available at the [rocSOLVER documentation](h * Application of a sequence of plane rotations to a given matrix - LASR +* Algorithm selection APIs for hybrid computation +* SVD of bidiagonal matrices routine: + - BDSQR now supports hybrid computation +* SVD of general matrices routine: + - GESVD now supports hybrid computation ### Changed ### Removed diff --git a/CMakeLists.txt b/CMakeLists.txt index 91a80175c..d8e77749f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -204,7 +204,7 @@ find_package(hip REQUIRED CONFIG PATHS ${ROCM_PATH} /opt/rocm) find_package(rocblas REQUIRED CONFIG PATHS ${ROCM_PATH}) get_imported_target_location(location roc::rocblas) message(STATUS "Found rocBLAS: ${location}") -set(rocblas_minimum 4.1) +set(rocblas_minimum 4.4) rocm_package_add_dependencies(SHARED_DEPENDS "rocblas >= ${rocblas_minimum}") rocm_package_add_rpm_dependencies(STATIC_DEPENDS "rocblas-static-devel >= ${rocblas_minimum}") rocm_package_add_deb_dependencies(STATIC_DEPENDS "rocblas-static-dev >= ${rocblas_minimum}") diff --git a/clients/benchmarks/client.cpp b/clients/benchmarks/client.cpp index 82669d341..16a244199 100644 --- a/clients/benchmarks/client.cpp +++ b/clients/benchmarks/client.cpp @@ -125,6 +125,13 @@ try " Reported time will be the average.\n" " ") + ("alg_mode", + value(&argus.alg_mode)->default_value(0), + "0 = GPU-only, 1 = Hybrid\n" + " This will change how the algorithm operates.\n" + " Only applicable to functions with hybrid support.\n" + " ") + ("mem_query", value(&argus.mem_query)->default_value(0), "Calculate the required amount of device workspace memory? 
0 = No, 1 = Yes.\n" diff --git a/clients/common/auxiliary/testing_bdsqr.hpp b/clients/common/auxiliary/testing_bdsqr.hpp index 2f2a984f4..058a923b4 100644 --- a/clients/common/auxiliary/testing_bdsqr.hpp +++ b/clients/common/auxiliary/testing_bdsqr.hpp @@ -478,6 +478,19 @@ void testing_bdsqr(Arguments& argus) rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int hot_calls = argus.iters; + if(argus.alg_mode) + { + EXPECT_ROCBLAS_STATUS( + rocsolver_set_alg_mode(handle, rocsolver_function_bdsqr, rocsolver_alg_mode_hybrid), + rocblas_status_success); + + rocsolver_alg_mode alg_mode; + EXPECT_ROCBLAS_STATUS(rocsolver_get_alg_mode(handle, rocsolver_function_bdsqr, &alg_mode), + rocblas_status_success); + + EXPECT_EQ(alg_mode, rocsolver_alg_mode_hybrid); + } + // check non-supported values if(uplo != rocblas_fill_upper && uplo != rocblas_fill_lower) { diff --git a/clients/common/lapack/testing_gesvd.hpp b/clients/common/lapack/testing_gesvd.hpp index 2fbf49dcd..de7fa10d3 100644 --- a/clients/common/lapack/testing_gesvd.hpp +++ b/clients/common/lapack/testing_gesvd.hpp @@ -548,6 +548,19 @@ void testing_gesvd(Arguments& argus) rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; + if(argus.alg_mode) + { + EXPECT_ROCBLAS_STATUS( + rocsolver_set_alg_mode(handle, rocsolver_function_gesvd, rocsolver_alg_mode_hybrid), + rocblas_status_success); + + rocsolver_alg_mode alg_mode; + EXPECT_ROCBLAS_STATUS(rocsolver_get_alg_mode(handle, rocsolver_function_gesvd, &alg_mode), + rocblas_status_success); + + EXPECT_EQ(alg_mode, rocsolver_alg_mode_hybrid); + } + // check non-supported values if(rightv == rocblas_svect_overwrite && leftv == rocblas_svect_overwrite) { diff --git a/clients/common/misc/rocsolver_arguments.hpp b/clients/common/misc/rocsolver_arguments.hpp index cfcb887eb..a8ffd9e73 100644 --- a/clients/common/misc/rocsolver_arguments.hpp +++ b/clients/common/misc/rocsolver_arguments.hpp @@ -54,6 +54,7 @@ class Arguments : private std::map rocblas_int 
perf = 0; rocblas_int singular = 0; rocblas_int iters = 5; + rocblas_int alg_mode = 0; rocblas_int mem_query = 0; rocblas_int profile = 0; rocblas_int profile_kernels = 0; @@ -112,6 +113,7 @@ class Arguments : private std::map to_consume.erase("batch_count"); to_consume.erase("verify"); to_consume.erase("iters"); + to_consume.erase("alg_mode"); to_consume.erase("mem_query"); to_consume.erase("profile"); to_consume.erase("profile_kernels"); diff --git a/clients/gtest/auxiliary/bdsqr_gtest.cpp b/clients/gtest/auxiliary/bdsqr_gtest.cpp index b963f6029..4e78b0bea 100644 --- a/clients/gtest/auxiliary/bdsqr_gtest.cpp +++ b/clients/gtest/auxiliary/bdsqr_gtest.cpp @@ -112,7 +112,8 @@ Arguments bdsqr_setup_arguments(bdsqr_tuple tup) return arg; } -class BDSQR : public ::TestWithParam +template +class BDSQR_BASE : public ::TestWithParam { protected: void TearDown() override @@ -124,6 +125,7 @@ class BDSQR : public ::TestWithParam void run_tests() { Arguments arg = bdsqr_setup_arguments(GetParam()); + arg.alg_mode = MODE; if(arg.peek("n") == 0 && arg.peek("uplo") == 'L') testing_bdsqr_bad_arg(); @@ -132,6 +134,14 @@ class BDSQR : public ::TestWithParam } }; +class BDSQR : public BDSQR_BASE<0> +{ +}; + +class BDSQR_HYBRID : public BDSQR_BASE<1> +{ +}; + // non-batch tests TEST_P(BDSQR, __float) @@ -154,8 +164,36 @@ TEST_P(BDSQR, __double_complex) run_tests(); } +TEST_P(BDSQR_HYBRID, __float) +{ + run_tests(); +} + +TEST_P(BDSQR_HYBRID, __double) +{ + run_tests(); +} + +TEST_P(BDSQR_HYBRID, __float_complex) +{ + run_tests(); +} + +TEST_P(BDSQR_HYBRID, __double_complex) +{ + run_tests(); +} + INSTANTIATE_TEST_SUITE_P(daily_lapack, BDSQR, Combine(ValuesIn(large_size_range), ValuesIn(large_opt_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, BDSQR, Combine(ValuesIn(size_range), ValuesIn(opt_range))); + +INSTANTIATE_TEST_SUITE_P(daily_lapack, + BDSQR_HYBRID, + Combine(ValuesIn(large_size_range), ValuesIn(large_opt_range))); + +INSTANTIATE_TEST_SUITE_P(checkin_lapack, + 
BDSQR_HYBRID, + Combine(ValuesIn(size_range), ValuesIn(opt_range))); diff --git a/clients/gtest/lapack/gesvd_gtest.cpp b/clients/gtest/lapack/gesvd_gtest.cpp index 51acceda3..3ee770c1d 100644 --- a/clients/gtest/lapack/gesvd_gtest.cpp +++ b/clients/gtest/lapack/gesvd_gtest.cpp @@ -158,7 +158,8 @@ Arguments gesvd_setup_arguments(gesvd_tuple tup) return arg; } -class GESVD : public ::TestWithParam +template +class GESVD_BASE : public ::TestWithParam { protected: void TearDown() override @@ -170,6 +171,7 @@ class GESVD : public ::TestWithParam void run_tests() { Arguments arg = gesvd_setup_arguments(GetParam()); + arg.alg_mode = MODE; if(arg.peek("m") == 0 && arg.peek("n") == 0 && arg.peek("left_svect") == 'N' && arg.peek("right_svect") == 'N') @@ -180,6 +182,14 @@ class GESVD : public ::TestWithParam } }; +class GESVD : public GESVD_BASE<0> +{ +}; + +class GESVD_HYBRID : public GESVD_BASE<1> +{ +}; + // non-batch tests TEST_P(GESVD, __float) @@ -202,6 +212,26 @@ TEST_P(GESVD, __double_complex) run_tests(); } +TEST_P(GESVD_HYBRID, __float) +{ + run_tests(); +} + +TEST_P(GESVD_HYBRID, __double) +{ + run_tests(); +} + +TEST_P(GESVD_HYBRID, __float_complex) +{ + run_tests(); +} + +TEST_P(GESVD_HYBRID, __double_complex) +{ + run_tests(); +} + // batched tests TEST_P(GESVD, batched__float) @@ -224,6 +254,26 @@ TEST_P(GESVD, batched__double_complex) run_tests(); } +TEST_P(GESVD_HYBRID, batched__float) +{ + run_tests(); +} + +TEST_P(GESVD_HYBRID, batched__double) +{ + run_tests(); +} + +TEST_P(GESVD_HYBRID, batched__float_complex) +{ + run_tests(); +} + +TEST_P(GESVD_HYBRID, batched__double_complex) +{ + run_tests(); +} + // strided_batched tests TEST_P(GESVD, strided_batched__float) @@ -246,11 +296,36 @@ TEST_P(GESVD, strided_batched__double_complex) run_tests(); } -// daily_lapack tests normal execution with medium to large sizes +TEST_P(GESVD_HYBRID, strided_batched__float) +{ + run_tests(); +} + +TEST_P(GESVD_HYBRID, strided_batched__double) +{ + run_tests(); +} + 
+TEST_P(GESVD_HYBRID, strided_batched__float_complex) +{ + run_tests(); +} + +TEST_P(GESVD_HYBRID, strided_batched__double_complex) +{ + run_tests(); +} + INSTANTIATE_TEST_SUITE_P(daily_lapack, GESVD, Combine(ValuesIn(large_size_range), ValuesIn(large_opt_range))); -// checkin_lapack tests normal execution with small sizes, invalid sizes, -// quick returns, and corner cases INSTANTIATE_TEST_SUITE_P(checkin_lapack, GESVD, Combine(ValuesIn(size_range), ValuesIn(opt_range))); + +INSTANTIATE_TEST_SUITE_P(daily_lapack, + GESVD_HYBRID, + Combine(ValuesIn(large_size_range), ValuesIn(large_opt_range))); + +INSTANTIATE_TEST_SUITE_P(checkin_lapack, + GESVD_HYBRID, + Combine(ValuesIn(size_range), ValuesIn(opt_range))); diff --git a/docs/index.rst b/docs/index.rst index d7725421c..e1c39a266 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -8,7 +8,7 @@ rocSOLVER documentation ******************************************************************** -rocSOLVER is an implementation of `LAPACK routines `_ +rocSOLVER is an implementation of `LAPACK routines `_ on top of the :doc:`AMD ROCm platform `. rocSOLVER is implemented in the :doc:`HIP programming language ` and optimized for AMD GPUs. @@ -39,7 +39,7 @@ The rocSOLVER documentation is structured as follows: * :ref:`lapackfunc` * :ref:`lapack-like` * :ref:`refactor` - * :ref:`api_logging` + * :ref:`helpers` * :ref:`tuning_label` * :ref:`deprecated` diff --git a/docs/reference/auxiliary.rst b/docs/reference/auxiliary.rst index 83503e7e2..8bebbf116 100644 --- a/docs/reference/auxiliary.rst +++ b/docs/reference/auxiliary.rst @@ -58,7 +58,7 @@ rocsolver_lacgv() .. doxygenfunction:: rocsolver_zlacgv_64 :outline: .. doxygenfunction:: rocsolver_clacgv_64 - :outline + :outline: .. doxygenfunction:: rocsolver_zlacgv :outline: .. 
doxygenfunction:: rocsolver_clacgv diff --git a/docs/reference/logging.rst b/docs/reference/helpers.rst similarity index 64% rename from docs/reference/logging.rst rename to docs/reference/helpers.rst index 26c02e42f..f8051f03f 100644 --- a/docs/reference/logging.rst +++ b/docs/reference/helpers.rst @@ -2,16 +2,63 @@ :description: rocSOLVER documentation and API reference library :keywords: rocSOLVER, ROCm, API, documentation -.. _api_logging: +.. _helpers: ***************************************************** -rocSOLVER Logging Functions and Library Information +rocSOLVER Library and Logging Functions ***************************************************** -Logging functions +These are helper functions that retrieve information and control some functions of the library. +The helper functions are divided into the following categories: + +* :ref:`lib_info`. These functions return information about the library version. +* :ref:`algo_select`. Functions to select different algorithm modes of certain APIs. +* :ref:`api_logging`. These functions control the :ref:`logging-label` capabilities. + + + +.. _lib_info: + +Library information +=============================== + +.. contents:: List of library information functions + :local: + :backlinks: top + +rocsolver_get_version_string() +------------------------------------ +.. doxygenfunction:: rocsolver_get_version_string + +rocsolver_get_version_string_size() +------------------------------------ +.. doxygenfunction:: rocsolver_get_version_string_size + + + +.. _algo_select: + +Algorithm selection =============================== -These functions control rocSOLVER's :ref:`logging-label` capabilities. +.. contents:: List of algorithm selection functions + :local: + :backlinks: top + +rocsolver_set_alg_mode() +------------------------------------ +.. doxygenfunction:: rocsolver_set_alg_mode + +rocsolver_get_alg_mode() +------------------------------------ +.. doxygenfunction:: rocsolver_get_alg_mode + + + +.. 
_api_logging: + +Logging functions +=============================== .. contents:: List of logging functions :local: @@ -45,22 +92,3 @@ rocsolver_log_flush_profile() --------------------------------- .. doxygenfunction:: rocsolver_log_flush_profile - - -.. _libraryinfo: - -Library information -=============================== - -.. contents:: List of library information functions - :local: - :backlinks: top - -rocsolver_get_version_string() ------------------------------------- -.. doxygenfunction:: rocsolver_get_version_string - -rocsolver_get_version_string_size() ------------------------------------- -.. doxygenfunction:: rocsolver_get_version_string_size - diff --git a/docs/reference/index.rst b/docs/reference/index.rst index b82b12dc2..18a2fb338 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -8,15 +8,15 @@ Reference ######################################## -This section provides technical descriptions and important information about -the different rocSOLVER APIs and library components. +This section provides technical descriptions and important information about +the different rocSOLVER APIs and library components. * :ref:`intro` * :ref:`rocsolver-types` -* :ref:`rocsolver_auxiliary_functions` +* :ref:`rocsolver_auxiliary_functions` * :ref:`lapackfunc` * :ref:`lapack-like` * :ref:`refactor` -* :ref:`api_logging` +* :ref:`helpers` * :ref:`tuning_label` * :ref:`deprecated` diff --git a/docs/reference/lapacklike.rst b/docs/reference/lapacklike.rst index b6b2823c5..68b6881f3 100644 --- a/docs/reference/lapacklike.rst +++ b/docs/reference/lapacklike.rst @@ -547,7 +547,7 @@ rocsolver_syevdx_strided_batched() rocsolver_heevdx() --------------------------------------------------- .. doxygenfunction:: rocsolver_zheevdx - :outline + :outline: .. 
doxygenfunction:: rocsolver_cheevdx rocsolver_heevdx_batched() diff --git a/docs/reference/tuning.rst b/docs/reference/tuning.rst index 82cba73d2..8b59e356a 100644 --- a/docs/reference/tuning.rst +++ b/docs/reference/tuning.rst @@ -180,14 +180,18 @@ GEBRD_GEBD2_SWITCHSIZE bdsqr function ================== -The Singular Value Decomposition of a bidiagonal matrix could be sped up by splitting the matrix into diagonal blocks -and processing those blocks in parallel. +The Singular Value Decomposition of a bidiagonal matrix can be executed with one or multiple thread blocks, and it is +a blocking API that requires synchronization with the host. -BDSQR_SPLIT_GROUPS +BDSQR_SWITCH_SIZE ------------------- -.. doxygendefine:: BDSQR_SPLIT_GROUPS +.. doxygendefine:: BDSQR_SWITCH_SIZE -(As of the current rocSOLVER release, this constant has not been tuned for any specific cases.) +BDSQR_ITERS_PER_SYNC +---------------------- +.. doxygendefine:: BDSQR_ITERS_PER_SYNC + +(As of the current rocSOLVER release, these constants have not been tuned for any specific cases.) diff --git a/docs/reference/types.rst b/docs/reference/types.rst index 6706f0e23..0fb8d716c 100644 --- a/docs/reference/types.rst +++ b/docs/reference/types.rst @@ -70,3 +70,11 @@ rocsolver_rfinfo rocsolver_rfinfo_mode ------------------------ .. doxygenenum:: rocsolver_rfinfo_mode + +rocsolver_alg_mode +------------------------ +.. doxygenenum:: rocsolver_alg_mode + +rocsolver_function +------------------------ +.. 
doxygenenum:: rocsolver_function diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index e5d3606ce..6ad32dfe3 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -23,7 +23,7 @@ subtrees: - file: reference/lapack.rst - file: reference/lapacklike.rst - file: reference/refact.rst - - file: reference/logging.rst + - file: reference/helpers.rst - file: reference/tuning.rst - file: reference/deprecated.rst - file: license.rst diff --git a/library/include/rocsolver/rocsolver-aliases.h b/library/include/rocsolver/rocsolver-aliases.h index fec1ea401..de938bf22 100644 --- a/library/include/rocsolver/rocsolver-aliases.h +++ b/library/include/rocsolver/rocsolver-aliases.h @@ -1,5 +1,5 @@ /* ************************************************************************** - * Copyright (C) 2019-2023 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (C) 2019-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -91,7 +91,7 @@ ROCSOLVER_DEPRECATED_X("use rocblas_fill") typedef rocblas_fill rocsolver_fill; */ ROCSOLVER_DEPRECATED_X("use rocblas_diagonal") typedef rocblas_diagonal rocsolver_diagonal; -/*! \deprecated Use \c rocblas_stide. +/*! \deprecated Use \c rocblas_side. */ ROCSOLVER_DEPRECATED_X("use rocblas_side") typedef rocblas_side rocsolver_side; diff --git a/library/include/rocsolver/rocsolver-extra-types.h b/library/include/rocsolver/rocsolver-extra-types.h index a22d62ce9..3335f8f3d 100644 --- a/library/include/rocsolver/rocsolver-extra-types.h +++ b/library/include/rocsolver/rocsolver-extra-types.h @@ -173,4 +173,21 @@ typedef enum rocblas_pivot_ rocblas_pivot_bottom = 283, /**< The i-th rotation is applied on plane (i,m) or (i,n). */ } rocblas_pivot; +/*! \brief Used by specific functions to specify the algorithm mode. 
+ ********************************************************************************/ +typedef enum rocsolver_alg_mode_ +{ + rocsolver_alg_mode_gpu + = 291, /**< Computations are all performed on the GPU. This is the default mode. */ + rocsolver_alg_mode_hybrid = 292, /**< Computations are performed on the CPU and GPU. */ +} rocsolver_alg_mode; + +/*! \brief Used to specify a function with multiple supported algorithm modes. + ********************************************************************************/ +typedef enum rocsolver_function_ +{ + rocsolver_function_bdsqr = 401, + rocsolver_function_gesvd = 402, +} rocsolver_function; + #endif /* ROCSOLVER_EXTRA_TYPES_H */ diff --git a/library/include/rocsolver/rocsolver-functions.h b/library/include/rocsolver/rocsolver-functions.h index 1964166d8..bd9a0592d 100644 --- a/library/include/rocsolver/rocsolver-functions.h +++ b/library/include/rocsolver/rocsolver-functions.h @@ -141,6 +141,45 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_log_write_profile(void); ROCSOLVER_EXPORT rocblas_status rocsolver_log_flush_profile(void); +/* + * =========================================================================== + * Hybrid algorithm enablement + * =========================================================================== + */ + +/*! \brief SET_ALG_MODE sets the algorithm mode to be used by the specified function. + + @param[in] + handle rocblas_handle. + @param[in] + func #rocsolver_function. + The function that will use the selected algorithm mode. + @param[in] + mode #rocsolver_alg_mode. + The algorithm mode that will be used by the specified function. + *************************************************************************/ + +ROCSOLVER_EXPORT rocblas_status rocsolver_set_alg_mode(rocblas_handle handle, + const rocsolver_function func, + const rocsolver_alg_mode mode); + +/*! \brief GET_ALG_MODE gets the algorithm mode being used by the specified function. + + @param[in] + handle rocblas_handle. 
+ @param[in] + func #rocsolver_function. + The specified function. + @param[out] + mode pointer to #rocsolver_alg_mode. + On exit, the value is overwritten by the algorithm mode used + by the specified function. + *************************************************************************/ + +ROCSOLVER_EXPORT rocblas_status rocsolver_get_alg_mode(rocblas_handle handle, + const rocsolver_function func, + rocsolver_alg_mode* mode); + /* * =========================================================================== * Auxiliary functions @@ -3831,6 +3870,10 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_zunmtr(rocblas_handle handle, In order to carry out calculations, this method may synchronize the stream contained within the rocblas_handle. + \note + A hybrid (CPU+GPU) approach is available for BDSQR, primarily intended for homogeneous architectures. + Use \ref rocsolver_set_alg_mode to enable it. + @param[in] handle rocblas_handle. @param[in] @@ -12566,8 +12609,12 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_zpotri_strided_batched(rocblas_handle the "Tuning rocSOLVER performance" and "Memory model" sections of the documentation. \note - In order to carry out calculations, this method may synchronize the stream contained within the - rocblas_handle. + In order to carry out calculations, this method may synchronize the stream contained + within the rocblas_handle. + + \note + A hybrid (CPU+GPU) approach is available for GESVD, primarily intended for homogeneous architectures. + Use \ref rocsolver_set_alg_mode to enable it. @param[in] handle rocblas_handle. @@ -12743,8 +12790,12 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_zgesvd(rocblas_handle handle, and "Memory model" sections of the documentation. \note - In order to carry out calculations, this method may synchronize the stream contained within the - rocblas_handle. + In order to carry out calculations, this method may synchronize the stream contained + within the rocblas_handle. 
+ + \note + A hybrid (CPU+GPU) approach is available for GESVD_BATCHED, primarily intended for + homogeneous architectures. Use \ref rocsolver_set_alg_mode to enable it. @param[in] handle rocblas_handle. @@ -12961,8 +13012,12 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_zgesvd_batched(rocblas_handle handle, and "Memory model" sections of the documentation. \note - In order to carry out calculations, this method may synchronize the stream contained within the - rocblas_handle. + In order to carry out calculations, this method may synchronize the stream contained + within the rocblas_handle. + + \note + A hybrid (CPU+GPU) approach is available for GESVD_STRIDED_BATCHED, primarily intended + for homogeneous architectures. Use \ref rocsolver_set_alg_mode to enable it. @param[in] handle rocblas_handle. diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt index 5a6203457..e4033ea06 100755 --- a/library/src/CMakeLists.txt +++ b/library/src/CMakeLists.txt @@ -345,6 +345,7 @@ endif() set(auxiliaries common/buildinfo.cpp + common/rocsolver_handle.cpp common/rocsolver_logger.cpp common/rocsparse.cpp ) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr.hpp b/library/src/auxiliary/rocauxiliary_bdsqr.hpp index b89fd297d..c19ae4f9b 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr.hpp @@ -38,6 +38,8 @@ #include +#include "rocauxiliary_bdsqr_hybrid.hpp" + ROCSOLVER_BEGIN_NAMESPACE /******************** Device functions *************************/ @@ -1207,6 +1209,9 @@ rocblas_status rocsolver_bdsqr_template(rocblas_handle handle, hipStream_t stream; rocblas_get_stream(handle, &stream); + rocsolver_alg_mode alg_mode; + ROCBLAS_CHECK(rocsolver_get_alg_mode(handle, rocsolver_function_bdsqr, &alg_mode)); + // set tolerance and max number of iterations: // machine precision (considering rounding strategy) S eps = get_epsilon() / 2; @@ -1251,70 +1256,81 @@ rocblas_status rocsolver_bdsqr_template(rocblas_handle 
handle, if(n > 1) { - // rotate to upper bidiagonal if necessary - if(uplo == rocblas_fill_lower) + if(alg_mode == rocsolver_alg_mode_hybrid) { - ROCSOLVER_LAUNCH_KERNEL((bdsqr_lower2upper), gridBasic, threadsUC, 0, stream, n, nu, - nc, D, strideD, E, strideE, U, shiftU, ldu, strideU, C, shiftC, - ldc, strideC, info, work, strideW, completed); + ROCBLAS_CHECK(rocsolver_bdsqr_host_batch_template( + handle, uplo, n, nv, nu, nc, D, strideD, E, strideE, V, shiftV, ldv, strideV, U, + shiftU, ldu, strideU, C, shiftC, ldc, strideC, info, batch_count, splits_map, work)); } - - rocblas_int h_iter = 0; - struct + else { - rocblas_int completed; - rocblas_int num_splits; - } h_params; + // rotate to upper bidiagonal if necessary + if(uplo == rocblas_fill_lower) + { + ROCSOLVER_LAUNCH_KERNEL((bdsqr_lower2upper), gridBasic, threadsUC, 0, stream, n, + nu, nc, D, strideD, E, strideE, U, shiftU, ldu, strideU, C, + shiftC, ldc, strideC, info, work, strideW, completed); + } - while(h_iter < maxiter) - { - // if all instances in the batch have finished, exit the loop - HIP_CHECK(hipMemcpyAsync(&h_params, completed, sizeof(h_params), hipMemcpyDeviceToHost, - stream)); - HIP_CHECK(hipStreamSynchronize(stream)); + rocblas_int h_iter = 0; + struct + { + rocblas_int completed; + rocblas_int num_splits; + } h_params; - if(h_params.completed == batch_count) - break; + while(h_iter < maxiter) + { + // if all instances in the batch have finished, exit the loop + HIP_CHECK(hipMemcpyAsync(&h_params, completed, sizeof(h_params), + hipMemcpyDeviceToHost, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); - dim3 gridSplits(1, h_params.num_splits, batch_count); - dim3 gridVUC((nvuc_max - 1) / BS1 + 1, h_params.num_splits, batch_count); + if(h_params.completed == batch_count) + break; - for(rocblas_int inner_iters = 0; inner_iters < BDSQR_ITERS_PER_SYNC; inner_iters++) - { - if(nvuc_max <= BDSQR_SWITCH_SIZE) - { - // main computation of SVD - ROCSOLVER_LAUNCH_KERNEL((bdsqr_compute), gridSplits, 
threadsBS1, 0, - stream, n, nv, nu, nc, D, strideD, E, strideE, V, - shiftV, ldv, strideV, U, shiftU, ldu, strideU, C, - shiftC, ldc, strideC, maxiter, eps, sfm, tol, minshift, - splits_map, work, incW, strideW, completed); - } - else + dim3 gridSplits(1, h_params.num_splits, batch_count); + dim3 gridVUC((nvuc_max - 1) / BS1 + 1, h_params.num_splits, batch_count); + + for(rocblas_int inner_iters = 0; inner_iters < BDSQR_ITERS_PER_SYNC; inner_iters++) { - // main computation of SVD - ROCSOLVER_LAUNCH_KERNEL( - (bdsqr_compute), gridSplits, threadsBS1, 0, stream, n, nv, nu, nc, - D, strideD, E, strideE, (W1) nullptr, shiftV, ldv, strideV, (W2) nullptr, - shiftU, ldu, strideU, (W3) nullptr, shiftC, ldc, strideC, maxiter, eps, sfm, - tol, minshift, splits_map, work, incW, strideW, completed); - - // update singular vectors - ROCSOLVER_LAUNCH_KERNEL((bdsqr_rotate), gridVUC, threadsVUC, 0, stream, n, - nv, nu, nc, V, shiftV, ldv, strideV, U, shiftU, ldu, - strideU, C, shiftC, ldc, strideC, maxiter, splits_map, - work, incW, strideW, completed); + if(nvuc_max <= BDSQR_SWITCH_SIZE) + { + // main computation of SVD + ROCSOLVER_LAUNCH_KERNEL((bdsqr_compute), gridSplits, threadsBS1, 0, + stream, n, nv, nu, nc, D, strideD, E, strideE, V, + shiftV, ldv, strideV, U, shiftU, ldu, strideU, C, + shiftC, ldc, strideC, maxiter, eps, sfm, tol, + minshift, splits_map, work, incW, strideW, completed); + } + else + { + // main computation of SVD + ROCSOLVER_LAUNCH_KERNEL((bdsqr_compute), gridSplits, threadsBS1, 0, + stream, n, nv, nu, nc, D, strideD, E, strideE, + (W1) nullptr, shiftV, ldv, strideV, (W2) nullptr, + shiftU, ldu, strideU, (W3) nullptr, shiftC, ldc, + strideC, maxiter, eps, sfm, tol, minshift, + splits_map, work, incW, strideW, completed); + + // update singular vectors + ROCSOLVER_LAUNCH_KERNEL((bdsqr_rotate), gridVUC, threadsVUC, 0, stream, + n, nv, nu, nc, V, shiftV, ldv, strideV, U, shiftU, + ldu, strideU, C, shiftC, ldc, strideC, maxiter, + splits_map, work, incW, 
strideW, completed); + } + + // update split block endpoints + ROCSOLVER_LAUNCH_KERNEL((bdsqr_update_endpoints), gridSplits, threadsBasic, + 0, stream, n, E, strideE, splits_map, work, strideW, + completed); } - // update split block endpoints - ROCSOLVER_LAUNCH_KERNEL((bdsqr_update_endpoints), gridSplits, threadsBasic, 0, - stream, n, E, strideE, splits_map, work, strideW, completed); + // check for completion + h_iter += BDSQR_ITERS_PER_SYNC; + ROCSOLVER_LAUNCH_KERNEL((bdsqr_chk_completed), gridBasic, threadsBasic, 0, + stream, n, maxiter, splits_map, work, strideW, completed); } - - // check for completion - h_iter += BDSQR_ITERS_PER_SYNC; - ROCSOLVER_LAUNCH_KERNEL((bdsqr_chk_completed), gridBasic, threadsBasic, 0, stream, n, - maxiter, splits_map, work, strideW, completed); } } diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp new file mode 100644 index 000000000..69584cc01 --- /dev/null +++ b/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp @@ -0,0 +1,1662 @@ + +/**************************************************************************** + * Derived from the BSD3-licensed + * LAPACK routine (version 3.7.1) -- + * Univ. of Tennessee, Univ. of California Berkeley, + * Univ. of Colorado Denver and NAG Ltd.. + * June 2017 + * Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * *************************************************************************/ + +#pragma once + +#include "lapack_host_functions.hpp" +#include "rocauxiliary_lasr.hpp" + +ROCSOLVER_BEGIN_NAMESPACE + +/************************************************************************************/ +/***************** Kernel launchers *************************************************/ +/************************************************************************************/ + +template +static void swap_template(I const n, T* x, I const incx, T* y, I const incy, hipStream_t stream) +{ + auto nthreads = warpSize * 2; + auto nblocks = (n - 1) / nthreads + 1; + + hipLaunchKernelGGL((swap_kernel), dim3(nblocks, 1, 1), dim3(nthreads, 1, 1), 0, stream, + n, x, incx, y, incy); +} + +template +static void + rot_template(I const n, T* x, I const incx, T* y, I const incy, S const c, S const s, hipStream_t stream) +{ + auto nthreads = warpSize * 2; + auto nblocks = (n - 1) / nthreads + 1; + + hipLaunchKernelGGL((rot_kernel), dim3(nblocks, 1, 1), dim3(nthreads, 1, 1), 0, stream, + n, x, incx, y, incy, c, s); +} + +template +static void scal_template(I const n, S const da, T* const x, I const incx, hipStream_t stream) +{ + 
auto nthreads = warpSize * 2; + auto nblocks = (n - 1) / nthreads + 1; + + hipLaunchKernelGGL((scal_kernel), dim3(nblocks, 1, 1), dim3(nthreads, 1, 1), 0, stream, + n, da, x, incx); +} + +/** Call to lasr functionality. + lasr_body can be executed as a host or device function **/ +template +static void call_lasr(rocblas_side& side, + rocblas_pivot& pivot, + rocblas_direct& direct, + I& m, + I& n, + S& c, + S& s, + T& A, + I& lda) +{ + I const tid = 0; + I const i_inc = 1; + + lasr_body(side, pivot, direct, m, n, &c, &s, &A, lda, tid, i_inc); +} + +/************************************************************************************/ +/***************** Main template functions ******************************************/ +/************************************************************************************/ + +template +static void bdsqr_single_template(rocblas_handle handle, + char uplo, + I n, + I ncvt, + I nru, + I ncc, + S* d_, + S* e_, + T* vt_, + I ldvt, + T* u_, + I ldu, + T* c_, + I ldc, + S* work_, + I& info, + S* dwork_ = nullptr, + hipStream_t stream = 0) +{ + // ------------------------------------- + // Lambda expressions used as helpers + // ------------------------------------- + auto call_swap_gpu = [=](I n, T& x, I incx, T& y, I incy) { + swap_template(n, &x, incx, &y, incy, stream); + }; + + auto call_rot_gpu = [=](I n, T& x, I incx, T& y, I incy, S cosl, S sinl) { + rot_template(n, &x, incx, &y, incy, cosl, sinl, stream); + }; + + auto call_scal_gpu + = [=](I n, auto da, T& x, I incx) { scal_template(n, da, &x, incx, stream); }; + + auto call_lasr_gpu_nocopy + = [=](rocblas_side const side, rocblas_pivot const pivot, rocblas_direct const direct, + I const m, I const n, S& dc, S& ds, T& A, I const lda, hipStream_t stream) { + bool const is_left_side = (side == rocblas_side_left); + auto const mn = (is_left_side) ? 
m : n; + auto const mn_m1 = (mn - 1); + + rocsolver_lasr_template(handle, side, pivot, direct, m, n, &dc, 0, &ds, 0, &A, + 0, lda, 0, I(1)); + }; + + auto call_lasr_gpu = [=](rocblas_side const side, rocblas_pivot const pivot, + rocblas_direct const direct, I const m, I const n, S& c, S& s, T& A, + I const lda, S* const dwork_, hipStream_t stream) { + bool const is_left_side = (side == rocblas_side_left); + auto const mn = (is_left_side) ? m : n; + auto const mn_m1 = (mn - 1); + S* const dc = dwork_; + S* const ds = dwork_ + mn_m1; + CHECK_HIP(hipStreamSynchronize(stream)); + + CHECK_HIP(hipMemcpyAsync(dc, &c, sizeof(S) * mn_m1, hipMemcpyHostToDevice, stream)); + CHECK_HIP(hipMemcpyAsync(ds, &s, sizeof(S) * mn_m1, hipMemcpyHostToDevice, stream)); + + rocsolver_lasr_template(handle, side, pivot, direct, m, n, dc, 0, ds, 0, &A, 0, lda, + 0, I(1)); + CHECK_HIP(hipStreamSynchronize(stream)); + }; + + auto d = [=](auto i) -> S& { + assert((1 <= i) && (i <= n)); + return (d_[i - 1]); + }; + + auto e = [=](auto i) -> S& { + assert((1 <= i) && (i <= (n - 1))); + return (e_[i - 1]); + }; + + auto work = [=](auto i) -> S& { return (work_[i - 1]); }; + + auto dwork = [=](auto i) -> S& { return (dwork_[i - 1]); }; + + auto c = [=](auto i, auto j) -> T& { + assert((1 <= i) && (i <= nrc) && (nrc <= ldc)); + assert((1 <= j) && (j <= ncc)); + return (c_[idx2D(i - 1, j - 1, ldc)]); + }; + + auto u = [=](auto i, auto j) -> T& { + assert((1 <= i) && (i <= nru) && (nru <= ldu)); + assert((1 <= j) && (j <= ncu)); + return (u_[idx2D(i - 1, j - 1, ldu)]); + }; + + auto vt = [=](auto i, auto j) -> T& { + assert((1 <= i) && (i <= nrvt) && (nrvt <= ldvt)); + assert((1 <= j) && (j <= ncvt)); + return (vt_[idx2D(i - 1, j - 1, ldvt)]); + }; + + auto sign = [](auto a, auto b) { + auto const abs_a = std::abs(a); + return ((b >= 0) ? 
abs_a : -abs_a); + }; + + auto dble = [](auto x) { return (static_cast(x)); }; + // ------------------------------- + + // ---------------- + // Initialization + // ---------------- + bool const use_gpu = (dwork_ != nullptr); + + // Lapack code used O(n^2) algorithm for sorting + // Consider turning off this and rely on + // bdsqr_sort() to perform sorting + bool constexpr need_sort = false; + + S const zero = 0; + S const one = 1; + S negone = -1; + S const hndrd = 100; + S const hndrth = one / hndrd; + S const ten = 10; + S const eight = 8; + S const meight = -one / eight; + I const maxitr = 6; + I ione = 1; + + bool const lower = (uplo == 'L') || (uplo == 'l'); + bool const upper = (uplo == 'U') || (uplo == 'u'); + + //rotate is true if any singular vectors desired, false otherwise + bool const rotate = (ncvt > 0) || (nru > 0) || (ncc > 0); + + I i = 0, idir = 0, isub = 0, iter = 0, iterdivn = 0, j = 0, ll = 0, lll = 0, m = 0, + maxitdivn = 0, nm1 = 0, nm12 = 0, nm13 = 0, oldll = 0, oldm = 0; + + I const nrc = n; // number of rows in C matrix + I const nrvt = n; // number of rows in VT matrix + I const ncu = n; // number of columns in U matrix + + S abse = 0, abss = 0, cosl = 0, cosr = 0, cs = 0, eps = 0, f = 0, g = 0, h = 0, mu = 0, + oldcs = 0, oldsn = 0, r = 0, shift = 0, sigmn = 0, sigmx = 0, sinl = 0, sinr = 0, sll = 0, + smax = 0, smin = 0, sminl = 0, sminoa = 0, sn = 0, thresh = 0, tol = 0, tolmul = 0, unfl = 0; + + bool const need_update_singular_vectors = (nru > 0) || (ncc > 0); + bool constexpr use_lasr_gpu_nocopy = false; + + nm1 = n - 1; + nm12 = nm1 + nm1; + nm13 = nm12 + nm1; + idir = 0; + + //get machine constants + { + char cmach_eps = 'E'; + char cmach_unfl = 'S'; + call_lamch(cmach_eps, eps); + call_lamch(cmach_unfl, unfl); + } + + // ----------------------------------------- + // rotate to upper bidiagonal if necesarry + // ----------------------------------------- + if(lower) + { + for(i = 1; i <= (n - 1); i++) + { + call_lartg(d(i), e(i), 
cs, sn, r); + d(i) = r; + e(i) = sn * d(i + 1); + d(i + 1) = cs * d(i + 1); + work(i) = cs; + work(nm1 + i) = sn; + } + + // update singular vectors if desired + if(use_lasr_gpu_nocopy) + { + CHECK_HIP(hipStreamSynchronize(stream)); + + if(need_update_singular_vectors) + { + // copy rotations + size_t const nbytes = sizeof(S) * (n - 1); + hipMemcpyKind const kind = hipMemcpyHostToDevice; + + { + void* const src = (void*)&(work(1)); + void* const dst = (void*)&(dwork(1)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(n)); + void* const dst = (void*)&(dwork(n)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + } + CHECK_HIP(hipStreamSynchronize(stream)); + } + + if(nru > 0) + { + // call_lasr( 'r', 'v', 'f', nru, n, work( 1 ), work( n ), u, ldu ); + rocblas_side side = rocblas_side_right; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_forward_direction; + if(use_gpu) + { + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, nru, n, dwork(1), dwork(n), u(1, 1), + ldu, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, nru, n, work(1), work(n), u(1, 1), ldu, + dwork_, stream); + } + } + else + { + call_lasr(side, pivot, direct, nru, n, work(1), work(n), u(1, 1), ldu); + } + } + if(ncc > 0) + { + // call_lasr( 'l', 'v', 'f', n, ncc, work( 1 ), work( n ), c, ldc ); + rocblas_side side = rocblas_side_left; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_forward_direction; + if(use_gpu) + { + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, n, ncc, dwork(1), dwork(n), c(1, 1), + ldc, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, n, ncc, work(1), work(n), c(1, 1), ldc, + dwork_, stream); + } + } + else + { + call_lasr(side, pivot, direct, n, ncc, work(1), work(n), c(1, 1), ldc); + } + } + } + + // ------------------------------------------------------------- + // 
Compute singular values and vector to relative accuracy tol + // ------------------------------------------------------------- + tolmul = std::max(ten, std::min(hndrd, pow(eps, meight))); + tol = tolmul * eps; + + // compute approximate maximum, minimum singular values + smax = zero; + for(i = 1; i <= n; i++) + { + smax = std::max(smax, std::abs(d(i))); + } +L20: + for(i = 1; i <= (n - 1); i++) + { + smax = std::max(smax, std::abs(e(i))); + } + + // compute tolerance +L30: + sminl = zero; + if(tol >= zero) + { + // relative accuracy desired + sminoa = std::abs(d(1)); + if(sminoa == zero) + goto L50; + mu = sminoa; + for(i = 2; i <= n; i++) + { + mu = std::abs(d(i)) * (mu / (mu + std::abs(e(i - 1)))); + sminoa = std::min(sminoa, mu); + if(sminoa == zero) + goto L50; + } + L40: + L50: + sminoa = sminoa / std::sqrt(dble(n)); + thresh = std::max(tol * sminoa, ((unfl * n) * n) * maxitr); + } + else + { + //absolute accuracy desired + thresh = std::max(std::abs(tol) * smax, ((unfl * n) * n) * maxitr); + } + + /** prepare for main iteration loop for the singular values + (maxit is the maximum number of passes through the inner + loop permitted before nonconvergence signalled.) 
**/ + maxitdivn = maxitr * n; + iterdivn = 0; + iter = -1; + oldll = -1; + oldm = -1; + m = n; + + /////////////////////////// + /// MAIN ITERATION LOOP /// + /////////////////////////// +L60: + // check for convergence or exceeding iteration count + if(m <= 1) + goto L160; + + if(iter >= n) + { + iter = iter - n; + iterdivn = iterdivn + 1; + if(iterdivn >= maxitdivn) + goto L200; + } + + // find diagonal block of matrix to work on + if(tol < zero && std::abs(d(m)) <= thresh) + d(m) = zero; + + smax = std::abs(d(m)); + smin = smax; + for(lll = 1; lll <= (m - 1); lll++) + { + ll = m - lll; + abss = std::abs(d(ll)); + abse = std::abs(e(ll)); + if(tol < zero && abss <= thresh) + d(ll) = zero; + if(abse <= thresh) + goto L80; + smin = std::min(smin, abss); + smax = std::max(smax, std::max(abss, abse)); + } + +L70: + ll = 0; + goto L90; + +L80: + e(ll) = zero; + // matrix splits since e(ll) = 0 + if(ll == m - 1) + { + // convergence of bottom singular value, return to top of loop + m = m - 1; + goto L60; + } + +L90: + ll = ll + 1; + // e(ll) through e(m-1) are nonzero, e(ll-1) is zero + if(ll == m - 1) + { + // 2 by 2 block, handle separately + call_lasv2(d(m - 1), e(m - 1), d(m), sigmn, sigmx, sinr, cosr, sinl, cosl); + d(m - 1) = sigmx; + e(m - 1) = zero; + d(m) = sigmn; + + // compute singular vectors, if desired + if(ncvt > 0) + { + if(use_gpu) + { + call_rot_gpu(ncvt, vt(m - 1, 1), ldvt, vt(m, 1), ldvt, cosr, sinr); + } + else + { + call_rot(ncvt, vt(m - 1, 1), ldvt, vt(m, 1), ldvt, cosr, sinr); + } + } + if(nru > 0) + { + if(use_gpu) + { + call_rot_gpu(nru, u(1, m - 1), ione, u(1, m), ione, cosl, sinl); + } + else + { + call_rot(nru, u(1, m - 1), ione, u(1, m), ione, cosl, sinl); + } + } + if(ncc > 0) + { + if(use_gpu) + { + call_rot_gpu(ncc, c(m - 1, 1), ldc, c(m, 1), ldc, cosl, sinl); + } + else + { + call_rot(ncc, c(m - 1, 1), ldc, c(m, 1), ldc, cosl, sinl); + } + } + m = m - 2; + goto L60; + } + + /** if working on new submatrix, choose shift direction + (from 
larger end diagonal element towards smaller) **/ + if(ll > oldm || m < oldll) + { + if(std::abs(d(ll)) >= std::abs(d(m))) + { + // chase bulge from top (big end) to bottom (small end) + idir = 1; + } + else + { + // chase bulge from bottom (big end) to top (small end) + idir = 2; + } + } + + // apply convergence test + if(idir == 1) + { + // run convergence test in forward direction + // first apply standard test to bottom of matrix + if(std::abs(e(m - 1)) <= std::abs(tol) * std::abs(d(m)) + || (tol < zero && std::abs(e(m - 1)) <= thresh)) + { + e(m - 1) = zero; + goto L60; + } + + if(tol >= zero) + { + // if relative accuracy desired, + // apply convergence criterion forward + mu = std::abs(d(ll)); + sminl = mu; + for(lll = ll; lll <= (m - 1); lll++) + { + if(std::abs(e(lll)) <= tol * mu) + { + e(lll) = zero; + goto L60; + } + mu = std::abs(d(lll + 1)) * (mu / (mu + std::abs(e(lll)))); + sminl = std::min(sminl, mu); + } + } + } + else + { + // run convergence test in backward direction + // first apply standard test to top of matrix + if(std::abs(e(ll)) <= std::abs(tol) * std::abs(d(ll)) + || (tol < zero && std::abs(e(ll)) <= thresh)) + { + e(ll) = zero; + goto L60; + } + + if(tol >= zero) + { + // if relative accuracy desired, + // apply convergence criterion backward + mu = std::abs(d(m)); + sminl = mu; + for(lll = (m - 1); lll >= ll; lll--) + { + if(std::abs(e(lll)) <= tol * mu) + { + e(lll) = zero; + goto L60; + } + mu = std::abs(d(lll)) * (mu / (mu + std::abs(e(lll)))); + sminl = std::min(sminl, mu); + } + } + } + + /** compute shift. 
first, test if shifting would ruin relative + accuracy, and if so set the shift to zero **/ + oldll = ll; + oldm = m; + if(tol >= zero && n * tol * (sminl / smax) <= std::max(eps, hndrth * tol)) + { + //use a zero shift to avoid loss of relative accuracy + shift = zero; + } + else + { + // compute the shift from 2-by-2 block at end of matrix + if(idir == 1) + { + sll = std::abs(d(ll)); + call_las2(d(m - 1), e(m - 1), d(m), shift, r); + } + else + { + sll = std::abs(d(m)); + call_las2(d(ll), e(ll), d(ll + 1), shift, r); + } + // test if shift negligible, and if so set to zero + if(sll > zero) + { + if((shift / sll) * (shift / sll) < eps) + shift = zero; + } + } + + // increment iteration count + iter = iter + m - ll; + + // if shift = 0, do simplified qr iteration + if(shift == zero) + { + if(idir == 1) + { + // chase bulge from top to bottom + // save cosines and sines for later singular vector updates + cs = one; + oldcs = one; + for(i = ll; i <= (m - 1); i++) + { + auto di_cs = d(i) * cs; + call_lartg(di_cs, e(i), cs, sn, r); + if(i > ll) + e(i - 1) = oldsn * r; + auto oldcs_r = oldcs * r; + auto dip1_sn = d(i + 1) * sn; + call_lartg(oldcs_r, dip1_sn, oldcs, oldsn, d(i)); + work(i - ll + 1) = cs; + work(i - ll + 1 + nm1) = sn; + work(i - ll + 1 + nm12) = oldcs; + work(i - ll + 1 + nm13) = oldsn; + } + + L120: + h = d(m) * cs; + d(m) = h * oldcs; + e(m - 1) = h * oldsn; + + // update singular vectors + if(use_lasr_gpu_nocopy) + { + CHECK_HIP(hipStreamSynchronize(stream)); + + if(rotate) + { + // copy rotations + size_t const nbytes = sizeof(S) * (n - 1); + hipMemcpyKind const kind = hipMemcpyHostToDevice; + + if(ncvt > 0) + { + { + void* const src = (void*)&(work(1)); + void* const dst = (void*)&(dwork(1)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(n)); + void* const dst = (void*)&(dwork(n)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + } + + if((nru > 0) || (ncc > 0)) + { + { + 
void* const src = (void*)&(work(nm12)); + void* const dst = (void*)&(dwork(nm12)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(nm13)); + void* const dst = (void*)&(dwork(nm13)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + } + } + + CHECK_HIP(hipStreamSynchronize(stream)); + } + + if(ncvt > 0) + { + // call_lasr( 'l', 'v', 'f', m-ll+1, ncvt, work( 1 ), work( n ), vt(ll, 1 ), ldvt ) + rocblas_side side = rocblas_side_left; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_forward_direction; + auto mm = m - ll + 1; + if(use_gpu) + { + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, mm, ncvt, dwork(1), dwork(n), + vt(ll, 1), ldvt, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, mm, ncvt, work(1), work(n), vt(ll, 1), + ldvt, dwork_, stream); + } + } + else + { + call_lasr(side, pivot, direct, mm, ncvt, work(1), work(n), vt(ll, 1), ldvt); + } + } + if(nru > 0) + { + // call_lasr( 'r', 'v', 'f', nru, m-ll+1, work( nm12+1 ), work( nm13+1 ), u( 1, ll ), ldu ) + rocblas_side side = rocblas_side_right; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_forward_direction; + auto mm = m - ll + 1; + if(use_gpu) + { + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, nru, mm, dwork(nm12 + 1), + dwork(nm13 + 1), u(1, ll), ldu, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, nru, mm, work(nm12 + 1), work(nm13 + 1), + u(1, ll), ldu, dwork_, stream); + } + } + else + { + call_lasr(side, pivot, direct, nru, mm, work(nm12 + 1), work(nm13 + 1), + u(1, ll), ldu); + } + } + if(ncc > 0) + { + // call_lasr( 'l', 'v', 'f', m-ll+1, ncc, work( nm12+1 ), work( nm13+1 ), c( ll, 1 ), ldc ) + rocblas_side side = rocblas_side_left; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_forward_direction; + auto mm = m - ll + 1; + if(use_gpu) + { + 
if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, mm, ncc, dwork(nm12 + 1), + dwork(nm13 + 1), c(ll, 1), ldc, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, mm, ncc, work(nm12 + 1), work(nm13 + 1), + c(ll, 1), ldc, dwork_, stream); + } + } + else + { + call_lasr(side, pivot, direct, mm, ncc, work(nm12 + 1), work(nm13 + 1), + c(ll, 1), ldc); + } + } + + // test convergence + if(std::abs(e(m - 1)) <= thresh) + e(m - 1) = zero; + } + else + { + // chase bulge from bottom to top + // save cosines and sines for later singular vector updates + cs = one; + oldcs = one; + for(i = m; i >= (ll + 1); i--) + { + auto di_cs = d(i) * cs; + call_lartg(di_cs, e(i - 1), cs, sn, r); + + if(i < m) + e(i) = oldsn * r; + + auto oldcs_r = oldcs * r; + auto dim1_sn = d(i - 1) * sn; + call_lartg(oldcs_r, dim1_sn, oldcs, oldsn, d(i)); + + work(i - ll) = cs; + work(i - ll + nm1) = -sn; + work(i - ll + nm12) = oldcs; + work(i - ll + nm13) = -oldsn; + } + + L130: + h = d(ll) * cs; + d(ll) = h * oldcs; + e(ll) = h * oldsn; + + // update singular vectors + if(use_lasr_gpu_nocopy) + { + CHECK_HIP(hipStreamSynchronize(stream)); + + if(rotate) + { + // copy rotations + size_t const nbytes = sizeof(S) * (n - 1); + hipMemcpyKind const kind = hipMemcpyHostToDevice; + + if((nru > 0) || (ncc > 0)) + { + { + void* const src = (void*)&(work(1)); + void* const dst = (void*)&(dwork(1)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(n)); + void* const dst = (void*)&(dwork(n)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + } + + if(ncvt > 0) + { + { + void* const src = (void*)&(work(nm12)); + void* const dst = (void*)&(dwork(nm12)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(nm13)); + void* const dst = (void*)&(dwork(nm13)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + } + } + + 
CHECK_HIP(hipStreamSynchronize(stream)); + } + + if(ncvt > 0) + { + // call_lasr( 'l', 'v', 'b', m-ll+1, ncvt, work( nm12+1 ), work( nm13+1 ), vt( ll, 1 ), ldvt ); + rocblas_side side = rocblas_side_left; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_backward_direction; + auto mm = m - ll + 1; + if(use_gpu) + { + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, mm, ncvt, dwork(nm12 + 1), + dwork(nm13 + 1), vt(ll, 1), ldvt, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, mm, ncvt, work(nm12 + 1), work(nm13 + 1), + vt(ll, 1), ldvt, dwork_, stream); + } + } + else + { + call_lasr(side, pivot, direct, mm, ncvt, work(nm12 + 1), work(nm13 + 1), + vt(ll, 1), ldvt); + } + } + if(nru > 0) + { + // call_lasr( 'r', 'v', 'b', nru, m-ll+1, work( 1 ), work( n ), u( 1, ll ), ldu ) + rocblas_side side = rocblas_side_right; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_backward_direction; + auto mm = m - ll + 1; + if(use_gpu) + { + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, nru, mm, dwork(1), dwork(n), + u(1, ll), ldu, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, nru, mm, work(1), work(n), u(1, ll), ldu, + dwork_, stream); + } + } + else + { + call_lasr(side, pivot, direct, nru, mm, work(1), work(n), u(1, ll), ldu); + } + } + if(ncc > 0) + { + // call_lasr( 'l', 'v', 'b', m-ll+1, ncc, work( 1 ), work( n ), c( ll, 1 ), ldc ) + rocblas_side side = rocblas_side_left; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_backward_direction; + auto mm = m - ll + 1; + if(use_gpu) + { + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, mm, ncc, dwork(1), dwork(n), + c(ll, 1), ldc, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), ldc, + dwork_, stream); + } + } + else + { + call_lasr(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), 
ldc); + } + } + + // test convergence + if(std::abs(e(ll)) <= thresh) + e(ll) = zero; + } + } + + // otherwise use nonzero shift + else + { + if(idir == 1) + { + // chase bulge from top to bottom + // save cosines and sines for later singular vector updates + f = (std::abs(d(ll)) - shift) * (sign(one, d(ll)) + shift / d(ll)); + g = e(ll); + for(i = ll; i <= (m - 1); i++) + { + call_lartg(f, g, cosr, sinr, r); + if(i > ll) + e(i - 1) = r; + f = cosr * d(i) + sinr * e(i); + e(i) = cosr * e(i) - sinr * d(i); + g = sinr * d(i + 1); + d(i + 1) = cosr * d(i + 1); + call_lartg(f, g, cosl, sinl, r); + d(i) = r; + f = cosl * e(i) + sinl * d(i + 1); + d(i + 1) = cosl * d(i + 1) - sinl * e(i); + if(i < m - 1) + { + g = sinl * e(i + 1); + e(i + 1) = cosl * e(i + 1); + } + work(i - ll + 1) = cosr; + work(i - ll + 1 + nm1) = sinr; + work(i - ll + 1 + nm12) = cosl; + work(i - ll + 1 + nm13) = sinl; + } + + L140: + e(m - 1) = f; + + // update singular vectors + if(use_lasr_gpu_nocopy) + { + CHECK_HIP(hipStreamSynchronize(stream)); + + if(rotate) + { + // copy rotations + size_t const nbytes = sizeof(S) * (n - 1); + hipMemcpyKind const kind = hipMemcpyHostToDevice; + + if(ncvt > 0) + { + { + void* const src = (void*)&(work(1)); + void* const dst = (void*)&(dwork(1)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(n)); + void* const dst = (void*)&(dwork(n)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + } + + if((nru > 0) || (ncc > 0)) + { + { + void* const src = (void*)&(work(nm12)); + void* const dst = (void*)&(dwork(nm12)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(nm13)); + void* const dst = (void*)&(dwork(nm13)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + } + } + + CHECK_HIP(hipStreamSynchronize(stream)); + } + + if(ncvt > 0) + { + // call_lasr( 'l', 'v', 'f', m-ll+1, ncvt, work( 1 ), work( n ), vt(ll, 1 ), ldvt ) 
+ rocblas_side side = rocblas_side_left; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_forward_direction; + auto mm = m - ll + 1; + if(use_gpu) + { + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, mm, ncvt, dwork(1), dwork(n), + vt(ll, 1), ldvt, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, mm, ncvt, work(1), work(n), vt(ll, 1), + ldvt, dwork_, stream); + } + } + else + { + call_lasr(side, pivot, direct, mm, ncvt, work(1), work(n), vt(ll, 1), ldvt); + } + } + + if(nru > 0) + { + // call_lasr( 'r', 'v', 'f', nru, m-ll+1, work( nm12+1 ), work( nm13+1 ), u( 1, ll ), ldu ) + rocblas_side side = rocblas_side_right; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_forward_direction; + auto mm = m - ll + 1; + if(use_gpu) + { + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, nru, mm, dwork(nm12 + 1), + dwork(nm13 + 1), u(1, ll), ldu, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, nru, mm, work(nm12 + 1), work(nm13 + 1), + u(1, ll), ldu, dwork_, stream); + } + } + else + { + call_lasr(side, pivot, direct, nru, mm, work(nm12 + 1), work(nm13 + 1), + u(1, ll), ldu); + } + } + if(ncc > 0) + { + // call_lasr( 'l', 'v', 'f', m-ll+1, ncc, work( nm12+1 ), work( nm13+1 ), c( ll, 1 ), ldc ) + rocblas_side side = rocblas_side_left; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_forward_direction; + auto mm = m - ll + 1; + if(use_gpu) + { + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, mm, ncc, dwork(nm12 + 1), + dwork(nm13 + 1), c(ll, 1), ldc, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, mm, ncc, work(nm12 + 1), work(nm13 + 1), + c(ll, 1), ldc, dwork_, stream); + } + } + else + { + call_lasr(side, pivot, direct, mm, ncc, work(nm12 + 1), work(nm13 + 1), + c(ll, 1), ldc); + } + } + + // test convergence + if(std::abs(e(m - 1)) <= thresh) + e(m - 1) = zero; + } + else 
+ { + // chase bulge from bottom to top + // save cosines and sines for later singular vector updates + f = (std::abs(d(m)) - shift) * (sign(one, d(m)) + shift / d(m)); + g = e(m - 1); + for(i = m; i >= (ll + 1); i--) + { + call_lartg(f, g, cosr, sinr, r); + if(i < m) + e(i) = r; + f = cosr * d(i) + sinr * e(i - 1); + e(i - 1) = cosr * e(i - 1) - sinr * d(i); + g = sinr * d(i - 1); + d(i - 1) = cosr * d(i - 1); + call_lartg(f, g, cosl, sinl, r); + d(i) = r; + f = cosl * e(i - 1) + sinl * d(i - 1); + d(i - 1) = cosl * d(i - 1) - sinl * e(i - 1); + if(i > ll + 1) + { + g = sinl * e(i - 2); + e(i - 2) = cosl * e(i - 2); + } + work(i - ll) = cosr; + work(i - ll + nm1) = -sinr; + work(i - ll + nm12) = cosl; + work(i - ll + nm13) = -sinl; + } + + L150: + e(ll) = f; + + // test convergence + if(std::abs(e(ll)) <= thresh) + e(ll) = zero; + + // update singular vectors + if(use_lasr_gpu_nocopy) + { + CHECK_HIP(hipStreamSynchronize(stream)); + + if(rotate) + { + // copy rotations + size_t const nbytes = sizeof(S) * (n - 1); + hipMemcpyKind const kind = hipMemcpyHostToDevice; + + if((nru > 0) || (ncc > 0)) + { + { + void* const src = (void*)&(work(1)); + void* const dst = (void*)&(dwork(1)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(n)); + void* const dst = (void*)&(dwork(n)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + } + + if(ncvt > 0) + { + { + void* const src = (void*)&(work(nm12)); + void* const dst = (void*)&(dwork(nm12)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(nm13)); + void* const dst = (void*)&(dwork(nm13)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + } + } + + CHECK_HIP(hipStreamSynchronize(stream)); + } + + if(ncvt > 0) + { + // call_lasr( 'l', 'v', 'b', m-ll+1, ncvt, work( nm12+1 ), work(nm13+1), vt( ll, 1 ), ldvt ) + rocblas_side side = rocblas_side_left; + rocblas_pivot pivot = 
rocblas_pivot_variable; + rocblas_direct direct = rocblas_backward_direction; + auto mm = m - ll + 1; + if(use_gpu) + { + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, mm, ncvt, dwork(nm12 + 1), + dwork(nm13 + 1), vt(ll, 1), ldvt, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, mm, ncvt, work(nm12 + 1), work(nm13 + 1), + vt(ll, 1), ldvt, dwork_, stream); + } + } + else + { + call_lasr(side, pivot, direct, mm, ncvt, work(nm12 + 1), work(nm13 + 1), + vt(ll, 1), ldvt); + } + } + if(nru > 0) + { + // call_lasr( 'r', 'v', 'b', nru, m-ll+1, work( 1 ), work( n ), u( 1, ll ), ldu ) + rocblas_side side = rocblas_side_right; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_backward_direction; + auto mm = m - ll + 1; + if(use_gpu) + { + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, nru, mm, dwork(1), dwork(n), + u(1, ll), ldu, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, nru, mm, work(1), work(n), u(1, ll), ldu, + dwork_, stream); + } + } + else + { + call_lasr(side, pivot, direct, nru, mm, work(1), work(n), u(1, ll), ldu); + } + } + if(ncc > 0) + { + // call_lasr( 'l', 'v', 'b', m-ll+1, ncc, work( 1 ), work( n ), c( ll, 1 ), ldc ) + rocblas_side side = rocblas_side_left; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_backward_direction; + auto mm = m - ll + 1; + if(use_gpu) + { + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, mm, ncc, dwork(1), dwork(n), + c(ll, 1), ldc, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), ldc, + dwork_, stream); + } + } + else + { + call_lasr(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), ldc); + } + } + } + } + CHECK_HIP(hipStreamSynchronize(stream)); + + // qr iteration finished, go back and check convergence + goto L60; + +L160: + // all singular values converged, so make them positive + for(i = 1; i <= n; i++) 
+ { + if(d(i) < zero) + { + d(i) = -d(i); + if(ncvt > 0) + { + if(use_gpu) + { + call_scal_gpu(ncvt, negone, vt(i, 1), ldvt); + } + else + { + call_scal(ncvt, negone, vt(i, 1), ldvt); + } + } + } + } + +L170: + // sort the singular values into decreasing order (insertion sort on + // singular values, but only one transposition per singular vector) + if(need_sort) + { + for(i = 1; i <= (n - 1); i++) + { + // scan for smallest d(i) + isub = 1; + smin = d(1); + for(j = 2; j <= (n + 1 - i); j++) + { + if(d(j) <= smin) + { + isub = j; + smin = d(j); + } + } + L180: + if(isub != n + 1 - i) + { + // swap singular values and vectors + d(isub) = d(n + 1 - i); + d(n + 1 - i) = smin; + if(ncvt > 0) + { + if(use_gpu) + { + call_swap_gpu(ncvt, vt(isub, 1), ldvt, vt(n + 1 - i, 1), ldvt); + } + else + { + call_swap(ncvt, vt(isub, 1), ldvt, vt(n + 1 - i, 1), ldvt); + } + } + if(nru > 0) + { + if(use_gpu) + { + call_swap_gpu(nru, u(1, isub), ione, u(1, n + 1 - i), ione); + } + else + { + call_swap(nru, u(1, isub), ione, u(1, n + 1 - i), ione); + } + } + if(ncc > 0) + { + if(use_gpu) + { + call_swap_gpu(ncc, c(isub, 1), ldc, c(n + 1 - i, 1), ldc); + } + else + { + call_swap(ncc, c(isub, 1), ldc, c(n + 1 - i, 1), ldc); + } + } + } + } + } + +L190: + goto L220; + +// maximum number of iterations exceeded, failure to converge +L200: + info = 0; + for(i = 1; i <= (n - 1); i++) + { + if(e(i) != zero) + info = info + 1; + } + +L210: +L220: + return; +} + +template +rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, + const rocblas_fill uplo_in, + const I n, + const I nv, + const I nu, + const I nc, + S* D, + const rocblas_stride strideD, + S* E, + const rocblas_stride strideE, + W1 V_arg, + const I shiftV, + const I ldv, + const rocblas_stride strideV, + W2 U_arg, + const I shiftU, + const I ldu, + const rocblas_stride strideU, + W3 C_arg, + const I shiftC, + const I ldc, + const rocblas_stride strideC, + I* info_array, + const I batch_count, + I* splits_map, + S* 
work) +{ + // ------------------------------- + // lambda expression as helper + // ------------------------------- + auto is_device_pointer = [](void* ptr) -> bool { + hipPointerAttribute_t dev_attributes; + if(ptr == nullptr) + { + return (false); + } + + auto istat = hipPointerGetAttributes(&dev_attributes, ptr); + if(istat != hipSuccess) + { + std::cout << "is_device_pointer: istat = " << istat << " " << hipGetErrorName(istat) + << std::endl; + } + assert(istat == hipSuccess); + return (dev_attributes.type == hipMemoryTypeDevice); + }; + + // ------------------------- + // copy D into hD, E into hE + // ------------------------- + hipStream_t stream; + rocblas_get_stream(handle, &stream); + + W1 V = V_arg; + W2 U = U_arg; + W3 C = C_arg; + + // handle batch case with array of pointers on device + std::vector Vp_array(batch_count); + std::vector Up_array(batch_count); + std::vector Cp_array(batch_count); + + if(nv > 0) + { + bool const is_device_V_arg = is_device_pointer((void*)V_arg); + if(is_device_V_arg) + { + // note "T *" and "T * const" may be considered different types + bool constexpr is_array_of_device_pointers + = !(std::is_same::value || std::is_same::value); + bool constexpr need_copy_W1 = is_array_of_device_pointers; + if constexpr(need_copy_W1) + { + size_t const nbytes = sizeof(T*) * batch_count; + void* const dst = (void*)&(Vp_array[0]); + void* const src = (void*)V_arg; + HIP_CHECK(hipMemcpyAsync(dst, src, nbytes, hipMemcpyDefault, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + V = &(Vp_array[0]); + } + } + } + + if(nu > 0) + { + bool const is_device_U_arg = is_device_pointer((void*)U_arg); + if(is_device_U_arg) + { + bool constexpr is_array_of_device_pointers + = !(std::is_same::value || std::is_same::value); + bool constexpr need_copy_W2 = is_array_of_device_pointers; + if constexpr(need_copy_W2) + { + size_t const nbytes = sizeof(T*) * batch_count; + void* const dst = (void*)&(Up_array[0]); + void* const src = (void*)U_arg; + 
HIP_CHECK(hipMemcpyAsync(dst, src, nbytes, hipMemcpyDefault, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + U = &(Up_array[0]); + } + } + } + + if(nc > 0) + { + bool const is_device_C_arg = is_device_pointer((void*)C_arg); + if(is_device_C_arg) + { + bool constexpr is_array_of_device_pointers + = !(std::is_same::value || std::is_same::value); + bool constexpr need_copy_W3 = is_array_of_device_pointers; + if constexpr(need_copy_W3) + { + size_t const nbytes = sizeof(T*) * batch_count; + void* const dst = (void*)&(Cp_array[0]); + void* const src = (void*)C_arg; + HIP_CHECK(hipMemcpyAsync(dst, src, nbytes, hipMemcpyDefault, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + C = &(Cp_array[0]); + } + } + } + + S* hD = nullptr; + S* hE = nullptr; + + size_t const E_size = (n - 1); + HIP_CHECK(hipHostMalloc(&hD, (sizeof(S) * std::max(1, batch_count)) * n)); + HIP_CHECK(hipHostMalloc(&hE, (sizeof(S) * std::max(1, batch_count)) * E_size)); + + // ---------------------------------------------------- + // copy info_array[] on device to linfo_array[] on host + // ---------------------------------------------------- + I* linfo_array = nullptr; + HIP_CHECK(hipHostMalloc(&linfo_array, sizeof(I) * std::max(1, batch_count))); + { + void* const dst = &(linfo_array[0]); + void* const src = &(info_array[0]); + size_t const nbytes = sizeof(I) * batch_count; + hipMemcpyKind const kind = hipMemcpyDeviceToHost; + + HIP_CHECK(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + S* hwork = nullptr; + HIP_CHECK(hipHostMalloc(&hwork, sizeof(S) * (4 * n))); + + // ------------------------------------------------- + // transfer arrays D(:) and E(:) from Device to Host + // ------------------------------------------------- + bool const use_single_copy_for_D = (batch_count == 1) || (strideD == n); + if(use_single_copy_for_D) + { + void* const dst = (void*)&(hD[0]); + void* const src = (void*)&(D[0]); + size_t const sizeBytes = sizeof(S) * n * batch_count; + hipMemcpyKind const 
kind = hipMemcpyDeviceToHost; + + HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); + } + else + { + for(I bid = 0; bid < batch_count; bid++) + { + void* const dst = (void*)&(hD[bid * n]); + void* const src = (void*)&(D[bid * strideD]); + size_t const sizeBytes = sizeof(S) * n; + hipMemcpyKind const kind = hipMemcpyDeviceToHost; + HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); + } + } + + { + for(I bid = 0; bid < batch_count; bid++) + { + void* const dst = (void*)&(hE[bid * E_size]); + void* const src = (void*)&(E[bid * strideE]); + size_t const sizeBytes = sizeof(S) * E_size; + hipMemcpyKind const kind = hipMemcpyDeviceToHost; + HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); + } + } + + // ------------------------------------------------- + // Execute for each instance in the batch + // ------------------------------------------------- + HIP_CHECK(hipStreamSynchronize(stream)); + + S* dwork_ = nullptr; + HIP_CHECK(hipMalloc(&dwork_, sizeof(S) * (4 * n))); + + for(I bid = 0; bid < batch_count; bid++) + { + if(linfo_array[bid] != 0) + { + continue; + } + + char uplo = (uplo_in == rocblas_fill_lower) ? 'L' : 'U'; + S* d_ = &(hD[bid * n]); + S* e_ = &(hE[bid * E_size]); + + T* v_ = (nv > 0) ? load_ptr_batch(V, bid, shiftV, strideV) : nullptr; + T* u_ = (nu > 0) ? load_ptr_batch(U, bid, shiftU, strideU) : nullptr; + T* c_ = (nc > 0) ? 
load_ptr_batch(C, bid, shiftC, strideC) : nullptr; + S* work_ = &(hwork[0]); + + I info = 0; + + I nru = nu; + I ncc = nc; + + // NOTE: lapack dbdsqr() accepts "VT" and "ldvt" for transpose of V + // as input variable however, rocsolver bdsqr() accepts variable called "V" and "ldv" + // but it is actually holding "VT" + T* vt_ = v_; + I ldvt = ldv; + + I nrv = n; + I ncvt = nv; + bool const values_only = (ncvt == 0) && (nru == 0) && (ncc == 0); + + bdsqr_single_template(handle, uplo, n, ncvt, nru, ncc, d_, e_, vt_, ldvt, u_, ldu, + c_, ldc, work_, info, dwork_, stream); + + if(info == 0) + { + // explicitly zero out "E" array + // to be compatible with rocsolver bdsqr + S const zero = S(0); + for(I i = 0; i < (n - 1); i++) + { + e_[i] = zero; + } + } + + if(linfo_array[bid] == 0) + { + linfo_array[bid] = info; + } + } // end for bid + + // ------------------------------------------------- + // transfer arrays D(:) and E(:) from host to device + // ------------------------------------------------- + if(use_single_copy_for_D) + { + void* const src = (void*)&(hD[0]); + void* const dst = (void*)D; + size_t const sizeBytes = sizeof(S) * n * batch_count; + hipMemcpyKind const kind = hipMemcpyHostToDevice; + + HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); + } + else + { + for(I bid = 0; bid < batch_count; bid++) + { + void* const src = (void*)&(hD[bid * n]); + void* const dst = (void*)(D + bid * strideD); + size_t const sizeBytes = sizeof(S) * n; + hipMemcpyKind const kind = hipMemcpyHostToDevice; + HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); + } + } + + { + for(I bid = 0; bid < batch_count; bid++) + { + void* const src = (void*)&(hE[bid * E_size]); + void* const dst = (void*)&(E[bid * strideE]); + size_t const sizeBytes = sizeof(S) * E_size; + hipMemcpyKind const kind = hipMemcpyHostToDevice; + HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); + } + } + + // ------------------------------------------------------ + // copy 
linfo_array[] from host to info_array[] on device + // ------------------------------------------------------ + { + void* const src = (void*)&(linfo_array[0]); + void* const dst = (void*)&(info_array[0]); + size_t const nbytes = sizeof(I) * batch_count; + hipMemcpyKind const kind = hipMemcpyHostToDevice; + + HIP_CHECK(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + HIP_CHECK(hipStreamSynchronize(stream)); + + // ---------------------- + // free allocated storage + // ---------------------- + HIP_CHECK(hipHostFree(hwork)); + hwork = nullptr; + HIP_CHECK(hipHostFree(hD)); + hD = nullptr; + HIP_CHECK(hipHostFree(hE)); + hE = nullptr; + HIP_CHECK(hipFree(dwork_)); + dwork_ = nullptr; + HIP_CHECK(hipHostFree(linfo_array)); + linfo_array = nullptr; + + return (rocblas_status_success); +} + +ROCSOLVER_END_NAMESPACE +#undef LASR_MAX_NTHREADS +#undef CHECK_HIP diff --git a/library/src/common/rocsolver_handle.cpp b/library/src/common/rocsolver_handle.cpp new file mode 100644 index 000000000..949d60fd2 --- /dev/null +++ b/library/src/common/rocsolver_handle.cpp @@ -0,0 +1,132 @@ +/* ************************************************************************** + * Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * *************************************************************************/ + +#include "rocsolver_handle.hpp" +#include "rocblas.hpp" + +#include + +ROCSOLVER_BEGIN_NAMESPACE + +rocblas_status rocsolver_set_alg_mode_impl(rocblas_handle handle, + const rocsolver_function func, + const rocsolver_alg_mode mode) +{ + if(!handle) + return rocblas_status_invalid_handle; + + std::shared_ptr handle_ptr; + ROCBLAS_CHECK(rocblas_internal_get_data_ptr(handle, handle_ptr)); + rocsolver_handle_data handle_data = (rocsolver_handle_data)handle_ptr.get(); + + if(handle_data == nullptr) + { + handle_ptr = std::make_shared(); + handle_data = (rocsolver_handle_data)handle_ptr.get(); + handle_data->checksum = sizeof(rocsolver_handle_data_); + + ROCBLAS_CHECK(rocblas_internal_set_data_ptr(handle, handle_ptr)); + } + else + { + if(handle_data->checksum != sizeof(rocsolver_handle_data_)) + return rocblas_status_internal_error; + } + + switch(func) + { + case rocsolver_function_gesvd: + case rocsolver_function_bdsqr: + if(mode == rocsolver_alg_mode_gpu || mode == rocsolver_alg_mode_hybrid) + { + handle_data->bdsqr_mode = mode; + return rocblas_status_success; + } + } + + return rocblas_status_invalid_value; +} + 
+rocblas_status rocsolver_get_alg_mode_impl(rocblas_handle handle, + const rocsolver_function func, + rocsolver_alg_mode* mode) +{ + if(!handle) + return rocblas_status_invalid_handle; + + std::shared_ptr handle_ptr; + ROCBLAS_CHECK(rocblas_internal_get_data_ptr(handle, handle_ptr)); + rocsolver_handle_data handle_data = (rocsolver_handle_data)handle_ptr.get(); + + if(handle_data == nullptr) + { + *mode = rocsolver_alg_mode_gpu; + } + else + { + if(handle_data->checksum != sizeof(rocsolver_handle_data_)) + return rocblas_status_internal_error; + + switch(func) + { + case rocsolver_function_gesvd: + case rocsolver_function_bdsqr: *mode = handle_data->bdsqr_mode; break; + default: return rocblas_status_invalid_value; + } + } + + return rocblas_status_success; +} + +ROCSOLVER_END_NAMESPACE + +extern "C" { + +rocblas_status rocsolver_set_alg_mode(rocblas_handle handle, + const rocsolver_function func, + const rocsolver_alg_mode mode) +try +{ + return rocsolver::rocsolver_set_alg_mode_impl(handle, func, mode); +} +catch(...) +{ + return rocsolver::exception_to_rocblas_status(); +} + +rocblas_status rocsolver_get_alg_mode(rocblas_handle handle, + const rocsolver_function func, + rocsolver_alg_mode* mode) +try +{ + return rocsolver::rocsolver_get_alg_mode_impl(handle, func, mode); +} +catch(...) +{ + return rocsolver::exception_to_rocblas_status(); +} +} diff --git a/library/src/include/lapack_device_functions.hpp b/library/src/include/lapack_device_functions.hpp index fe0979246..6e4760c44 100644 --- a/library/src/include/lapack_device_functions.hpp +++ b/library/src/include/lapack_device_functions.hpp @@ -1111,4 +1111,81 @@ ROCSOLVER_KERNEL void axpy_kernel(const rocblas_int n, } } +/** ROT applies a Givens rotation between to vector x y of dimension n. + Launch this kernel with a desired number of threads organized in + NG groups in the x direction with NT threads in the x direction. 
**/ +template +ROCSOLVER_KERNEL void + rot_kernel(I const n, T* const x, I const incx, T* const y, I const incy, S const c, S const s) +{ + if(n <= 0) + return; + + I const i_start = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + I const i_inc = hipBlockDim_x * hipGridDim_x; + + if((incx == 1) && (incy == 1)) + { + // ------------ + // special case + // ------------ + for(I i = i_start; i < n; i += i_inc) + { + auto const temp = c * x[i] + s * y[i]; + y[i] = c * y[i] - s * x[i]; + x[i] = temp; + } + } + else + { + // --------------------------- + // code for unequal increments + // --------------------------- + + for(auto i = i_start; i < n; i += i_inc) + { + auto const ix = i * static_cast(incx); + auto const iy = i * static_cast(incy); + auto const temp = c * x[ix] + s * y[iy]; + y[iy] = c * y[iy] - s * x[ix]; + x[ix] = temp; + } + } +} + +/** SCAL scales a vector x of dimension n by a factor da. + Launch this kernel with a desired number of threads organized in + NG groups in the x direction with NT threads in the x direction. 
**/ +template +ROCSOLVER_KERNEL void scal_kernel(I const n, S const da, T* const x, I const incx) +{ + if(n <= 0) + return; + + I const i_start = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + I const i_inc = hipBlockDim_x * hipGridDim_x; + + S const zero = 0; + bool const is_da_zero = (da == zero); + if(incx == 1) + { + for(I i = i_start; i < n; i += i_inc) + { + x[i] = da * x[i]; + } + } + else + { + // --------------------------- + // code for non-unit increments + // --------------------------- + + for(I i = i_start; i < n; i += i_inc) + { + auto const ix = i * static_cast(incx); + x[ix] = da * x[ix]; + } + } +} + ROCSOLVER_END_NAMESPACE diff --git a/library/src/include/lapack_host_functions.hpp b/library/src/include/lapack_host_functions.hpp new file mode 100644 index 000000000..c120151ce --- /dev/null +++ b/library/src/include/lapack_host_functions.hpp @@ -0,0 +1,600 @@ +/* ************************************************************************** + * Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * *************************************************************************/ + +#pragma once + +#include "lib_host_helpers.hpp" +#include "lib_macros.hpp" +#include "rocsolver/rocsolver.h" + +ROCSOLVER_BEGIN_NAMESPACE + +/* + * =========================================================================== + * common location for host functions that reproduce LAPACK + * and BLAS functionality. + * =========================================================================== + */ + +static void call_lamch(char& cmach, float& eps) +{ + switch(cmach) + { + case 'E': + case 'e': eps = std::numeric_limits::epsilon(); return; + case 'S': + case 's': eps = std::numeric_limits::min(); return; + case 'B': + case 'b': eps = FLT_RADIX; return; + default: eps = std::numeric_limits::min(); + } +} + +static void call_lamch(char& cmach, double& eps) +{ + switch(cmach) + { + case 'E': + case 'e': eps = std::numeric_limits::epsilon(); return; + case 'S': + case 's': eps = std::numeric_limits::min(); return; + case 'B': + case 'b': eps = FLT_RADIX; return; + default: eps = std::numeric_limits::min(); + } +} + +template +static void call_las2(T& f, T& g, T& h, T& ssmin, T& ssmax) +{ + T const zero = 0; + T const one = 1; + T const two = 2; + + T as, at, au, c, fa, fhmn, fhmx, ga, ha; + + auto square = [](auto x) { return (x * x); }; + + fa = std::abs(f); + ga = std::abs(g); + ha = std::abs(h); + fhmn = std::min(fa, ha); + fhmx = std::max(fa, ha); + if(fhmn == zero) 
+ { + ssmin = zero; + if(fhmx == zero) + { + ssmax = ga; + } + else + { + // ssmax = max( fhmx, ga )*sqrt( one+ ( min( fhmx, ga ) / max( fhmx, ga ) )**2 ); + ssmax = std::max(fhmx, ga) + * std::sqrt(one + square(std::min(fhmx, ga) / std::max(fhmx, ga))); + } + } + else + { + if(ga < fhmx) + { + as = one + fhmn / fhmx; + at = (fhmx - fhmn) / fhmx; + au = square(ga / fhmx); + c = two / (std::sqrt(as * as + au) + std::sqrt(at * at + au)); + ssmin = fhmn * c; + ssmax = fhmx / c; + } + else + { + au = fhmx / ga; + if(au == zero) + { + // + // avoid possible harmful underflow if exponent range + // asymmetric (true ssmin may not underflow even if + // au underflows) + // + ssmin = (fhmn * fhmx) / ga; + ssmax = ga; + } + else + { + as = one + fhmn / fhmx; + at = (fhmx - fhmn) / fhmx; + // c = one / ( sqrt( one+( as*au )**2 )+ sqrt( one+( at*au )**2 ) ); + c = one / (std::sqrt(one + square(as * au)) + std::sqrt(one + square(at * au))); + ssmin = (fhmn * c) * au; + ssmin = ssmin + ssmin; + ssmax = ga / (c + c); + } + } + } +} + +template +static void call_lartg(T& f, T& g, S& cs, T& sn, T& r) +{ + // ------------------------------------------------------ + // lartg generates a plane rotation so that + // [ cs sn ] * [ f ] = [ r ] + // [ -sn cs ] [ g ] [ 0 ] + // + // where cs * cs + abs(sn)*abs(sn) == 1 + // if g == 0, then cs == 1, sn == 0 + // if f == 0, then cs = 0, sn is chosen so that r is real + // ------------------------------------------------------ + + auto dble = [](auto z) { return (static_cast(real_part(z))); }; + auto dimag = [](auto z) { return (static_cast(imag_part(z))); }; + auto disnan = [](auto x) -> bool { return (isnan(x)); }; + auto dcmplx = [](auto x, auto y) -> T { + bool constexpr is_complex_type = rocblas_is_complex; + + if constexpr(is_complex_type) + { + return (T(x, y)); + } + else + { + return (T(x)); + }; + }; + auto dconjg = [&](auto z) { return (dcmplx(dble(z), -dimag(z))); }; + + auto square = [](auto x) { return (x * x); }; + + auto abs1 
= [&](auto ff) { return (std::max(std::abs(dble(ff)), std::abs(dimag(ff)))); }; + auto abssq = [&](auto ff) { return (square(dble(ff)) + square(dimag(ff))); }; + + // ----------------------------------------- + // compute sqrt( x * x + y * y ) + // without unnecessary overflow or underflow + // ----------------------------------------- + auto dlapy2 = [&](auto x, auto y) { + auto const one = 1; + auto const zero = 0; + + auto ddlapy2 = x; + bool const x_is_nan = disnan(x); + bool const y_is_nan = disnan(y); + if(x_is_nan) + ddlapy2 = x; + if(y_is_nan) + ddlapy2 = y; + + if(!(x_is_nan || y_is_nan)) + { + auto const xabs = std::abs(x); + auto const yabs = std::abs(y); + auto const w = std::max(xabs, yabs); + auto const z = std::min(xabs, yabs); + if(z == zero) + { + ddlapy2 = w; + } + else + { + ddlapy2 = w * std::sqrt(one + square(z / w)); + } + } + return (ddlapy2); + }; + + char cmach = 'E'; + S const zero = 0; + S const one = 1; + S const two = 2; + T const czero = 0; + + bool has_work; + bool first; + int count, i; + S d, di, dr, eps, f2, f2s, g2, g2s, safmin; + S safmn2, safmx2, scale; + T ff, fs, gs; + + // safmin = dlamch( 's' ) + cmach = 'S'; + call_lamch(cmach, safmin); + + // eps = dlamch( 'e' ) + cmach = 'E'; + call_lamch(cmach, eps); + + // safmn2 = dlamch( 'b' )**int( log( safmin / eps ) / log( dlamch( 'b' ) ) / two ) + cmach = 'B'; + S radix = 2; + call_lamch(cmach, radix); + + int const npow = (std::log(safmin / eps) / std::log(radix) / two); + safmn2 = std::pow(radix, npow); + safmx2 = one / safmn2; + scale = std::max(abs1(f), abs1(g)); + fs = f; + gs = g; + count = 0; + + if(scale >= safmx2) + { + do + { + count = count + 1; + fs = fs * safmn2; + gs = gs * safmn2; + scale = scale * safmn2; + has_work = ((scale >= safmx2) && (count < 20)); + } while(has_work); + } + else + { + if(scale <= safmn2) + { + if((g == czero) || disnan(std::abs(g))) + { + cs = one; + sn = czero; + r = f; + return; + } + do + { + count = count - 1; + fs = fs * safmx2; + gs = 
gs * safmx2; + scale = scale * safmx2; + has_work = (scale <= safmn2); + } while(has_work); + } + f2 = abssq(fs); + g2 = abssq(gs); + if(f2 <= std::max(g2, one) * safmin) + { + // + // this is a rare case: f is very small. + // + if(f == czero) + { + cs = zero; + r = dlapy2(dble(g), dimag(g)); + // do complex/real division explicitly with two real divisions + d = dlapy2(dble(gs), dimag(gs)); + sn = dcmplx(dble(gs) / d, -dimag(gs) / d); + return; + } + f2s = dlapy2(dble(fs), dimag(fs)); + // g2 and g2s are accurate + // g2 is at least safmin, and g2s is at least safmn2 + g2s = std::sqrt(g2); + // error in cs from underflow in f2s is at most + // unfl / safmn2 < sqrt(unfl*eps) .lt. eps + // if max(g2,one)=g2, then f2 < g2*safmin, + // and so cs < sqrt(safmin) + // if max(g2,one)=one, then f2 < safmin + // and so cs < sqrt(safmin)/safmn2 = sqrt(eps) + // therefore, cs = f2s/g2s / sqrt( 1 + (f2s/g2s)**2 ) = f2s/g2s + cs = f2s / g2s; + // make sure abs(ff) = 1 + // do complex/real division explicitly with 2 real divisions + if(abs1(f) > one) + { + d = dlapy2(dble(f), dimag(f)); + ff = dcmplx(dble(f) / d, dimag(f) / d); + } + else + { + dr = safmx2 * dble(f); + di = safmx2 * dimag(f); + d = dlapy2(dr, di); + ff = dcmplx(dr / d, di / d); + } + sn = ff * dcmplx(dble(gs) / g2s, -dimag(gs) / g2s); + r = cs * f + sn * g; + } + else + { + // + // this is the most common case. 
+ // neither f2 nor f2/g2 are less than safmin + // f2s cannot overflow, and it is accurate + // + f2s = std::sqrt(one + g2 / f2); + // do the f2s(real)*fs(complex) multiply with two real multiplies + r = dcmplx(f2s * dble(fs), f2s * dimag(fs)); + cs = one / f2s; + d = f2 + g2; + // do complex/real division explicitly with two real divisions + sn = dcmplx(dble(r) / d, dimag(r) / d); + sn = sn * dconjg(gs); + if(count != 0) + { + if(count > 0) + { + for(i = 1; i <= count; i++) + { + r = r * safmx2; + }; + } + else + { + for(i = 1; i <= -count; i++) + { + r = r * safmn2; + } + } + } + } + } +} + +template +static void call_scal(I& n, S& a, T& x_in, I& incx) +{ + bool const is_zero = (a == 0); + T* const x = &x_in; + for(I i = 0; i < n; i++) + { + auto const ip = i * incx; + x[ip] *= a; + } +} + +template +static void call_rot(I& n, T& x_in, I& incx, T& y_in, I& incy, S& c, S& s) +{ + T* const x = &(x_in); + T* const y = &(y_in); + + for(I i = 0; i < n; i++) + { + auto const ix = i * incx; + auto const iy = i * incy; + + auto const temp = c * x[ix] + s * y[iy]; + y[iy] = c * y[iy] - s * x[ix]; + x[ix] = temp; + } +} + +// -------------------------------------------------------- +// lasv2 computes the singular value decomposition of a 2 x 2 +// triangular matrix +// [ F G ] +// [ 0 H ] +// +// on return, +// abs(ssmax) is the larger singular value, +// abs(ssmin) is the smaller singular value, +// (csl,snl) and (csr,snr) are the left and right +// singular vectors for abs(ssmax) +// +// [ csl snl] [ F G ] [ csr -snr] = [ ssmax 0 ] +// [-snl csl] [ 0 H ] [ snr csr] [ 0 ssmin ] +// -------------------------------------------------------- +template +static void call_lasv2(T& f, T& g, T& h, T& ssmin, T& ssmax, T& snr, T& csr, T& snl, T& csl) +{ + T const zero = 0; + T const one = 1; + T const two = 2; + T const four = 4; + T const half = one / two; + + bool gasmal; + bool swap; + int pmax; + char cmach; + + T a, clt, crt, d, fa, ft, ga, gt, ha, ht, l, m; + T mm, r, s, slt, 
srt, t, temp, tsign, tt; + T macheps; + + auto sign = [](auto a, auto b) { + auto const abs_a = std::abs(a); + return ((b >= 0) ? abs_a : -abs_a); + }; + + ft = f; + fa = std::abs(ft); + ht = h; + ha = std::abs(h); + // + // pmax points to the maximum absolute element of matrix + // pmax = 1 if f largest in absolute values + // pmax = 2 if g largest in absolute values + // pmax = 3 if h largest in absolute values + // + pmax = 1; + swap = (ha > fa); + if(swap) + { + pmax = 3; + temp = ft; + ft = ht; + ht = temp; + temp = fa; + fa = ha; + ha = temp; + // + // now fa >= ha + // + } + gt = g; + ga = std::abs(gt); + if(ga == zero) + { + // + // diagonal matrix + // + ssmin = ha; + ssmax = fa; + clt = one; + crt = one; + slt = zero; + srt = zero; + } + else + { + gasmal = true; + if(ga > fa) + { + pmax = 2; + + cmach = 'E'; + call_lamch(cmach, macheps); + + if((fa / ga) < macheps) + { + // + // case of very large ga + // + gasmal = false; + ssmax = ga; + if(ha > one) + { + ssmin = fa / (ga / ha); + } + else + { + ssmin = (fa / ga) * ha; + } + clt = one; + slt = ht / gt; + srt = one; + crt = ft / gt; + } + } + if(gasmal) + { + // + // normal case + // + d = fa - ha; + if(d == fa) + { + // + // copes with infinite f or h + // + l = one; + } + else + { + l = d / fa; + } + // + // note that 0 <= l <= 1 + // + m = gt / ft; + // + // note that abs(m) <= 1/macheps + // + t = two - l; + // + // note that t >= 1 + // + mm = m * m; + tt = t * t; + s = std::sqrt(tt + mm); + // + // note that 1 <= s <= 1 + 1/macheps + // + if(l == zero) + { + r = std::abs(m); + } + else + { + r = std::sqrt(l * l + mm); + } + // + // note that 0 <= r .le. 1 + 1/macheps + // + a = half * (s + r); + // + // note that 1 <= a .le. 
1 + abs(m) + // + ssmin = ha / a; + ssmax = fa * a; + if(mm == zero) + { + // + // note that m is very tiny + // + if(l == zero) + { + t = sign(two, ft) * sign(one, gt); + } + else + { + t = gt / sign(d, ft) + m / t; + } + } + else + { + t = (m / (s + t) + m / (r + l)) * (one + a); + } + l = std::sqrt(t * t + four); + crt = two / l; + srt = t / l; + clt = (crt + srt * m) / a; + slt = (ht / ft) * srt / a; + } + } + if(swap) + { + csl = srt; + snl = crt; + csr = slt; + snr = clt; + } + else + { + csl = clt; + snl = slt; + csr = crt; + snr = srt; + } + // + // correct signs of ssmax and ssmin + // + if(pmax == 1) + { + tsign = sign(one, csr) * sign(one, csl) * sign(one, f); + } + if(pmax == 2) + { + tsign = sign(one, snr) * sign(one, csl) * sign(one, g); + } + if(pmax == 3) + { + tsign = sign(one, snr) * sign(one, snl) * sign(one, h); + } + ssmax = sign(ssmax, tsign); + ssmin = sign(ssmin, tsign * sign(one, f) * sign(one, h)); +} + +ROCSOLVER_END_NAMESPACE diff --git a/library/src/include/lib_device_helpers.hpp b/library/src/include/lib_device_helpers.hpp index 74f82a15f..dc011f03a 100644 --- a/library/src/include/lib_device_helpers.hpp +++ b/library/src/include/lib_device_helpers.hpp @@ -1237,4 +1237,46 @@ __device__ static void permute_swap(const I n, T* C, I ldc, I* map, const I nev #endif } +/** SWAP swaps the values of vectors x and y of dimension n. + Launch this kernel with a desired number of threads organized in + NG groups in the x direction with NT threads in the x direction. 
**/ +template +ROCSOLVER_KERNEL void swap_kernel(I const n, T* const x, I const incx, T* const y, I const incy) +{ + if(n <= 0) + return; + + I const i_start = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + I const i_inc = hipBlockDim_x * hipGridDim_x; + + if((incx == 1) && (incy == 1)) + { + // ------------ + // special case + // ------------ + for(I i = i_start; i < n; i += i_inc) + { + auto const temp = y[i]; + y[i] = x[i]; + x[i] = temp; + } + } + else + { + // --------------------------- + // code for unequal increments + // --------------------------- + + for(I i = i_start; i < n; i += i_inc) + { + auto const ix = 0 + i * static_cast(incx); + auto const iy = 0 + i * static_cast(incy); + + auto const temp = y[iy]; + y[iy] = x[ix]; + x[ix] = temp; + } + } +} + ROCSOLVER_END_NAMESPACE diff --git a/library/src/include/lib_host_helpers.hpp b/library/src/include/lib_host_helpers.hpp index bca4aae41..8dfad5014 100644 --- a/library/src/include/lib_host_helpers.hpp +++ b/library/src/include/lib_host_helpers.hpp @@ -92,6 +92,74 @@ I get_index(I* intervals, I max, I dim) return i; } +template +static void call_swap(I& n, T& x_in, I& incx, T& y_in, I& incy) +{ + T* const x = &(x_in); + T* const y = &(y_in); + for(I i = 0; i < n; i++) + { + auto const ix = i * static_cast(incx); + auto const iy = i * static_cast(incy); + + T const temp = x[ix]; + x[ix] = y[iy]; + y[iy] = temp; + } +} + +static float real_part(float z) +{ + return (z); +} +static float real_part(std::complex z) +{ + return (z.real()); +} +static float real_part(rocblas_complex_num z) +{ + return (z.real()); +} + +static double real_part(double z) +{ + return (z); +} +static double real_part(std::complex z) +{ + return (z.real()); +} +static double real_part(rocblas_complex_num z) +{ + return (z.real()); +} + +static float imag_part(float z) +{ + return (0); +} +static float imag_part(std::complex z) +{ + return (z.imag()); +} +static float imag_part(rocblas_complex_num z) +{ + return (z.imag()); +} + 
+static double imag_part(double z) +{ + return (0); +} +static double imag_part(std::complex z) +{ + return (z.imag()); +} +static double imag_part(rocblas_complex_num z) +{ + return (z.imag()); +} + #ifdef ROCSOLVER_VERIFY_ASSUMPTIONS // Ensure __assert_fail is declared. #if !__is_identifier(__assert_fail) @@ -153,4 +221,12 @@ extern "C" [[noreturn]] void __assert_fail(const char* assertion, #define ROCSOLVER_ASSUME_X(invariant, msg) __builtin_assume(invariant) #endif +#ifndef CHECK_HIP +#define CHECK_HIP(fcn) \ + { \ + hipError_t const istat = (fcn); \ + assert(istat == hipSuccess); \ + } +#endif + ROCSOLVER_END_NAMESPACE diff --git a/library/src/include/rocsolver_handle.hpp b/library/src/include/rocsolver_handle.hpp new file mode 100644 index 000000000..2e4177556 --- /dev/null +++ b/library/src/include/rocsolver_handle.hpp @@ -0,0 +1,44 @@ +/* ************************************************************************** + * Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * *************************************************************************/ + +#pragma once + +#include "common_host_helpers.hpp" +#include "rocsolver/rocsolver.h" + +ROCSOLVER_BEGIN_NAMESPACE + +struct rocsolver_handle_data_ +{ + rocblas_int checksum; + + rocsolver_alg_mode bdsqr_mode = rocsolver_alg_mode_gpu; +}; + +typedef struct rocsolver_handle_data_* rocsolver_handle_data; + +ROCSOLVER_END_NAMESPACE