From 1b4ffbce530b534b4205ed9bcf4a5f140b866862 Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Tue, 28 May 2024 19:33:43 -0400 Subject: [PATCH 01/35] snapshot --- library/src/auxiliary/bdsqr_host.hpp | 2077 ++++++++++++++++++++++++++ 1 file changed, 2077 insertions(+) create mode 100644 library/src/auxiliary/bdsqr_host.hpp diff --git a/library/src/auxiliary/bdsqr_host.hpp b/library/src/auxiliary/bdsqr_host.hpp new file mode 100644 index 000000000..d40be74ab --- /dev/null +++ b/library/src/auxiliary/bdsqr_host.hpp @@ -0,0 +1,2077 @@ +#define USE_VT + +/**************************************************************************** + * Derived from the BSD3-licensed + * LAPACK routine (version 3.7.1) -- + * Univ. of Tennessee, Univ. of California Berkeley, + * Univ. of Colorado Denver and NAG Ltd.. + * June 2017 + * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * *************************************************************************/ + +#pragma once + +#include +#include +#include +#include +#include + +#include "hip/hip_runtime.h" +#include "hip/hip_runtime_api.h" + + +ROCSOLVER_BEGIN_NAMESPACE + +#ifndef HIP_CHECK +#define HIP_CHECK(fcn) \ + { \ + hipError_t const istat = (fcn); \ + assert(istat == hipSuccess); \ + } +#endif + + +template +__global__ static void lasr_kernel(char const side, char const pivot, + char const direct, I const m, I const n, + S const *const c_, S const *const s_, + T *const A_, I const lda) { + + const auto nblocks = hipGridDim_x; + const auto nthreads_per_block = hipBlockDim_x; + const auto nthreads = nblocks * nthreads_per_block; + const auto tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + const auto i_inc = nthreads; + const auto ij_nb = nthreads; + const auto ij_start = tid; + + auto max = [](auto x, auto y) { return ((x > y) ? x : y); }; + auto min = [](auto x, auto y) { return ((x < y) ? 
x : y); }; + + auto indx2f = [](auto i, auto j, auto lda) -> int64_t { + assert((1 <= i)); + assert((1 <= lda)); + assert((1 <= j)); + + return ((i - 1) + (j - 1) * int64_t(lda)); + }; + + auto indx1f = [](auto i) -> int64_t { + assert((1 <= i)); + return (i - int64_t(1)); + }; + + auto c = [&](auto i) -> const S & { return (c_[indx1f(i)]); }; + auto s = [&](auto i) -> const S & { return (s_[indx1f(i)]); }; + auto A = [&](auto i, auto j) -> T & { return (A_[indx2f(i, j, lda)]); }; + + const S one = 1; + const S zero = 0; + + // ---------------- + // check arguments + // ---------------- + + const bool is_side_Left = (side == 'L') || (side == 'l'); + const bool is_side_Right = (side == 'R') || (side == 'r'); + + const bool is_pivot_Variable = (pivot == 'V') || (pivot == 'v'); + const bool is_pivot_Bottom = (pivot == 'B') || (pivot == 'b'); + const bool is_pivot_Top = (pivot == 'T') || (pivot == 't'); + + const bool is_direct_Forward = (direct == 'F') || (direct == 'f'); + const bool is_direct_Backward = (direct == 'B') || (direct == 'b'); + + { + const bool isok_side = is_side_Left || is_side_Right; + const bool isok_pivot = + is_pivot_Variable || is_pivot_Bottom || is_pivot_Top; + const bool isok_direct = is_direct_Forward || is_direct_Backward; + + const I info = (!isok_side) ? 1 + : (!isok_pivot) ? 2 + : (!isok_direct) ? 3 + : (m < 0) ? 4 + : (n < 0) ? 5 + : (c_ == nullptr) ? 6 + : (s_ == nullptr) ? 7 + : (A_ == nullptr) ? 8 + : (lda < max(1, m)) ? 9 + : 0; + if (info != 0) + return; + }; + + { + const bool has_work = (m >= 1) && (n >= 1); + if (!has_work) { + return; + }; + }; + + if (is_side_Left && is_pivot_Variable && is_direct_Forward) { + // ----------------------------- + // A := P*A + // Variable pivot, the plane (k,k+1) + // P = P(z-1) * ... * P(2) * P(1) + // ----------------------------- + { + + for (I j = 1; j <= (m - 1); j++) { + const auto ctemp = c(j); + const auto stemp = s(j); + if ((ctemp != one) || (stemp != zero)) { + for (I i = 1 + tid; i <= n; i += i_inc) { + const auto temp = A(j + 1, i); + A(j + 1, i) = ctemp * temp - stemp * A(j, i); + A(j, i) = stemp * temp + ctemp * A(j, i); + } + }; + }; + }; + + return; + }; + + if (is_side_Left && is_pivot_Variable && is_direct_Backward) { + // ----------------------------- + // A := P*A + // Variable pivot, the plane (k,k+1) + // P = P(1)*P(2)*...*P(z-1) + // ----------------------------- + + auto const jend = (m - 1); + auto const jstart = 1; + auto const istart = 1; + auto const iend = n; + + for (I j = jend; j >= jstart; j--) { + const auto ctemp = c(j); + const auto stemp = s(j); + if ((ctemp != one) || (stemp != zero)) { + for (I i = istart + tid; i <= iend; i += i_inc) { + const auto temp = A(j + 1, i); + A(j + 1, i) = ctemp * temp - stemp * A(j, i); + A(j, i) = stemp * temp + ctemp * A(j, i); + }; + }; + }; + + return; + }; + + if (is_side_Left && is_pivot_Top && is_direct_Forward) { + // ----------------------------- + // A := P*A + // Top pivot, the plane (1,k+1) + // P = P(z-1) * ... 
* P(2) * P(1) + // ----------------------------- + { + for (I j = 2; j <= m; j++) { + const auto ctemp = c(j - 1); + const auto stemp = s(j - 1); + for (I i = 1 + tid; i <= n; i += i_inc) { + const auto temp = A(j, i); + A(j, i) = ctemp * temp - stemp * A(1, i); + A(1, i) = stemp * temp + ctemp * A(1, i); + }; + }; + }; + + return; + }; + + if (is_side_Left && is_pivot_Top && is_direct_Backward) { + // ----------------------------- + // A := P*A + // Top pivot, the plane (1,k+1) + // P = P(1)*P(2)*...*P(z-1) + // ----------------------------- + { + + auto const jend = m; + auto const jstart = 2; + auto const istart = 1; + auto const iend = n; + + for (I j = jend; j >= jstart; j--) { + const auto ctemp = c(j - 1); + const auto stemp = s(j - 1); + if ((ctemp != one) || (stemp != zero)) { + for (I i = istart + tid; i <= iend; i += i_inc) { + const auto temp = A(j, i); + + A(j, i) = ctemp * temp - stemp * A(1, i); + A(1, i) = stemp * temp + ctemp * A(1, i); + }; + }; + }; + } + + return; + }; + + if (is_side_Left && is_pivot_Bottom && is_direct_Forward) { + // ----------------------------- + // A := P*A + // Bottom pivot, the plane (k,z) + // P = P(z-1) * ... * P(2) * P(1) + // ----------------------------- + { + + auto const jstart = 1; + auto const jend = (m - 1); + auto const istart = 1; + auto const iend = n; + + for (I j = jstart; j <= jend; j++) { + const auto ctemp = c(j); + const auto stemp = s(j); + if ((ctemp != one) || (stemp != zero)) { + for (I i = istart + tid; i <= iend; i += i_inc) { + const auto temp = A(j, i); + A(j, i) = stemp * A(m, i) + ctemp * temp; + A(m, i) = ctemp * A(m, i) - stemp * temp; + }; + }; + }; + } + + return; + }; + + if (is_side_Left && is_pivot_Bottom && is_direct_Backward) { + // ----------------------------- + // A := P*A + // Bottom pivot, the plane (k,z) + // P = P(1)*P(2)*...*P(z-1) + // ----------------------------- + { + + auto const jend = (m - 1); + auto const jstart = 1; + auto const istart = 1; + auto const iend = n; + + for (I j = jend; j >= jstart; j--) { + const auto ctemp = c(j); + const auto stemp = s(j); + if ((ctemp != one) || (stemp != zero)) { + + for (I i = istart + tid; i <= iend; i += i_inc) { + const auto temp = A(j, i); + A(j, i) = stemp * A(m, i) + ctemp * temp; + A(m, i) = ctemp * A(m, i) - stemp * temp; + }; + }; + }; + } + + return; + }; + + if (is_side_Right && is_pivot_Variable && is_direct_Forward) { + // ----------------------------- + // A := A*P**T + // Variable pivot, the plane (k,k+1) + // P = P(z-1) * ... 
* P(2) * P(1) + // ----------------------------- + + { + + auto const jstart = 1; + auto const jend = (n - 1); + auto const istart = 1; + auto const iend = m; + + for (I j = jstart; j <= jend; j++) { + const auto ctemp = c(j); + const auto stemp = s(j); + if ((ctemp != one) || (stemp != zero)) { + for (I i = istart + tid; i <= iend; i += i_inc) { + const auto temp = A(i, j + 1); + A(i, j + 1) = ctemp * temp - stemp * A(i, j); + A(i, j) = stemp * temp + ctemp * A(i, j); + }; + }; + }; + } + + return; + }; + + if (is_side_Right && is_pivot_Variable && is_direct_Backward) { + // ----------------------------- + // A := A*P**T + // Variable pivot, the plane (k,k+1) + // P = P(1)*P(2)*...*P(z-1) + // ----------------------------- + + { + + auto const jend = (n - 1); + auto const jstart = 1; + auto const istart = 1; + auto const iend = m; + + for (I j = jend; j >= jstart; j--) { + const auto ctemp = c(j); + const auto stemp = s(j); + if ((ctemp != one) || (stemp != zero)) { + + for (I i = istart + tid; i <= iend; i += i_inc) { + const auto temp = A(i, j + 1); + A(i, j + 1) = ctemp * temp - stemp * A(i, j); + A(i, j) = stemp * temp + ctemp * A(i, j); + }; + }; + }; + } + return; + }; + + if (is_side_Right && is_pivot_Top && is_direct_Forward) { + // ----------------------------- + // A := A*P**T + // Top pivot, the plane (1,k+1) + // P = P(z-1) * ... * P(2) * P(1) + // ----------------------------- + + { + + auto const jstart = 2; + auto const jend = n; + auto const istart = 1; + auto const iend = m; + + for (I j = jstart; j <= jend; j++) { + const auto ctemp = c(j - 1); + const auto stemp = s(j - 1); + if ((ctemp != one) || (stemp != zero)) { + for (I i = istart + tid; i <= iend; i += i_inc) { + const auto temp = A(i, j); + + A(i, j) = ctemp * temp - stemp * A(i, 1); + A(i, 1) = stemp * temp + ctemp * A(i, 1); + }; + }; + }; + } + + return; + }; + + if (is_side_Right && is_pivot_Top && is_direct_Backward) { + // ----------------------------- + // A := A*P**T + // Top pivot, the plane (1,k+1) + // P = P(1)*P(2)*...*P(z-1) + // ----------------------------- + + { + + auto const jend = n; + auto const jstart = 2; + auto const istart = 1; + auto const iend = m; + + for (I j = jend; j >= jstart; j--) { + const auto ctemp = c(j - 1); + const auto stemp = s(j - 1); + if ((ctemp != one) || (stemp != zero)) { + for (I i = istart + tid; i <= iend; i += i_inc) { + const auto temp = A(i, j); + + A(i, j) = ctemp * temp - stemp * A(i, 1); + A(i, 1) = stemp * temp + ctemp * A(i, 1); + }; + }; + }; + } + + return; + }; + + if (is_side_Right && is_pivot_Bottom && is_direct_Forward) { + // ----------------------------- + // A := A*P**T + // Bottom pivot, the plane (k,z) + // P = P(z-1) * ... 
* P(2) * P(1) + // ----------------------------- + + { + + auto const jstart = 1; + auto const jend = (n - 1); + auto const istart = 1; + auto const iend = m; + + for (I j = jstart; j <= jend; j++) { + const auto ctemp = c(j); + const auto stemp = s(j); + if ((ctemp != one) || (stemp != zero)) { + for (I i = istart + tid; i <= iend; i += i_inc) { + const auto temp = A(i, j); + + A(i, j) = stemp * A(i, n) + ctemp * temp; + A(i, n) = ctemp * A(i, n) - stemp * temp; + }; + }; + }; + } + + return; + }; + + if (is_side_Right && is_pivot_Bottom && is_direct_Backward) { + // ----------------------------- + // A := A*P**T + // Bottom pivot, the plane (k,z) + // P = P(1)*P(2)*...*P(z-1) + // ----------------------------- + + { + + auto const jend = (n - 1); + auto const jstart = 1; + auto const istart = 1; + auto const iend = m; + + for (I j = jend; j >= jstart; j--) { + const auto ctemp = c(j); + const auto stemp = s(j); + if ((ctemp != one) || (stemp != zero)) { + for (I i = istart + tid; i <= iend; i += i_inc) { + const auto temp = A(i, j); + A(i, j) = stemp * A(i, n) + ctemp * temp; + A(i, n) = ctemp * A(i, n) - stemp * temp; + }; + }; + }; + } + + return; + }; + + return; +} +template +static void lasr_template_gpu(char const side, char const pivot, + char const direct, I const m, I const n, + S const *const c_, S const *const s_, T *const A_, + I const lda, hipStream_t stream = 0) { + + auto const nthreads = 2 * warpSize; + + bool const is_left_side = (side == 'L') || (side == 'l'); + auto const mn = (is_left_side) ? n : m; + + auto const nblocks = (mn - 1) / nthreads + 1; + hipLaunchKernelGGL((lasr_kernel), dim3(nblocks, 1, 1), + dim3(nthreads, 1, 1), 0, stream, side, pivot, direct, m, n, + c_, s_, A_, lda); +} + +template +__global__ static void rot_kernel(I const n, T *const x, I const incx, + T *const y, I const incy, S const c, + S const s) { + if (n <= 0) + return; + + I const i_start = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + I const i_inc = hipBlockDim_x * hipGridDim_x; + + if ((incx == 1) && (incy == 1)) { + // ------------ + // special case + // ------------ + for (I i = i_start; i < n; i += i_inc) { + auto const temp = c * x[i] + s * y[i]; + y[i] = c * y[i] - s * x[i]; + x[i] = temp; + } + } else { + // --------------------------- + // code for unequal increments + // --------------------------- + + for (auto i = i_start; i < n; i += i_inc) { + auto const ix = 0 + i * static_cast(incx); + auto const iy = 0 + i * static_cast(incy); + auto const temp = c * x[ix] + s * y[iy]; + y[iy] = c * y[iy] - s * x[ix]; + x[ix] = temp; + } + } +} + +template +static void rot_template(I const n, T *x, I const incx, T *y, I const incy, + S const c, S const s, hipStream_t stream) { + auto nthreads = warpSize * 2; + auto nblocks = (n - 1) / nthreads + 1; + + hipLaunchKernelGGL((rot_kernel), dim3(nblocks, 1, 1), + dim3(nthreads, 1, 1), 0, stream, n, x, incx, y, incy, c, + s); +} + +template +__global__ static void scal_kernel(I const n, S const da, T *const x, + I const incx) { + + if (n <= 0) + return; + + I const i_start = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + I const i_inc = hipBlockDim_x * hipGridDim_x; + + S const zero = 0; + bool const is_da_zero = (da == zero); + if (incx == 1) { + for (I i = i_start; i < n; i += i_inc) { + x[i] = (is_da_zero) ? 
zero : da * x[i]; + } + } else { + // --------------------------- + // code for non-unit increments + // --------------------------- + + for (I i = i_start; i < n; i += i_inc) { + auto const ix = 0 + i * static_cast(incx); + x[ix] = (is_da_zero) ? zero : da * x[ix]; + } + } +} + +template +static void scal_template(I const n, S const da, T *const x, I const incx, + hipStream_t stream) { + auto nthreads = warpSize * 2; + auto nblocks = (n - 1) / nthreads + 1; + + hipLaunchKernelGGL((scal_kernel), dim3(nblocks, 1, 1), + dim3(nthreads, 1, 1), 0, stream, n, da, x, incx); +} + +template +__global__ static void swap_kernel(I const n, T *const x, I const incx, + T *const y, I const incy) { + if (n <= 0) + return; + + I const i_start = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + I const i_inc = hipBlockDim_x * hipGridDim_x; + + if ((incx == 1) && (incy == 1)) { + // ------------ + // special case + // ------------ + for (I i = i_start; i < n; i += i_inc) { + auto const temp = y[i]; + y[i] = x[i]; + x[i] = temp; + } + } else { + // --------------------------- + // code for unequal increments + // --------------------------- + + for (I i = i_start; i < n; i += i_inc) { + auto const ix = 0 + i * static_cast(incx); + auto const iy = 0 + i * static_cast(incy); + + auto const temp = y[iy]; + y[iy] = x[ix]; + x[ix] = temp; + } + } +} + +template +static void swap_template(I const n, T *x, I const incx, T *y, I const incy, + hipStream_t stream) { + auto nthreads = warpSize * 2; + auto nblocks = (n - 1) / nthreads + 1; + + hipLaunchKernelGGL((swap_kernel), dim3(nblocks, 1, 1), + dim3(nthreads, 1, 1), 0, stream, n, x, incx, y, incy); +} + +extern "C" { + +double dlamch_(char *cmach); +float slamch_(char *cmach); + +void zswap_(int *n, std::complex *zx, int *incx, + std::complex *zy, int *incy); + +void cswap_(int *n, std::complex *zx, int *incx, std::complex *zy, + int *incy); + +void dswap_(int *n, double *zx, int *incx, double *zy, int *incy); + +void sswap_(int *n, float *zx, int *incx, float *zy, int *incy); + +void dlasq1_(int *n, double *D_, double *E_, double *rwork_, int *info_arg); +void slasq1_(int *n, float *D_, float *E_, float *rwork_, int *info_arg); + +void zlasr_(char *side, char *pivot, char *direct, int *m, int *n, double *c, + double *s, std::complex *A, int *lda); +void clasr_(char *side, char *pivot, char *direct, int *m, int *n, float *c, + float *s, std::complex *A, int *lda); +void slasr_(char *side, char *pivot, char *direct, int *m, int *n, float *c, + float *s, float *A, int *lda); +void dlasr_(char *side, char *pivot, char *direct, int *m, int *n, double *c, + double *s, double *A, int *lda); + +void dlasv2_(double *f, double *g, double *h, double *ssmin, double *ssmax, + double *snr, double *csr, double *snl, double *csl); +void slasv2_(float *f, float *g, float *h, float *ssmin, float *ssmax, + float *snr, float *csr, float *snl, float *csl); + +void zdrot_(int *n, std::complex *zx, int *incx, + std::complex *zy, int *incy, double *c, double *s); + +void csrot_(int *n, std::complex *zx, int *incx, std::complex *zy, + int *incy, float *c, float *s); + +void drot_(int *n, double *dx, int *incx, double *dy, int *incy, double *c, + double *s); + +void srot_(int *n, float *dx, int *incx, float *dy, int *incy, float *c, + float *s); + +void zdscal_(int *n, double *da, std::complex *zx, int *incx); +void csscal_(int *n, float *da, std::complex *zx, int *incx); +void dscal_(int *n, double *da, double *zx, int *incx); +void sscal_(int *n, float *da, float *zx, int *incx); + +void 
dlartg_(double *f, double *g, double *c, double *s, double *r); +void slartg_(float *f, float *g, float *c, float *s, float *r); + +void dlas2_(double *f, double *g, double *h, double *ssmin, double *ssmax); +void slas2_(float *f, float *g, float *h, float *ssmin, float *ssmax); +}; + +static void call_lamch(char &cmach_arg, double &eps) { + char cmach = cmach_arg; + eps = dlamch_(&cmach); +} + +static void call_lamch(char &cmach_arg, float &eps) { + char cmach = cmach_arg; + eps = slamch_(&cmach); +} + +static void call_swap(int &n, std::complex &zx, int &incx, + std::complex &zy, int &incy) { + zswap_(&n, &zx, &incx, &zy, &incy); +} + +static void call_swap(int &n, std::complex &zx, int &incx, + std::complex &zy, int &incy) { + cswap_(&n, &zx, &incx, &zy, &incy); +} + +static void call_swap(int &n, float &zx, int &incx, float &zy, int &incy) { + sswap_(&n, &zx, &incx, &zy, &incy); +} + +static void call_swap(int &n, double &zx, int &incx, double &zy, int &incy) { + dswap_(&n, &zx, &incx, &zy, &incy); +} + +static void call_las2(double &f, double &g, double &h, double &ssmin, + double &ssmax) { + dlas2_(&f, &g, &h, &ssmin, &ssmax); +} + +static void call_las2(float &f, float &g, float &h, float &ssmin, + float &ssmax) { + slas2_(&f, &g, &h, &ssmin, &ssmax); +} + +static void call_lartg(double &f, double &g, double &c, double &s, double &r) { + dlartg_(&f, &g, &c, &s, &r); +} + +static void call_lartg(float &f, float &g, float &c, float &s, float &r) { + slartg_(&f, &g, &c, &s, &r); +} + +static void call_scal(int &n, double &da, std::complex &zx, int &incx) { + zdscal_(&n, &da, &zx, &incx); +} + +static void call_scal(int &n, float &da, std::complex &zx, int &incx) { + csscal_(&n, &da, &zx, &incx); +} + +static void call_scal(int &n, double &da, double &zx, int &incx) { + dscal_(&n, &da, &zx, &incx); +} + +static void call_scal(int &n, float &da, float &zx, int &incx) { + sscal_(&n, &da, &zx, &incx); +} + +static void call_rot(int &n, std::complex &zx, int &incx, + std::complex &zy, int &incy, double &c, + double &s) { + zdrot_(&n, &zx, &incx, &zy, &incy, &c, &s); +} + +static void call_rot(int &n, std::complex &zx, int &incx, + std::complex &zy, int &incy, float &c, float &s) { + csrot_(&n, &zx, &incx, &zy, &incy, &c, &s); +} + +static void call_rot(int &n, double &dx, int &incx, double &dy, int &incy, + double &c, double &s) { + drot_(&n, &dx, &incx, &dy, &incy, &c, &s); +} + +static void call_rot(int &n, float &dx, int &incx, float &dy, int &incy, + float &c, float &s) { + srot_(&n, &dx, &incx, &dy, &incy, &c, &s); +} + +static void call_lasv2(double &f, double &g, double &h, double &ssmin, + double &ssmax, double &snr, double &csr, double &snl, + double &csl) { + dlasv2_(&f, &g, &h, &ssmin, &ssmax, &snr, &csr, &snl, &csl); +} + +static void call_lasv2(float &f, float &g, float &h, float &ssmin, float &ssmax, + float &snr, float &csr, float &snl, float &csl) { + slasv2_(&f, &g, &h, &ssmin, &ssmax, &snr, &csr, &snl, &csl); +} + +static void call_lasq1(int &n, double &D_, double &E_, double &rwork_, + int &info_arg) { + dlasq1_(&n, &D_, &E_, &rwork_, &info_arg); +}; + +static void call_lasq1(int &n, float &D_, float &E_, float &rwork_, + int &info_arg) { + slasq1_(&n, &D_, &E_, &rwork_, &info_arg); +}; + +static void call_lasr(char &side, char &pivot, char &direct, int &m, int &n, + double &c, double &s, std::complex &A, int &lda) { + zlasr_(&side, &pivot, &direct, &m, &n, &c, &s, &A, &lda); +}; + +static void call_lasr(char &side, char &pivot, char &direct, int &m, int &n, + float &c, 
float &s, std::complex &A, int &lda) { + clasr_(&side, &pivot, &direct, &m, &n, &c, &s, &A, &lda); +}; + +static void call_lasr(char &side, char &pivot, char &direct, int &m, int &n, + float &c, float &s, float &A, int &lda) { + slasr_(&side, &pivot, &direct, &m, &n, &c, &s, &A, &lda); +}; + +static void call_lasr(char &side, char &pivot, char &direct, int &m, int &n, + double &c, double &s, double &A, int &lda) { + dlasr_(&side, &pivot, &direct, &m, &n, &c, &s, &A, &lda); +}; + +template +static void bdsqr_single_template(char uplo, I n, +#ifdef USE_VT + I ncvt, +#else + I ncv, +#endif + I nru, I ncc, T *d_, + T *e_, +#ifdef USE_VT + T *vt_, I ldvt, +#else + T *v_, I ldv, +#endif + T *u_, I ldu, T *c_, + int ldc, S *work_, I &info, + S *dwork = nullptr, hipStream_t stream = 0) { + + bool const use_gpu = (dwork != nullptr); + + S const zero = 0; + S const one = 1; + S negone = -1; + S const hndrd = 100; + S const hndrth = one / hndrd; + S const ten = 10; + S const eight = 8; + S const meight = -one / eight; + I const maxitr = 6; + I ione = 1; + + I nrv = n; + + bool const lower = (uplo == 'L') || (uplo == 'l'); + bool const upper = (uplo == 'U') || (uplo == 'u'); + /* + * rotate is true if any singular vectors desired, false otherwise + */ +#ifdef USE_VT + bool const rotate = (ncvt > 0) || (nru > 0) || (ncc > 0); +#else + bool const rotate = (nrv > 0) || (nru > 0) || (ncc > 0); +#endif + + I i = 0, idir = 0, isub = 0, iter = 0, iterdivn = 0, j = 0, ll = 0, lll = 0, + m = 0, maxitdivn = 0, nm1 = 0, nm12 = 0, nm13 = 0, oldll = 0, oldm = 0; + + I const nrc = n; // number of rows in C matrix + I const nrvt = n; // number of rows in VT matrix + I const ncu = n; // number of columns in U matrix + + S abse = 0, abss = 0, cosl = 0, cosr = 0, cs = 0, eps = 0, f = 0, g = 0, + h = 0, mu = 0, oldcs = 0, oldsn = 0, r = 0, shift = 0, sigmn = 0, sigmx = 0, + sinl = 0, sinr = 0, sll = 0, smax = 0, smin = 0, sminl = 0, sminoa = 0, + sn = 0, thresh = 0, tol = 0, tolmul = 0, unfl = 0; + + /* .. + * .. external functions .. + logical lsame + double precision dlamch + external lsame, dlamch + * .. + * .. external subroutines .. + external dlartg, dlas2, dlasq1, dlasr, dlasv2, drot, + $ dscal, dswap, xerbla + * .. + * .. intrinsic functions .. + intrinsic abs, dble, max, min, sign, sqrt + */ + + auto call_swap_gpu = [=](I n, T &x, I incx, T &y, I incy) { + swap_template(n, &x, incx, &y, incy, stream); + }; + + auto call_rot_gpu = [=](I n, T &x, I incx, T &y, I incy, S cosl, S sinl) { + rot_template(n, &x, incx, &y, incy, cosl, sinl, stream); + }; + + auto call_scal_gpu = [=](I n, auto da, T &x, I incx) { + scal_template(n, da, &x, incx, stream); + }; + + auto call_lasr_gpu = [=](char const side, char const pivot, char const direct, + I const m, I const n, S &c, S &s, T &A, I const lda, + S *const dwork, hipStream_t stream) { + bool const is_left_side = (side == 'L') || (side == 'l'); + auto const mn = (is_left_side) ? m : n; + auto const mn_m1 = (mn - 1); + S *const dc = dwork; + S *const ds = dwork + mn_m1; + HIP_CHECK(hipMemcpyAsync(dc, &c, sizeof(S) * mn_m1, hipMemcpyHostToDevice, + stream)); + HIP_CHECK(hipMemcpyAsync(ds, &s, sizeof(S) * mn_m1, hipMemcpyHostToDevice, + stream)); + + lasr_template_gpu(side, pivot, direct, m, n, dc, ds, &A, lda, stream); + HIP_CHECK(hipStreamSynchronize(stream)); + }; + + auto abs = [](auto x) { return ((x >= 0) ? 
x : (-x)); }; + + auto indx2f = [](auto i, auto j, auto ld) -> int64_t { + assert((1 <= i) && (i <= ld)); + assert((1 <= j)); + return ((i - 1) + (j - 1) * int64_t(ld)); + }; + + auto d = [=](auto i) -> T & { + assert((1 <= i) && (i <= n)); + return (d_[i - 1]); + }; + + auto e = [=](auto i) -> T & { + assert((1 <= i) && (i <= (n - 1))); + return (e_[i - 1]); + }; + auto work = [=](auto i) -> S & { return (work_[i - 1]); }; + + auto c = [=](auto i, auto j) -> T & { + assert((1 <= i) && (i <= nrc) && (nrc <= ldc)); + assert((1 <= j) && (j <= ncc)); + return (c_[indx2f(i, j, ldc)]); + }; + + auto u = [=](auto i, auto j) -> T & { + assert((1 <= i) && (i <= nru) && (nru <= ldu)); + assert((1 <= j) && (j <= ncu)); + return (u_[indx2f(i, j, ldu)]); + }; + +#ifdef USE_VT + auto vt = [=](auto i, auto j) -> T & { + assert((1 <= i) && (i <= nrvt) && (nrvt <= ldvt)); + assert((1 <= j) && (j <= ncvt)); + return (vt_[indx2f(i, j, ldvt)]); + }; +#else + auto v = [=](auto i, auto j) -> T & { + assert((1 <= i) && (i <= nrv) && (nrv <= ldv)); + assert((1 <= j) && (j <= ncv)); + return (v_[indx2f(i, j, ldv)]); + }; +#endif + + // --------------------------- + // emulate Fortran intrinsics + // --------------------------- + auto sign = [](auto a, auto b) { + auto const abs_a = std::abs(a); + return ((b >= 0) ? abs_a : -abs_a); + }; + + auto dble = [](auto x) { return (static_cast(x)); }; + + auto max = [](auto a, auto b) { return ((a > b) ? a : b); }; + + auto min = [](auto a, auto b) { return ((a < b) ? a : b); }; + + auto sqrt = [](auto x) { return (std::sqrt(x)); }; + + /* .. + * .. executable statements .. + * + * test the input parameters. + * + */ + + info = (!upper) && (!lower) ? -1 + : (n < 0) ? -2 +#ifdef USE_VT + : (ncvt < 0) ? -3 +#else + : (nrv < 0) ? -3 +#endif + : (nru < 0) ? -4 + : (ncc < 0) ? -5 + : (ldu < max(1, nru)) ? 
-11 + : 0; + + if (info != 0) + return; + + if (n == 0) + return; + if (n == 1) + goto L160; + /* + * if no singular vectors desired, use qd algorithm + */ + if (!rotate) { + call_lasq1(n, d(1), e(1), work(1), info); + /* + * if info equals 2, dqds didn't finish, try to finish + */ + if (info != 2) + return; + info = 0; + } + + nm1 = n - 1; + nm12 = nm1 + nm1; + nm13 = nm12 + nm1; + idir = 0; + /* + * get machine constants + * + */ + { + char cmach_eps = 'E'; + char cmach_unfl = 'S'; + call_lamch(cmach_eps, eps); + call_lamch(cmach_unfl, unfl); + } + /* + * if matrix lower bidiagonal, rotate to be upper bidiagonal + * by applying givens rotations on the left + */ + + if (lower) { + // do 10 i = 1, n - 1 + for (i = 1; i <= (n - 1); i++) { + call_lartg(d(i), e(i), cs, sn, r); + d(i) = r; + e(i) = sn * d(i + 1); + d(i + 1) = cs * d(i + 1); + work(i) = cs; + work(nm1 + i) = sn; + } + L10: + + /* + * update singular vectors if desired + */ + + if (nru > 0) { + // call_lasr( 'r', 'v', 'f', nru, n, work( 1 ), work( n ), u, ldu ); + char side = 'R'; + char pivot = 'V'; + char direct = 'F'; + if (use_gpu) { + + call_lasr_gpu(side, pivot, direct, nru, n, work(1), work(n), u(1, 1), + ldu, dwork, stream); + } else { + call_lasr(side, pivot, direct, nru, n, work(1), work(n), u(1, 1), ldu); + } + } + if (ncc > 0) { + // call_lasr( 'l', 'v', 'f', n, ncc, work( 1 ), work( n ), c, ldc ); + char side = 'L'; + char pivot = 'V'; + char direct = 'F'; + if (use_gpu) { + call_lasr_gpu(side, pivot, direct, n, ncc, work(1), work(n), c(1, 1), + ldc, dwork, stream); + } else { + call_lasr(side, pivot, direct, n, ncc, work(1), work(n), c(1, 1), ldc); + } + } + } + /* + * compute singular values to relative accuracy tol + * (by setting tol to be negative, algorithm will compute + * singular values to absolute accuracy abs(tol)*norm(input matrix)) + */ + + tolmul = max(ten, min(hndrd, pow(eps, meight))); + tol = tolmul * eps; + + /* + * compute approximate maximum, minimum singular values + */ + + /* + smax = zero + do 20 i = 1, n + smax = max( smax, abs( d( i ) ) ) + L20: + do 30 i = 1, n - 1 + smax = max( smax, abs( e( i ) ) ) + L30: + */ + smax = zero; + // do 20 i = 1, n + for (i = 1; i <= n; i++) { + smax = max(smax, abs(d(i))); + } +L20: + // do 30 i = 1, n - 1 + for (i = 1; i <= (n - 1); i++) { + smax = max(smax, abs(e(i))); + } +L30: + + sminl = zero; + if (tol >= zero) { + /* + * relative accuracy desired + */ + + sminoa = abs(d(1)); + if (sminoa == zero) + goto L50; + mu = sminoa; + // do 40 i = 2, n + for (i = 2; i <= n; i++) { + mu = abs(d(i)) * (mu / (mu + abs(e(i - 1)))); + sminoa = min(sminoa, mu); + if (sminoa == zero) + goto L50; + } + L40: + L50: + + sminoa = sminoa / sqrt(dble(n)); + thresh = max(tol * sminoa, maxitr * (n * (n * unfl))); + } else { + /* + * absolute accuracy desired + */ + + thresh = max(abs(tol) * smax, maxitr * (n * (n * unfl))); + } + /* + * prepare for main iteration loop for the singular values + * (maxit is the maximum number of passes through the inner + * loop permitted before nonconvergence signalled.) 
+ */ + maxitdivn = maxitr * n; + iterdivn = 0; + iter = -1; + oldll = -1; + oldm = -1; + /* + * m points to last element of unconverged part of matrix + */ + m = n; + /* + * begin main iteration loop + */ +L60: + /* + * check for convergence or exceeding iteration count + */ + if (m <= 1) + goto L160; + + if (iter >= n) { + iter = iter - n; + iterdivn = iterdivn + 1; + if (iterdivn >= maxitdivn) + goto L200; + } + /* + * find diagonal block of matrix to work on + */ + if (tol < zero && abs(d(m)) <= thresh) + d(m) = zero; + + smax = abs(d(m)); + smin = smax; + // do 70 lll = 1, m - 1 + for (lll = 1; lll <= (m - 1); lll++) { + ll = m - lll; + abss = abs(d(ll)); + abse = abs(e(ll)); + if (tol < zero && abss <= thresh) + d(ll) = zero; + if (abse <= thresh) + goto L80; + smin = min(smin, abss); + smax = max(smax, max(abss, abse)); + } +L70: + ll = 0; + goto L90; +L80: + e(ll) = zero; + /* + * matrix splits since e(ll) = 0 + */ + if (ll == m - 1) { + /* + * convergence of bottom singular value, return to top of loop + */ + m = m - 1; + goto L60; + } +L90: + ll = ll + 1; + /* + * e(ll) through e(m-1) are nonzero, e(ll-1) is zero + */ + if (ll == m - 1) { + /* + * 2 by 2 block, handle separately + */ + call_lasv2(d(m - 1), e(m - 1), d(m), sigmn, sigmx, sinr, cosr, sinl, cosl); + d(m - 1) = sigmx; + e(m - 1) = zero; + d(m) = sigmn; + /* + * compute singular vectors, if desired + */ +#ifdef USE_VT + if (ncvt > 0) { + if (use_gpu) { + call_rot_gpu(ncvt, vt(m - 1, 1), ldvt, vt(m, 1), ldvt, cosr, sinr); + } else { + call_rot(ncvt, vt(m - 1, 1), ldvt, vt(m, 1), ldvt, cosr, sinr); + } + } +#else + if (nrv > 0) { + if (use_gpu) { + call_rot_gpu(nrv, v(1,m - 1), 1, v(1,m), 1, cosr, sinr); + } else { + call_rot(nrv, v(1,m - 1), 1, v(1,m), 1, cosr, sinr); + } + } +#endif + if (nru > 0) { + if (use_gpu) { + call_rot_gpu(nru, u(1, m - 1), ione, u(1, m), ione, cosl, sinl); + } else { + call_rot(nru, u(1, m - 1), ione, u(1, m), ione, cosl, sinl); + } + } + if (ncc > 0) { + if (use_gpu) { + call_rot_gpu(ncc, c(m - 1, 1), ldc, c(m, 1), ldc, cosl, sinl); + } else { + call_rot(ncc, c(m - 1, 1), ldc, c(m, 1), ldc, cosl, sinl); + } + } + m = m - 2; + goto L60; + } + /* + * if working on new submatrix, choose shift direction + * (from larger end diagonal element towards smaller) + */ + if (ll > oldm || m < oldll) { + if (abs(d(ll)) >= abs(d(m))) { + /* + * chase bulge from top (big end) to bottom (small end) + */ + idir = 1; + } else { + /* + * chase bulge from bottom (big end) to top (small end) + */ + idir = 2; + } + } + /* + * apply convergence tests + */ + if (idir == 1) { + /* + * run convergence test in forward direction + * first apply standard test to bottom of matrix + */ + if (abs(e(m - 1)) <= abs(tol) * abs(d(m)) || + (tol < zero && abs(e(m - 1)) <= thresh)) { + e(m - 1) = zero; + goto L60; + } + + if (tol >= zero) { + /* + * if relative accuracy desired, + * apply convergence criterion forward + */ + mu = abs(d(ll)); + sminl = mu; + // do 100 lll = ll, m - 1 + for (lll = ll; lll <= (m - 1); lll++) { + if (abs(e(lll)) <= tol * mu) { + e(lll) = zero; + goto L60; + } + mu = abs(d(lll + 1)) * (mu / (mu + abs(e(lll)))); + sminl = min(sminl, mu); + } + // L100: + } + + } else { + /* + * run convergence test in backward direction + * first apply standard test to top of matrix + */ + if (abs(e(ll)) <= abs(tol) * abs(d(ll)) || + (tol < zero && abs(e(ll)) <= thresh)) { + e(ll) = zero; + goto L60; + } + + if (tol >= zero) { + /* + * if relative accuracy desired, + * apply convergence criterion backward + */ + mu = 
abs(d(m)); + sminl = mu; + // do 110 lll = m - 1, ll, -1 + for (lll = (m - 1); lll >= ll; lll--) { + if (abs(e(lll)) <= tol * mu) { + e(lll) = zero; + goto L60; + } + mu = abs(d(lll)) * (mu / (mu + abs(e(lll)))); + sminl = min(sminl, mu); + } + // L110: + } + } + oldll = ll; + oldm = m; + /* + * compute shift. first, test if shifting would ruin relative + * accuracy, and if so set the shift to zero. + */ + if (tol >= zero && n * tol * (sminl / smax) <= max(eps, hndrth * tol)) { + /* + * use a zero shift to avoid loss of relative accuracy + */ + shift = zero; + } else { + /* + * compute the shift from 2-by-2 block at end of matrix + */ + if (idir == 1) { + sll = abs(d(ll)); + call_las2(d(m - 1), e(m - 1), d(m), shift, r); + } else { + sll = abs(d(m)); + call_las2(d(ll), e(ll), d(ll + 1), shift, r); + } + /* + * test if shift negligible, and if so set to zero + */ + if (sll > zero) { + if ((shift / sll) * (shift / sll) < eps) + shift = zero; + } + } + /* + * increment iteration count + */ + iter = iter + m - ll; + /* + * if shift = 0, do simplified qr iteration + */ + if (shift == zero) { + if (idir == 1) { + /* + * chase bulge from top to bottom + * save cosines and sines for later singular vector updates + */ + cs = one; + oldcs = one; + // do 120 i = ll, m - 1 + for (i = ll; i <= (m - 1); i++) { + auto di_cs = d(i) * cs; + call_lartg(di_cs, e(i), cs, sn, r); + if (i > ll) + e(i - 1) = oldsn * r; + auto oldcs_r = oldcs * r; + auto dip1_sn = d(i + 1) * sn; + call_lartg(oldcs_r, dip1_sn, oldcs, oldsn, d(i)); + work(i - ll + 1) = cs; + work(i - ll + 1 + nm1) = sn; + work(i - ll + 1 + nm12) = oldcs; + work(i - ll + 1 + nm13) = oldsn; + } + L120: + h = d(m) * cs; + d(m) = h * oldcs; + e(m - 1) = h * oldsn; + /* + * update singular vectors + */ +#ifdef USE_VT + if (ncvt > 0) { + // call_lasr( 'l', 'v', 'f', m-ll+1, ncvt, work( 1 ), work( n ), vt( + // ll, 1 ), ldvt ) + char side = 'L'; + char pivot = 'V'; + char direct = 'F'; + auto mm = m - ll + 1; + if (use_gpu) { + call_lasr_gpu(side, pivot, direct, mm, ncvt, work(1), work(n), + vt(ll, 1), ldvt, dwork, stream); + } else { + call_lasr(side, pivot, direct, mm, ncvt, work(1), work(n), vt(ll, 1), + ldvt); + } + } +#else + + if (nrv > 0) { + // call_lasr( 'r', 'v', 'f', nrv, m-ll+1, work( 1 ), work( n ), + // v(1,ll ), ldv) + char side = 'R'; + char pivot = 'V'; + char direct = 'F'; + auto mm = m - ll + 1; + if (use_gpu) { + call_lasr_gpu(side, pivot, direct, nrv, mm, work(1), work(n), + v(1,ll), ldv, dwork, stream); + } else { + call_lasr(side, pivot, direct, nrv, mm, work(1), work(n), v(1,ll), + ldv); + } + } +#endif + if (nru > 0) { + // call_lasr( 'r', 'v', 'f', nru, m-ll+1, work( nm12+1 ), work( nm13+1 + // ), u( 1, ll ), ldu ) + char side = 'R'; + char pivot = 'V'; + char direct = 'F'; + auto mm = m - ll + 1; + if (use_gpu) { + call_lasr_gpu(side, pivot, direct, nru, mm, work(nm12 + 1), + work(nm13 + 1), u(1, ll), ldu, dwork, stream); + } else { + call_lasr(side, pivot, direct, nru, mm, work(nm12 + 1), + work(nm13 + 1), u(1, ll), ldu); + } + } + if (ncc > 0) { + // call_lasr( 'l', 'v', 'f', m-ll+1, ncc, work( nm12+1 ), work( nm13+1 + // ), c( ll, 1 ), ldc ) + char side = 'L'; + char pivot = 'V'; + char direct = 'F'; + auto mm = m - ll + 1; + if (use_gpu) { + call_lasr_gpu(side, pivot, direct, mm, ncc, work(nm12 + 1), + work(nm13 + 1), c(ll, 1), ldc, dwork, stream); + } else { + call_lasr(side, pivot, direct, mm, ncc, work(nm12 + 1), + work(nm13 + 1), c(ll, 1), ldc); + } + } + /* + * test convergence + */ + if (abs(e(m - 1)) <= thresh) + e(m 
- 1) = zero; + + } else { + /* + * chase bulge from bottom to top + * save cosines and sines for later singular vector updates + */ + cs = one; + oldcs = one; + // do 130 i = m, ll + 1, -1 + for (i = m; i >= (ll + 1); i--) { + auto di_cs = d(i) * cs; + call_lartg(di_cs, e(i - 1), cs, sn, r); + + if (i < m) + e(i) = oldsn * r; + + auto oldcs_r = oldcs * r; + auto dim1_sn = d(i - 1) * sn; + call_lartg(oldcs_r, dim1_sn, oldcs, oldsn, d(i)); + + work(i - ll) = cs; + work(i - ll + nm1) = -sn; + work(i - ll + nm12) = oldcs; + work(i - ll + nm13) = -oldsn; + } + L130: + h = d(ll) * cs; + d(ll) = h * oldcs; + e(ll) = h * oldsn; + /* + * update singular vectors + */ +#ifdef USE_VT + if (ncvt > 0) { + // call_lasr( 'l', 'v', 'b', m-ll+1, ncvt, work( nm12+1 ), work( + // nm13+1 + // ), vt( ll, 1 ), ldvt ); + char side = 'L'; + char pivot = 'V'; + char direct = 'B'; + auto mm = m - ll + 1; + if (use_gpu) { + call_lasr_gpu(side, pivot, direct, mm, ncvt, work(nm12 + 1), + work(nm13 + 1), vt(ll, 1), ldvt, dwork, stream); + } else { + call_lasr(side, pivot, direct, mm, ncvt, work(nm12 + 1), + work(nm13 + 1), vt(ll, 1), ldvt); + } + } +#else + + if (nrv > 0) { + // call_lasr( 'r', 'v', 'b', nrv, m-ll+1, work( nm12+1 ), work( nm13+1 ), v( 1,ll ), ldv ); + char side = 'R'; + char pivot = 'V'; + char direct = 'B'; + auto mm = m - ll + 1; + if (use_gpu) { + call_lasr_gpu(side, pivot, direct, nrv, mm, work(nm12 + 1), + work(nm13 + 1), v(1,ll), ldv, dwork, stream); + } else { + call_lasr(side, pivot, direct, nrv, mm, work(nm12 + 1), + work(nm13 + 1), v(1,ll), ldv); + } + } + + +#endif + + if (nru > 0) { + // call_lasr( 'r', 'v', 'b', nru, m-ll+1, work( 1 ), work( n ), u( 1, + // ll + // ), ldu ) + char side = 'R'; + char pivot = 'V'; + char direct = 'B'; + auto mm = m - ll + 1; + if (use_gpu) { + call_lasr_gpu(side, pivot, direct, nru, mm, work(1), work(n), + u(1, ll), ldu, dwork, stream); + } else { + call_lasr(side, pivot, direct, nru, mm, work(1), work(n), u(1, ll), + ldu); + } + } + if (ncc > 0) { + // call_lasr( 'l', 'v', 'b', m-ll+1, ncc, work( 1 ), work( n ), c( ll, + // 1 + // ), ldc ) + char side = 'L'; + char pivot = 'V'; + char direct = 'B'; + auto mm = m - ll + 1; + if (use_gpu) { + call_lasr_gpu(side, pivot, direct, mm, ncc, work(1), work(n), + c(ll, 1), ldc, dwork, stream); + } else { + call_lasr(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), + ldc); + } + } + /* + * test convergence + */ + if (abs(e(ll)) <= thresh) + e(ll) = zero; + } + } else { + /* + * use nonzero shift + */ + if (idir == 1) { + /* + * chase bulge from top to bottom + * save cosines and sines for later singular vector updates + */ + f = (abs(d(ll)) - shift) * (sign(one, d(ll)) + shift / d(ll)); + g = e(ll); + // do 140 i = ll, m - 1 + for (i = ll; i <= (m - 1); i++) { + call_lartg(f, g, cosr, sinr, r); + if (i > ll) + e(i - 1) = r; + f = cosr * d(i) + sinr * e(i); + e(i) = cosr * e(i) - sinr * d(i); + g = sinr * d(i + 1); + d(i + 1) = cosr * d(i + 1); + call_lartg(f, g, cosl, sinl, r); + d(i) = r; + f = cosl * e(i) + sinl * d(i + 1); + d(i + 1) = cosl * d(i + 1) - sinl * e(i); + if (i < m - 1) { + g = sinl * e(i + 1); + e(i + 1) = cosl * e(i + 1); + } + work(i - ll + 1) = cosr; + work(i - ll + 1 + nm1) = sinr; + work(i - ll + 1 + nm12) = cosl; + work(i - ll + 1 + nm13) = sinl; + } + L140: + e(m - 1) = f; + /* + * update singular vectors + */ +#ifdef USE_VT + if (ncvt > 0) { + // call_lasr( 'l', 'v', 'f', m-ll+1, ncvt, work( 1 ), work( n ), vt( + // ll, 1 ), ldvt ) + char side = 'L'; + char pivot = 'V'; + char direct = 
'F'; + auto mm = m - ll + 1; + if (use_gpu) { + call_lasr_gpu(side, pivot, direct, mm, ncvt, work(1), work(n), + vt(ll, 1), ldvt, dwork, stream); + } else { + call_lasr(side, pivot, direct, mm, ncvt, work(1), work(n), vt(ll, 1), + ldvt); + } + } +#else + + if (nrv > 0) { + // call_lasr( 'r', 'v', 'f', nrv, m-ll+1, work( 1 ), work( n ), v( 1, ll ), ldv ) + char side = 'R'; + char pivot = 'V'; + char direct = 'F'; + auto mm = m - ll + 1; + if (use_gpu) { + call_lasr_gpu(side, pivot, direct, nrv, mm, work(1), work(n), + v(1,ll), ldv, dwork, stream); + } else { + call_lasr(side, pivot, direct, nrv, mm, work(1), work(n), v(1,ll), + ldv); + } + } +#endif + + if (nru > 0) { + // call_lasr( 'r', 'v', 'f', nru, m-ll+1, work( nm12+1 ), work( nm13+1 + // ), u( 1, ll ), ldu ) + char side = 'R'; + char pivot = 'V'; + char direct = 'F'; + auto mm = m - ll + 1; + if (use_gpu) { + call_lasr_gpu(side, pivot, direct, nru, mm, work(nm12 + 1), + work(nm13 + 1), u(1, ll), ldu, dwork, stream); + } else { + call_lasr(side, pivot, direct, nru, mm, work(nm12 + 1), + work(nm13 + 1), u(1, ll), ldu); + } + } + if (ncc > 0) { + // call_lasr( 'l', 'v', 'f', m-ll+1, ncc, work( nm12+1 ), work( nm13+1 + // ), c( ll, 1 ), ldc ) + char side = 'L'; + char pivot = 'V'; + char direct = 'F'; + auto mm = m - ll + 1; + if (use_gpu) { + call_lasr_gpu(side, pivot, direct, mm, ncc, work(nm12 + 1), + work(nm13 + 1), c(ll, 1), ldc, dwork, stream); + } else { + call_lasr(side, pivot, direct, mm, ncc, work(nm12 + 1), + work(nm13 + 1), c(ll, 1), ldc); + } + } + /* + * test convergence + */ + if (abs(e(m - 1)) <= thresh) + e(m - 1) = zero; + + } else { + /* + * chase bulge from bottom to top + * save cosines and sines for later singular vector updates + */ + f = (abs(d(m)) - shift) * (sign(one, d(m)) + shift / d(m)); + g = e(m - 1); + // do 150 i = m, ll + 1, -1 + for (i = m; i >= (ll + 1); i--) { + call_lartg(f, g, cosr, sinr, r); + if (i < m) + e(i) = r; + f = cosr * d(i) + sinr * e(i - 1); + e(i - 1) = cosr * e(i - 1) - sinr * d(i); + g = sinr * d(i - 1); + d(i - 1) = cosr * d(i - 1); + call_lartg(f, g, cosl, sinl, r); + d(i) = r; + f = cosl * e(i - 1) + sinl * d(i - 1); + d(i - 1) = cosl * d(i - 1) - sinl * e(i - 1); + if (i > ll + 1) { + g = sinl * e(i - 2); + e(i - 2) = cosl * e(i - 2); + } + work(i - ll) = cosr; + work(i - ll + nm1) = -sinr; + work(i - ll + nm12) = cosl; + work(i - ll + nm13) = -sinl; + } + L150: + e(ll) = f; + /* + * test convergence + */ + if (abs(e(ll)) <= thresh) + e(ll) = zero; + /* + * update singular vectors if desired + */ +#ifdef USE_VT + if (ncvt > 0) { + // call_lasr( 'l', 'v', 'b', m-ll+1, ncvt, work( nm12+1 ), work( + // nm13+1 + // ), vt( ll, 1 ), ldvt ) + char side = 'L'; + char pivot = 'V'; + char direct = 'B'; + auto mm = m - ll + 1; + if (use_gpu) { + call_lasr_gpu(side, pivot, direct, mm, ncvt, work(nm12 + 1), + work(nm13 + 1), vt(ll, 1), ldvt, dwork, stream); + } else { + call_lasr(side, pivot, direct, mm, ncvt, work(nm12 + 1), + work(nm13 + 1), vt(ll, 1), ldvt); + } + } +#else + + if (nrv > 0) { + // call_lasr( 'r', 'v', 'b', nrv, m-ll+1, work( nm12+1 ), work( nm13+1 ), v( 1,ll ), ldv ) + char side = 'L'; + char pivot = 'V'; + char direct = 'B'; + auto mm = m - ll + 1; + if (use_gpu) { + call_lasr_gpu(side, pivot, direct, nrv, mm, work(nm12 + 1), + work(nm13 + 1), v(1,ll), ldv, dwork, stream); + } else { + call_lasr(side, pivot, direct, nrv, mm, work(nm12 + 1), + work(nm13 + 1), v(1,ll), ldv); + } + } +#endif + if (nru > 0) { + // call_lasr( 'r', 'v', 'b', nru, m-ll+1, work( 1 ), work( n ), u( 
1, + // ll + // ), ldu ) + char side = 'R'; + char pivot = 'V'; + char direct = 'B'; + auto mm = m - ll + 1; + if (use_gpu) { + call_lasr_gpu(side, pivot, direct, nru, mm, work(1), work(n), + u(1, ll), ldu, dwork, stream); + } else { + call_lasr(side, pivot, direct, nru, mm, work(1), work(n), u(1, ll), + ldu); + } + } + if (ncc > 0) { + // call_lasr( 'l', 'v', 'b', m-ll+1, ncc, work( 1 ), work( n ), c( ll, + // 1 + // ), ldc ) + char side = 'L'; + char pivot = 'V'; + char direct = 'B'; + auto mm = m - ll + 1; + if (use_gpu) { + call_lasr_gpu(side, pivot, direct, mm, ncc, work(1), work(n), + c(ll, 1), ldc, dwork, stream); + } else { + call_lasr(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), + ldc); + } + } + } + } + /* + * qr iteration finished, go back and check convergence + */ + goto L60; + +/* + * all singular values converged, so make them positive + */ +L160: + // do 170 i = 1, n + for (i = 1; i <= n; i++) { + if (d(i) < zero) { + d(i) = -d(i); + /* + * change sign of singular vectors, if desired + */ +#ifdef USE_VT + if (ncvt > 0) { + if (use_gpu) { + call_scal_gpu(ncvt, negone, vt(i, 1), ldvt); + } else { + call_scal(ncvt, negone, vt(i, 1), ldvt); + } + } +#else + + if (nrv > 0) { + if (use_gpu) { + call_scal_gpu(nrv, negone, v(1,i), 1); + } else { + call_scal(nrv, negone, v(1,i), 1); + } + } +#endif + } + } +L170: + + bool const need_sort = false; + if (need_sort) { + /* + * sort the singular values into decreasing order (insertion sort on + * singular values, but only one transposition per singular vector) + */ + // do 190 i = 1, n - 1 + for (i = 1; i <= (n - 1); i++) { + /* + * scan for smallest d(i) + */ + isub = 1; + smin = d(1); + // do 180 j = 2, n + 1 - i + for (j = 2; j <= (n + 1 - i); j++) { + if (d(j) <= smin) { + isub = j; + smin = d(j); + } + } + L180: + if (isub != n + 1 - i) { + /* + * swap singular values and vectors + */ + d(isub) = d(n + 1 - i); + d(n + 1 - i) = smin; +#ifdef USE_VT + if (ncvt > 0) { + if (use_gpu) { + call_swap_gpu(ncvt, vt(isub, 1), ldvt, vt(n + 1 - i, 1), ldvt); + } else { + call_swap(ncvt, vt(isub, 1), ldvt, vt(n + 1 - i, 1), ldvt); + } + } +#else + + if (nrv > 0) { + if (use_gpu) { + call_swap_gpu(nrv, v(1,isub), 1, v(1,n + 1 - i), 1); + } else { + call_swap(nrv, v(1,isub), 1, v(1,n + 1 - i), 1); + } + } +#endif + if (nru > 0) { + if (use_gpu) { + call_swap_gpu(nru, u(1, isub), ione, u(1, n + 1 - i), ione); + } else { + call_swap(nru, u(1, isub), ione, u(1, n + 1 - i), ione); + } + } + if (ncc > 0) { + if (use_gpu) { + call_swap_gpu(ncc, c(isub, 1), ldc, c(n + 1 - i, 1), ldc); + } else { + call_swap(ncc, c(isub, 1), ldc, c(n + 1 - i, 1), ldc); + } + } + } + } + } // end if (need_sort) + +L190: + goto L220; +/* + * maximum number of iterations exceeded, failure to converge + */ +L200: + info = 0; + // do 210 i = 1, n - 1 + for (i = 1; i <= (n - 1); i++) { + if (e(i) != zero) + info = info + 1; + } +L210: +L220: + return; + /* + * end of dbdsqr + */ +} + + + + +template +rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, + const rocblas_fill uplo_in, + const rocblas_int n, + const rocblas_int nv, + const rocblas_int nu, + const rocblas_int nc, + S* D, + const rocblas_stride strideD, + S* E, + const rocblas_stride strideE, + W1 V, + const rocblas_int shiftV, + const rocblas_int ldv, + const rocblas_stride strideV, + W2 U, + const rocblas_int shiftU, + const rocblas_int ldu, + const rocblas_stride strideU, + W3 C, + const rocblas_int shiftC, + const rocblas_int ldc, + const rocblas_stride strideC, + rocblas_int* 
info_array, + const rocblas_int batch_count, + rocblas_int* splits_map, + S* work) +{ + + // ------------------------- + // copy D into hD, E into hE + // ------------------------- + +hipStream_t stream; +rocblas_get_stream(handle, &stream ); + + + +S * hD = nullptr; +S * hE = nullptr; +HIP_CHECK( hipHostMalloc( &hD, sizeof(S) * n * batch_count )); +HIP_CHECK( hipHostMalloc( &hE, sizeof(S) * (n-1) * batch_count )); + +bool const use_single_copy_for_D = (strideD == n); +if (use_single_copy_for_D) { + void * const dst = (void *) &(hD[0]); + void * const src = (void *) D; + size_t const sizeBytes = sizeof(S) * n * batch_count; + hipMemcpyKind const kind = hipMemcpyDeviceToHost; + + HIP_CHECK( hipMemcpyAsync( dst, src, sizeBytes, kind, stream ) ); +} +else { + for(rocblas_int bid = 0; bid < batch_count; bid++) { + void * const dst = (void *) &(hD[ bid * n ]); + void * const src = (void *) D+ bid * strideD; + size_t const sizeBytes = sizeof(S) * n; + hipMemcpyKind const kind = hipMemcpyDeviceToHost; + HIP_CHECK( hipMemcpyAsync( dst, src, sizeBytes, kind, stream ) ); + } +} + +bool const use_single_copy_for_E = (strideE == (n-1)); +if (use_single_copy_for_E) { + void * const dst = (void *) &(hE[0]); + void * const src = (void *) E; + size_t const sizeBytes = sizeof(S) * (n-1) * batch_count; + hipMemcpyKind const kind = hipMemcpyDeviceToHost; + + HIP_CHECK( hipMemcpyAsync( dst, src, sizeBytes, kind, stream ) ); +} +else { + for(rocblas_int bid = 0; bid < batch_count; bid++) { + void * const dst = (void *) &(hE[ bid * (n-1) ]); + void * const src = (void *) E+ bid * strideE; + size_t const sizeBytes = sizeof(S) * (n-1); + hipMemcpyKind const kind = hipMemcpyDeviceToHost; + HIP_CHECK( hipMemcpyAsync( dst, src, sizeBytes, kind, stream ) ); + } + +} + + + + + for(rocblas_int bid = 0; bid < batch_count; bid++) { + std::vector hwork( 4 * n ); + + char uplo = (uplo_in == rocblas_fill_lower) ? 'L' : 'U'; + T *d_ = &(hD[ bid * n ] ); + T *e_ = &(hE[ bid * (n-1) ]); + + T *v_ = (nv > 0) ? load_ptr_batch(V, bid, shiftV, strideV) : nullptr; + T *u_ = (nu > 0) ? load_ptr_batch(U, bid, shiftU, strideU) : nullptr; + T *c_ = (nc > 0) ? 
load_ptr_batch(C, bid, shiftC, strideC) : nullptr; + S * work_ = &(hwork[0]); + S * dwork = work; + I info = 0; + + I ncv = nv; + I nru = nu; + I ncc = nc; + +#ifdef USE_VT + T *vt_ = v_; + I ldvt = ldv; + + I nrv = n; + I ncvt = nrv; + bdsqr_single_template( uplo, n, ncvt, nru, ncc, d_, e_, + vt_,ldvt, + u_,ldu, c_,ldc, work_, info, dwork, stream ); +#else + + bdsqr_single_template( uplo, n, ncv, nru, ncc, d_, e_, + v_,ldv, + u_,ldu, c_,ldc, work_, info, dwork, stream ); + +#endif + + + + info_array[bid] = info; + } // end for bid + + +if (use_single_copy_for_D) { + void * const src = (void *) &(hD[0]); + void * const dst = (void *) D; + size_t const sizeBytes = sizeof(S) * n * batch_count; + hipMemcpyKind const kind = hipMemcpyHostToDevice; + + HIP_CHECK( hipMemcpyAsync( dst, src, sizeBytes, kind, stream ) ); +} +else { + for(rocblas_int bid = 0; bid < batch_count; bid++) { + void * const src = (void *) &(hD[ bid * n ]); + void * const dst = (void *) D+ bid * strideD; + size_t const sizeBytes = sizeof(S) * n; + hipMemcpyKind const kind = hipMemcpyHostToDevice; + HIP_CHECK( hipMemcpyAsync( dst, src, sizeBytes, kind, stream ) ); + } +} + + +if (use_single_copy_for_E) { + void * const src = (void *) &(hE[0]); + void * const dst = (void *) E; + size_t const sizeBytes = sizeof(S) * (n-1) * batch_count; + hipMemcpyKind const kind = hipMemcpyHostToDevice; + + HIP_CHECK( hipMemcpyAsync( dst, src, sizeBytes, kind, stream ) ); +} +else { + for(rocblas_int bid = 0; bid < batch_count; bid++) { + void * const src = (void *) &(hE[ bid * (n-1) ]); + void * const dst = (void *) E+ bid * strideE; + size_t const sizeBytes = sizeof(S) * (n-1); + hipMemcpyKind const kind = hipMemcpyHostToDevice; + HIP_CHECK( hipMemcpyAsync( dst, src, sizeBytes, kind, stream ) ); + } + +} + +HIP_CHECK( hipHostFree( hD ) ); hD = nullptr; +HIP_CHECK( hipHostFree( hE ) ); hE = nullptr; + +return( rocblas_status_success ); + +} + + +ROCSOLVER_END_NAMESPACE From 3e40d99eb2d10d6c97c3169bbff8c1542de06d04 Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Wed, 29 May 2024 12:29:03 -0400 Subject: [PATCH 02/35] snapshot compile ok --- library/src/auxiliary/bdsqr_host.hpp | 3622 ++++++++++-------- library/src/auxiliary/rocauxiliary_bdsqr.hpp | 38 +- 2 files changed, 2064 insertions(+), 1596 deletions(-) diff --git a/library/src/auxiliary/bdsqr_host.hpp b/library/src/auxiliary/bdsqr_host.hpp index d40be74ab..5d3b0f6c9 100644 --- a/library/src/auxiliary/bdsqr_host.hpp +++ b/library/src/auxiliary/bdsqr_host.hpp @@ -43,823 +43,1110 @@ #include "hip/hip_runtime.h" #include "hip/hip_runtime_api.h" - ROCSOLVER_BEGIN_NAMESPACE #ifndef HIP_CHECK -#define HIP_CHECK(fcn) \ - { \ - hipError_t const istat = (fcn); \ - assert(istat == hipSuccess); \ - } +#define HIP_CHECK(fcn) \ + { \ + hipError_t const istat = (fcn); \ + assert(istat == hipSuccess); \ + } #endif - template -__global__ static void lasr_kernel(char const side, char const pivot, - char const direct, I const m, I const n, - S const *const c_, S const *const s_, - T *const A_, I const lda) { - - const auto nblocks = hipGridDim_x; - const auto nthreads_per_block = hipBlockDim_x; - const auto nthreads = nblocks * nthreads_per_block; - const auto tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; - const auto i_inc = nthreads; - const auto ij_nb = nthreads; - const auto ij_start = tid; - - auto max = [](auto x, auto y) { return ((x > y) ? x : y); }; - auto min = [](auto x, auto y) { return ((x < y) ? 
x : y); }; - - auto indx2f = [](auto i, auto j, auto lda) -> int64_t { - assert((1 <= i)); - assert((1 <= lda)); - assert((1 <= j)); - - return ((i - 1) + (j - 1) * int64_t(lda)); - }; - - auto indx1f = [](auto i) -> int64_t { - assert((1 <= i)); - return (i - int64_t(1)); - }; - - auto c = [&](auto i) -> const S & { return (c_[indx1f(i)]); }; - auto s = [&](auto i) -> const S & { return (s_[indx1f(i)]); }; - auto A = [&](auto i, auto j) -> T & { return (A_[indx2f(i, j, lda)]); }; - - const S one = 1; - const S zero = 0; - - // ---------------- - // check arguments - // ---------------- - - const bool is_side_Left = (side == 'L') || (side == 'l'); - const bool is_side_Right = (side == 'R') || (side == 'r'); - - const bool is_pivot_Variable = (pivot == 'V') || (pivot == 'v'); - const bool is_pivot_Bottom = (pivot == 'B') || (pivot == 'b'); - const bool is_pivot_Top = (pivot == 'T') || (pivot == 't'); - - const bool is_direct_Forward = (direct == 'F') || (direct == 'f'); - const bool is_direct_Backward = (direct == 'B') || (direct == 'b'); - - { - const bool isok_side = is_side_Left || is_side_Right; - const bool isok_pivot = - is_pivot_Variable || is_pivot_Bottom || is_pivot_Top; - const bool isok_direct = is_direct_Forward || is_direct_Backward; - - const I info = (!isok_side) ? 1 - : (!isok_pivot) ? 2 - : (!isok_direct) ? 3 - : (m < 0) ? 4 - : (n < 0) ? 5 - : (c_ == nullptr) ? 6 - : (s_ == nullptr) ? 7 - : (A_ == nullptr) ? 8 - : (lda < max(1, m)) ? 9 - : 0; - if (info != 0) - return; - }; - - { - const bool has_work = (m >= 1) && (n >= 1); - if (!has_work) { - return; +__global__ static void lasr_kernel(char const side, + char const pivot, + char const direct, + I const m, + I const n, + S const* const c_, + S const* const s_, + T* const A_, + I const lda) +{ + const auto nblocks = hipGridDim_x; + const auto nthreads_per_block = hipBlockDim_x; + const auto nthreads = nblocks * nthreads_per_block; + const auto tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + const auto i_inc = nthreads; + const auto ij_nb = nthreads; + const auto ij_start = tid; + + auto max = [](auto x, auto y) { return ((x > y) ? x : y); }; + auto min = [](auto x, auto y) { return ((x < y) ? x : y); }; + + auto indx2f = [](auto i, auto j, auto lda) -> int64_t { + assert((1 <= i)); + assert((1 <= lda)); + assert((1 <= j)); + + return ((i - 1) + (j - 1) * int64_t(lda)); }; - }; - - if (is_side_Left && is_pivot_Variable && is_direct_Forward) { - // ----------------------------- - // A := P*A - // Variable pivot, the plane (k,k+1) - // P = P(z-1) * ... 
* P(2) * P(1) - // ----------------------------- - { - for (I j = 1; j <= (m - 1); j++) { - const auto ctemp = c(j); - const auto stemp = s(j); - if ((ctemp != one) || (stemp != zero)) { - for (I i = 1 + tid; i <= n; i += i_inc) { - const auto temp = A(j + 1, i); - A(j + 1, i) = ctemp * temp - stemp * A(j, i); - A(j, i) = stemp * temp + ctemp * A(j, i); - } - }; - }; + auto indx1f = [](auto i) -> int64_t { + assert((1 <= i)); + return (i - int64_t(1)); }; - return; - }; - - if (is_side_Left && is_pivot_Variable && is_direct_Backward) { - // ----------------------------- - // A := P*A - // Variable pivot, the plane (k,k+1) - // P = P(1)*P(2)*...*P(z-1) - // ----------------------------- - - auto const jend = (m - 1); - auto const jstart = 1; - auto const istart = 1; - auto const iend = n; - - for (I j = jend; j >= jstart; j--) { - const auto ctemp = c(j); - const auto stemp = s(j); - if ((ctemp != one) || (stemp != zero)) { - for (I i = istart + tid; i <= iend; i += i_inc) { - const auto temp = A(j + 1, i); - A(j + 1, i) = ctemp * temp - stemp * A(j, i); - A(j, i) = stemp * temp + ctemp * A(j, i); - }; - }; - }; + auto c = [&](auto i) -> const S& { return (c_[indx1f(i)]); }; + auto s = [&](auto i) -> const S& { return (s_[indx1f(i)]); }; + auto A = [&](auto i, auto j) -> T& { return (A_[indx2f(i, j, lda)]); }; - return; - }; - - if (is_side_Left && is_pivot_Top && is_direct_Forward) { - // ----------------------------- - // A := P*A - // Top pivot, the plane (1,k+1) - // P = P(z-1) * ... * P(2) * P(1) - // ----------------------------- - { - for (I j = 2; j <= m; j++) { - const auto ctemp = c(j - 1); - const auto stemp = s(j - 1); - for (I i = 1 + tid; i <= n; i += i_inc) { - const auto temp = A(j, i); - A(j, i) = ctemp * temp - stemp * A(1, i); - A(1, i) = stemp * temp + ctemp * A(1, i); - }; - }; - }; + const S one = 1; + const S zero = 0; - return; - }; - - if (is_side_Left && is_pivot_Top && is_direct_Backward) { - // ----------------------------- - // A := P*A - // Top pivot, the plane (1,k+1) - // P = P(1)*P(2)*...*P(z-1) - // ----------------------------- - { + // ---------------- + // check arguments + // ---------------- - auto const jend = m; - auto const jstart = 2; - auto const istart = 1; - auto const iend = n; + const bool is_side_Left = (side == 'L') || (side == 'l'); + const bool is_side_Right = (side == 'R') || (side == 'r'); - for (I j = jend; j >= jstart; j--) { - const auto ctemp = c(j - 1); - const auto stemp = s(j - 1); - if ((ctemp != one) || (stemp != zero)) { - for (I i = istart + tid; i <= iend; i += i_inc) { - const auto temp = A(j, i); + const bool is_pivot_Variable = (pivot == 'V') || (pivot == 'v'); + const bool is_pivot_Bottom = (pivot == 'B') || (pivot == 'b'); + const bool is_pivot_Top = (pivot == 'T') || (pivot == 't'); - A(j, i) = ctemp * temp - stemp * A(1, i); - A(1, i) = stemp * temp + ctemp * A(1, i); - }; - }; - }; - } + const bool is_direct_Forward = (direct == 'F') || (direct == 'f'); + const bool is_direct_Backward = (direct == 'B') || (direct == 'b'); - return; - }; - - if (is_side_Left && is_pivot_Bottom && is_direct_Forward) { - // ----------------------------- - // A := P*A - // Bottom pivot, the plane (k,z) - // P = P(z-1) * ... * P(2) * P(1) - // ----------------------------- { + const bool isok_side = is_side_Left || is_side_Right; + const bool isok_pivot = is_pivot_Variable || is_pivot_Bottom || is_pivot_Top; + const bool isok_direct = is_direct_Forward || is_direct_Backward; + + const I info = (!isok_side) ? 1 + : (!isok_pivot) ? 
2 + : (!isok_direct) ? 3 + : (m < 0) ? 4 + : (n < 0) ? 5 + : (c_ == nullptr) ? 6 + : (s_ == nullptr) ? 7 + : (A_ == nullptr) ? 8 + : (lda < max(1, m)) ? 9 + : 0; + if(info != 0) + return; + }; - auto const jstart = 1; - auto const jend = (m - 1); - auto const istart = 1; - auto const iend = n; - - for (I j = jstart; j <= jend; j++) { - const auto ctemp = c(j); - const auto stemp = s(j); - if ((ctemp != one) || (stemp != zero)) { - for (I i = istart + tid; i <= iend; i += i_inc) { - const auto temp = A(j, i); - A(j, i) = stemp * A(m, i) + ctemp * temp; - A(m, i) = ctemp * A(m, i) - stemp * temp; - }; + { + const bool has_work = (m >= 1) && (n >= 1); + if(!has_work) + { + return; }; - }; - } + }; - return; - }; - - if (is_side_Left && is_pivot_Bottom && is_direct_Backward) { - // ----------------------------- - // A := P*A - // Bottom pivot, the plane (k,z) - // P = P(1)*P(2)*...*P(z-1) - // ----------------------------- + if(is_side_Left && is_pivot_Variable && is_direct_Forward) { - - auto const jend = (m - 1); - auto const jstart = 1; - auto const istart = 1; - auto const iend = n; - - for (I j = jend; j >= jstart; j--) { - const auto ctemp = c(j); - const auto stemp = s(j); - if ((ctemp != one) || (stemp != zero)) { - - for (I i = istart + tid; i <= iend; i += i_inc) { - const auto temp = A(j, i); - A(j, i) = stemp * A(m, i) + ctemp * temp; - A(m, i) = ctemp * A(m, i) - stemp * temp; - }; + // ----------------------------- + // A := P*A + // Variable pivot, the plane (k,k+1) + // P = P(z-1) * ... * P(2) * P(1) + // ----------------------------- + { + for(I j = 1; j <= (m - 1); j++) + { + const auto ctemp = c(j); + const auto stemp = s(j); + if((ctemp != one) || (stemp != zero)) + { + for(I i = 1 + tid; i <= n; i += i_inc) + { + const auto temp = A(j + 1, i); + A(j + 1, i) = ctemp * temp - stemp * A(j, i); + A(j, i) = stemp * temp + ctemp * A(j, i); + } + }; + }; }; - }; - } - - return; - }; - if (is_side_Right && is_pivot_Variable && is_direct_Forward) { - // ----------------------------- - // A := A*P**T - // Variable pivot, the plane (k,k+1) - // P = P(z-1) * ... 
* P(2) * P(1) - // ----------------------------- + return; + }; + if(is_side_Left && is_pivot_Variable && is_direct_Backward) { - - auto const jstart = 1; - auto const jend = (n - 1); - auto const istart = 1; - auto const iend = m; - - for (I j = jstart; j <= jend; j++) { - const auto ctemp = c(j); - const auto stemp = s(j); - if ((ctemp != one) || (stemp != zero)) { - for (I i = istart + tid; i <= iend; i += i_inc) { - const auto temp = A(i, j + 1); - A(i, j + 1) = ctemp * temp - stemp * A(i, j); - A(i, j) = stemp * temp + ctemp * A(i, j); - }; + // ----------------------------- + // A := P*A + // Variable pivot, the plane (k,k+1) + // P = P(1)*P(2)*...*P(z-1) + // ----------------------------- + + auto const jend = (m - 1); + auto const jstart = 1; + auto const istart = 1; + auto const iend = n; + + for(I j = jend; j >= jstart; j--) + { + const auto ctemp = c(j); + const auto stemp = s(j); + if((ctemp != one) || (stemp != zero)) + { + for(I i = istart + tid; i <= iend; i += i_inc) + { + const auto temp = A(j + 1, i); + A(j + 1, i) = ctemp * temp - stemp * A(j, i); + A(j, i) = stemp * temp + ctemp * A(j, i); + }; + }; }; - }; - } - - return; - }; - if (is_side_Right && is_pivot_Variable && is_direct_Backward) { - // ----------------------------- - // A := A*P**T - // Variable pivot, the plane (k,k+1) - // P = P(1)*P(2)*...*P(z-1) - // ----------------------------- + return; + }; + if(is_side_Left && is_pivot_Top && is_direct_Forward) { - - auto const jend = (n - 1); - auto const jstart = 1; - auto const istart = 1; - auto const iend = m; - - for (I j = jend; j >= jstart; j--) { - const auto ctemp = c(j); - const auto stemp = s(j); - if ((ctemp != one) || (stemp != zero)) { - - for (I i = istart + tid; i <= iend; i += i_inc) { - const auto temp = A(i, j + 1); - A(i, j + 1) = ctemp * temp - stemp * A(i, j); - A(i, j) = stemp * temp + ctemp * A(i, j); - }; + // ----------------------------- + // A := P*A + // Top pivot, the plane (1,k+1) + // P = P(z-1) * ... * P(2) * P(1) + // ----------------------------- + { + for(I j = 2; j <= m; j++) + { + const auto ctemp = c(j - 1); + const auto stemp = s(j - 1); + for(I i = 1 + tid; i <= n; i += i_inc) + { + const auto temp = A(j, i); + A(j, i) = ctemp * temp - stemp * A(1, i); + A(1, i) = stemp * temp + ctemp * A(1, i); + }; + }; }; - }; - } - return; - }; - if (is_side_Right && is_pivot_Top && is_direct_Forward) { - // ----------------------------- - // A := A*P**T - // Top pivot, the plane (1,k+1) - // P = P(z-1) * ... 
* P(2) * P(1) - // ----------------------------- + return; + }; + if(is_side_Left && is_pivot_Top && is_direct_Backward) { + // ----------------------------- + // A := P*A + // Top pivot, the plane (1,k+1) + // P = P(1)*P(2)*...*P(z-1) + // ----------------------------- + { + auto const jend = m; + auto const jstart = 2; + auto const istart = 1; + auto const iend = n; + + for(I j = jend; j >= jstart; j--) + { + const auto ctemp = c(j - 1); + const auto stemp = s(j - 1); + if((ctemp != one) || (stemp != zero)) + { + for(I i = istart + tid; i <= iend; i += i_inc) + { + const auto temp = A(j, i); + + A(j, i) = ctemp * temp - stemp * A(1, i); + A(1, i) = stemp * temp + ctemp * A(1, i); + }; + }; + }; + } - auto const jstart = 2; - auto const jend = n; - auto const istart = 1; - auto const iend = m; - - for (I j = jstart; j <= jend; j++) { - const auto ctemp = c(j - 1); - const auto stemp = s(j - 1); - if ((ctemp != one) || (stemp != zero)) { - for (I i = istart + tid; i <= iend; i += i_inc) { - const auto temp = A(i, j); - - A(i, j) = ctemp * temp - stemp * A(i, 1); - A(i, 1) = stemp * temp + ctemp * A(i, 1); - }; - }; - }; - } + return; + }; - return; - }; + if(is_side_Left && is_pivot_Bottom && is_direct_Forward) + { + // ----------------------------- + // A := P*A + // Bottom pivot, the plane (k,z) + // P = P(z-1) * ... * P(2) * P(1) + // ----------------------------- + { + auto const jstart = 1; + auto const jend = (m - 1); + auto const istart = 1; + auto const iend = n; + + for(I j = jstart; j <= jend; j++) + { + const auto ctemp = c(j); + const auto stemp = s(j); + if((ctemp != one) || (stemp != zero)) + { + for(I i = istart + tid; i <= iend; i += i_inc) + { + const auto temp = A(j, i); + A(j, i) = stemp * A(m, i) + ctemp * temp; + A(m, i) = ctemp * A(m, i) - stemp * temp; + }; + }; + }; + } - if (is_side_Right && is_pivot_Top && is_direct_Backward) { - // ----------------------------- - // A := A*P**T - // Top pivot, the plane (1,k+1) - // P = P(1)*P(2)*...*P(z-1) - // ----------------------------- + return; + }; + if(is_side_Left && is_pivot_Bottom && is_direct_Backward) { + // ----------------------------- + // A := P*A + // Bottom pivot, the plane (k,z) + // P = P(1)*P(2)*...*P(z-1) + // ----------------------------- + { + auto const jend = (m - 1); + auto const jstart = 1; + auto const istart = 1; + auto const iend = n; + + for(I j = jend; j >= jstart; j--) + { + const auto ctemp = c(j); + const auto stemp = s(j); + if((ctemp != one) || (stemp != zero)) + { + for(I i = istart + tid; i <= iend; i += i_inc) + { + const auto temp = A(j, i); + A(j, i) = stemp * A(m, i) + ctemp * temp; + A(m, i) = ctemp * A(m, i) - stemp * temp; + }; + }; + }; + } - auto const jend = n; - auto const jstart = 2; - auto const istart = 1; - auto const iend = m; - - for (I j = jend; j >= jstart; j--) { - const auto ctemp = c(j - 1); - const auto stemp = s(j - 1); - if ((ctemp != one) || (stemp != zero)) { - for (I i = istart + tid; i <= iend; i += i_inc) { - const auto temp = A(i, j); + return; + }; - A(i, j) = ctemp * temp - stemp * A(i, 1); - A(i, 1) = stemp * temp + ctemp * A(i, 1); - }; - }; - }; - } + if(is_side_Right && is_pivot_Variable && is_direct_Forward) + { + // ----------------------------- + // A := A*P**T + // Variable pivot, the plane (k,k+1) + // P = P(z-1) * ... 
* P(2) * P(1) + // ----------------------------- + + { + auto const jstart = 1; + auto const jend = (n - 1); + auto const istart = 1; + auto const iend = m; + + for(I j = jstart; j <= jend; j++) + { + const auto ctemp = c(j); + const auto stemp = s(j); + if((ctemp != one) || (stemp != zero)) + { + for(I i = istart + tid; i <= iend; i += i_inc) + { + const auto temp = A(i, j + 1); + A(i, j + 1) = ctemp * temp - stemp * A(i, j); + A(i, j) = stemp * temp + ctemp * A(i, j); + }; + }; + }; + } - return; - }; + return; + }; - if (is_side_Right && is_pivot_Bottom && is_direct_Forward) { - // ----------------------------- - // A := A*P**T - // Bottom pivot, the plane (k,z) - // P = P(z-1) * ... * P(2) * P(1) - // ----------------------------- + if(is_side_Right && is_pivot_Variable && is_direct_Backward) + { + // ----------------------------- + // A := A*P**T + // Variable pivot, the plane (k,k+1) + // P = P(1)*P(2)*...*P(z-1) + // ----------------------------- + + { + auto const jend = (n - 1); + auto const jstart = 1; + auto const istart = 1; + auto const iend = m; + + for(I j = jend; j >= jstart; j--) + { + const auto ctemp = c(j); + const auto stemp = s(j); + if((ctemp != one) || (stemp != zero)) + { + for(I i = istart + tid; i <= iend; i += i_inc) + { + const auto temp = A(i, j + 1); + A(i, j + 1) = ctemp * temp - stemp * A(i, j); + A(i, j) = stemp * temp + ctemp * A(i, j); + }; + }; + }; + } + return; + }; + if(is_side_Right && is_pivot_Top && is_direct_Forward) { + // ----------------------------- + // A := A*P**T + // Top pivot, the plane (1,k+1) + // P = P(z-1) * ... * P(2) * P(1) + // ----------------------------- + + { + auto const jstart = 2; + auto const jend = n; + auto const istart = 1; + auto const iend = m; + + for(I j = jstart; j <= jend; j++) + { + const auto ctemp = c(j - 1); + const auto stemp = s(j - 1); + if((ctemp != one) || (stemp != zero)) + { + for(I i = istart + tid; i <= iend; i += i_inc) + { + const auto temp = A(i, j); + + A(i, j) = ctemp * temp - stemp * A(i, 1); + A(i, 1) = stemp * temp + ctemp * A(i, 1); + }; + }; + }; + } - auto const jstart = 1; - auto const jend = (n - 1); - auto const istart = 1; - auto const iend = m; + return; + }; - for (I j = jstart; j <= jend; j++) { - const auto ctemp = c(j); - const auto stemp = s(j); - if ((ctemp != one) || (stemp != zero)) { - for (I i = istart + tid; i <= iend; i += i_inc) { - const auto temp = A(i, j); + if(is_side_Right && is_pivot_Top && is_direct_Backward) + { + // ----------------------------- + // A := A*P**T + // Top pivot, the plane (1,k+1) + // P = P(1)*P(2)*...*P(z-1) + // ----------------------------- + + { + auto const jend = n; + auto const jstart = 2; + auto const istart = 1; + auto const iend = m; + + for(I j = jend; j >= jstart; j--) + { + const auto ctemp = c(j - 1); + const auto stemp = s(j - 1); + if((ctemp != one) || (stemp != zero)) + { + for(I i = istart + tid; i <= iend; i += i_inc) + { + const auto temp = A(i, j); + + A(i, j) = ctemp * temp - stemp * A(i, 1); + A(i, 1) = stemp * temp + ctemp * A(i, 1); + }; + }; + }; + } - A(i, j) = stemp * A(i, n) + ctemp * temp; - A(i, n) = ctemp * A(i, n) - stemp * temp; - }; - }; - }; - } + return; + }; - return; - }; + if(is_side_Right && is_pivot_Bottom && is_direct_Forward) + { + // ----------------------------- + // A := A*P**T + // Bottom pivot, the plane (k,z) + // P = P(z-1) * ... 
* P(2) * P(1) + // ----------------------------- + + { + auto const jstart = 1; + auto const jend = (n - 1); + auto const istart = 1; + auto const iend = m; + + for(I j = jstart; j <= jend; j++) + { + const auto ctemp = c(j); + const auto stemp = s(j); + if((ctemp != one) || (stemp != zero)) + { + for(I i = istart + tid; i <= iend; i += i_inc) + { + const auto temp = A(i, j); + + A(i, j) = stemp * A(i, n) + ctemp * temp; + A(i, n) = ctemp * A(i, n) - stemp * temp; + }; + }; + }; + } - if (is_side_Right && is_pivot_Bottom && is_direct_Backward) { - // ----------------------------- - // A := A*P**T - // Bottom pivot, the plane (k,z) - // P = P(1)*P(2)*...*P(z-1) - // ----------------------------- + return; + }; + if(is_side_Right && is_pivot_Bottom && is_direct_Backward) { + // ----------------------------- + // A := A*P**T + // Bottom pivot, the plane (k,z) + // P = P(1)*P(2)*...*P(z-1) + // ----------------------------- + + { + auto const jend = (n - 1); + auto const jstart = 1; + auto const istart = 1; + auto const iend = m; + + for(I j = jend; j >= jstart; j--) + { + const auto ctemp = c(j); + const auto stemp = s(j); + if((ctemp != one) || (stemp != zero)) + { + for(I i = istart + tid; i <= iend; i += i_inc) + { + const auto temp = A(i, j); + A(i, j) = stemp * A(i, n) + ctemp * temp; + A(i, n) = ctemp * A(i, n) - stemp * temp; + }; + }; + }; + } - auto const jend = (n - 1); - auto const jstart = 1; - auto const istart = 1; - auto const iend = m; - - for (I j = jend; j >= jstart; j--) { - const auto ctemp = c(j); - const auto stemp = s(j); - if ((ctemp != one) || (stemp != zero)) { - for (I i = istart + tid; i <= iend; i += i_inc) { - const auto temp = A(i, j); - A(i, j) = stemp * A(i, n) + ctemp * temp; - A(i, n) = ctemp * A(i, n) - stemp * temp; - }; - }; - }; - } + return; + }; return; - }; - - return; } template -static void lasr_template_gpu(char const side, char const pivot, - char const direct, I const m, I const n, - S const *const c_, S const *const s_, T *const A_, - I const lda, hipStream_t stream = 0) { - - auto const nthreads = 2 * warpSize; +static void lasr_template_gpu(char const side, + char const pivot, + char const direct, + I const m, + I const n, + S const* const c_, + S const* const s_, + T* const A_, + I const lda, + hipStream_t stream = 0) +{ + auto const nthreads = 2 * warpSize; - bool const is_left_side = (side == 'L') || (side == 'l'); - auto const mn = (is_left_side) ? n : m; + bool const is_left_side = (side == 'L') || (side == 'l'); + auto const mn = (is_left_side) ? 
n : m; - auto const nblocks = (mn - 1) / nthreads + 1; - hipLaunchKernelGGL((lasr_kernel), dim3(nblocks, 1, 1), - dim3(nthreads, 1, 1), 0, stream, side, pivot, direct, m, n, - c_, s_, A_, lda); + auto const nblocks = (mn - 1) / nthreads + 1; + hipLaunchKernelGGL((lasr_kernel), dim3(nblocks, 1, 1), dim3(nthreads, 1, 1), 0, stream, + side, pivot, direct, m, n, c_, s_, A_, lda); } template -__global__ static void rot_kernel(I const n, T *const x, I const incx, - T *const y, I const incy, S const c, - S const s) { - if (n <= 0) - return; +__global__ static void + rot_kernel(I const n, T* const x, I const incx, T* const y, I const incy, S const c, S const s) +{ + if(n <= 0) + return; - I const i_start = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; - I const i_inc = hipBlockDim_x * hipGridDim_x; - - if ((incx == 1) && (incy == 1)) { - // ------------ - // special case - // ------------ - for (I i = i_start; i < n; i += i_inc) { - auto const temp = c * x[i] + s * y[i]; - y[i] = c * y[i] - s * x[i]; - x[i] = temp; - } - } else { - // --------------------------- - // code for unequal increments - // --------------------------- + I const i_start = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + I const i_inc = hipBlockDim_x * hipGridDim_x; - for (auto i = i_start; i < n; i += i_inc) { - auto const ix = 0 + i * static_cast(incx); - auto const iy = 0 + i * static_cast(incy); - auto const temp = c * x[ix] + s * y[iy]; - y[iy] = c * y[iy] - s * x[ix]; - x[ix] = temp; + if((incx == 1) && (incy == 1)) + { + // ------------ + // special case + // ------------ + for(I i = i_start; i < n; i += i_inc) + { + auto const temp = c * x[i] + s * y[i]; + y[i] = c * y[i] - s * x[i]; + x[i] = temp; + } + } + else + { + // --------------------------- + // code for unequal increments + // --------------------------- + + for(auto i = i_start; i < n; i += i_inc) + { + auto const ix = 0 + i * static_cast(incx); + auto const iy = 0 + i * static_cast(incy); + auto const temp = c * x[ix] + s * y[iy]; + y[iy] = c * y[iy] - s * x[ix]; + x[ix] = temp; + } } - } } template -static void rot_template(I const n, T *x, I const incx, T *y, I const incy, - S const c, S const s, hipStream_t stream) { - auto nthreads = warpSize * 2; - auto nblocks = (n - 1) / nthreads + 1; - - hipLaunchKernelGGL((rot_kernel), dim3(nblocks, 1, 1), - dim3(nthreads, 1, 1), 0, stream, n, x, incx, y, incy, c, - s); +static void + rot_template(I const n, T* x, I const incx, T* y, I const incy, S const c, S const s, hipStream_t stream) +{ + auto nthreads = warpSize * 2; + auto nblocks = (n - 1) / nthreads + 1; + + hipLaunchKernelGGL((rot_kernel), dim3(nblocks, 1, 1), dim3(nthreads, 1, 1), 0, stream, + n, x, incx, y, incy, c, s); } template -__global__ static void scal_kernel(I const n, S const da, T *const x, - I const incx) { - - if (n <= 0) - return; +__global__ static void scal_kernel(I const n, S const da, T* const x, I const incx) +{ + if(n <= 0) + return; - I const i_start = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; - I const i_inc = hipBlockDim_x * hipGridDim_x; + I const i_start = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + I const i_inc = hipBlockDim_x * hipGridDim_x; - S const zero = 0; - bool const is_da_zero = (da == zero); - if (incx == 1) { - for (I i = i_start; i < n; i += i_inc) { - x[i] = (is_da_zero) ? zero : da * x[i]; + S const zero = 0; + bool const is_da_zero = (da == zero); + if(incx == 1) + { + for(I i = i_start; i < n; i += i_inc) + { + x[i] = (is_da_zero) ? 
zero : da * x[i]; + } } - } else { - // --------------------------- - // code for non-unit increments - // --------------------------- - - for (I i = i_start; i < n; i += i_inc) { - auto const ix = 0 + i * static_cast(incx); - x[ix] = (is_da_zero) ? zero : da * x[ix]; + else + { + // --------------------------- + // code for non-unit increments + // --------------------------- + + for(I i = i_start; i < n; i += i_inc) + { + auto const ix = 0 + i * static_cast(incx); + x[ix] = (is_da_zero) ? zero : da * x[ix]; + } } - } } template -static void scal_template(I const n, S const da, T *const x, I const incx, - hipStream_t stream) { - auto nthreads = warpSize * 2; - auto nblocks = (n - 1) / nthreads + 1; +static void scal_template(I const n, S const da, T* const x, I const incx, hipStream_t stream) +{ + auto nthreads = warpSize * 2; + auto nblocks = (n - 1) / nthreads + 1; - hipLaunchKernelGGL((scal_kernel), dim3(nblocks, 1, 1), - dim3(nthreads, 1, 1), 0, stream, n, da, x, incx); + hipLaunchKernelGGL((scal_kernel), dim3(nblocks, 1, 1), dim3(nthreads, 1, 1), 0, stream, + n, da, x, incx); } template -__global__ static void swap_kernel(I const n, T *const x, I const incx, - T *const y, I const incy) { - if (n <= 0) - return; - - I const i_start = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; - I const i_inc = hipBlockDim_x * hipGridDim_x; - - if ((incx == 1) && (incy == 1)) { - // ------------ - // special case - // ------------ - for (I i = i_start; i < n; i += i_inc) { - auto const temp = y[i]; - y[i] = x[i]; - x[i] = temp; - } - } else { - // --------------------------- - // code for unequal increments - // --------------------------- +__global__ static void swap_kernel(I const n, T* const x, I const incx, T* const y, I const incy) +{ + if(n <= 0) + return; - for (I i = i_start; i < n; i += i_inc) { - auto const ix = 0 + i * static_cast(incx); - auto const iy = 0 + i * static_cast(incy); + I const i_start = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + I const i_inc = hipBlockDim_x * hipGridDim_x; - auto const temp = y[iy]; - y[iy] = x[ix]; - x[ix] = temp; + if((incx == 1) && (incy == 1)) + { + // ------------ + // special case + // ------------ + for(I i = i_start; i < n; i += i_inc) + { + auto const temp = y[i]; + y[i] = x[i]; + x[i] = temp; + } + } + else + { + // --------------------------- + // code for unequal increments + // --------------------------- + + for(I i = i_start; i < n; i += i_inc) + { + auto const ix = 0 + i * static_cast(incx); + auto const iy = 0 + i * static_cast(incy); + + auto const temp = y[iy]; + y[iy] = x[ix]; + x[ix] = temp; + } } - } } template -static void swap_template(I const n, T *x, I const incx, T *y, I const incy, - hipStream_t stream) { - auto nthreads = warpSize * 2; - auto nblocks = (n - 1) / nthreads + 1; +static void swap_template(I const n, T* x, I const incx, T* y, I const incy, hipStream_t stream) +{ + auto nthreads = warpSize * 2; + auto nblocks = (n - 1) / nthreads + 1; - hipLaunchKernelGGL((swap_kernel), dim3(nblocks, 1, 1), - dim3(nthreads, 1, 1), 0, stream, n, x, incx, y, incy); + hipLaunchKernelGGL((swap_kernel), dim3(nblocks, 1, 1), dim3(nthreads, 1, 1), 0, stream, + n, x, incx, y, incy); } extern "C" { -double dlamch_(char *cmach); -float slamch_(char *cmach); - -void zswap_(int *n, std::complex *zx, int *incx, - std::complex *zy, int *incy); - -void cswap_(int *n, std::complex *zx, int *incx, std::complex *zy, - int *incy); +double dlamch_(char* cmach); +float slamch_(char* cmach); + +void zswap_(int* n, std::complex* zx, int* incx, 
std::complex* zy, int* incy); + +void cswap_(int* n, std::complex* zx, int* incx, std::complex* zy, int* incy); + +void dswap_(int* n, double* zx, int* incx, double* zy, int* incy); + +void sswap_(int* n, float* zx, int* incx, float* zy, int* incy); + +void dlasq1_(int* n, double* D_, double* E_, double* rwork_, int* info_arg); +void slasq1_(int* n, float* D_, float* E_, float* rwork_, int* info_arg); + +void zlasr_(char* side, + char* pivot, + char* direct, + int* m, + int* n, + double* c, + double* s, + std::complex* A, + int* lda); +void clasr_(char* side, + char* pivot, + char* direct, + int* m, + int* n, + float* c, + float* s, + std::complex* A, + int* lda); +void slasr_(char* side, char* pivot, char* direct, int* m, int* n, float* c, float* s, float* A, int* lda); +void dlasr_(char* side, char* pivot, char* direct, int* m, int* n, double* c, double* s, double* A, int* lda); + +void dlasv2_(double* f, + double* g, + double* h, + double* ssmin, + double* ssmax, + double* snr, + double* csr, + double* snl, + double* csl); +void slasv2_(float* f, + float* g, + float* h, + float* ssmin, + float* ssmax, + float* snr, + float* csr, + float* snl, + float* csl); + +void zdrot_(int* n, + std::complex* zx, + int* incx, + std::complex* zy, + int* incy, + double* c, + double* s); + +void csrot_(int* n, + std::complex* zx, + int* incx, + std::complex* zy, + int* incy, + float* c, + float* s); + +void drot_(int* n, double* dx, int* incx, double* dy, int* incy, double* c, double* s); + +void srot_(int* n, float* dx, int* incx, float* dy, int* incy, float* c, float* s); + +void zdscal_(int* n, double* da, std::complex* zx, int* incx); +void csscal_(int* n, float* da, std::complex* zx, int* incx); +void zscal_(int* n, std::complex* za, std::complex* zx, int* incx); +void cscal_(int* n, std::complex* za, std::complex* zx, int* incx); +void dscal_(int* n, double* da, double* zx, int* incx); +void sscal_(int* n, float* da, float* zx, int* incx); + +void dlartg_(double* f, double* g, double* c, double* s, double* r); +void slartg_(float* f, float* g, float* c, float* s, float* r); +void zlartg_(std::complex* f, + std::complex* g, + double* c, + std::complex* s, + std::complex* r); +void clartg_(std::complex* f, + std::complex* g, + float* c, + std::complex* s, + std::complex* r); + +void dlas2_(double* f, double* g, double* h, double* ssmin, double* ssmax); +void slas2_(float* f, float* g, float* h, float* ssmin, float* ssmax); +}; -void dswap_(int *n, double *zx, int *incx, double *zy, int *incy); +static void call_lamch(char& cmach_arg, double& eps) +{ + char cmach = cmach_arg; + eps = dlamch_(&cmach); +} -void sswap_(int *n, float *zx, int *incx, float *zy, int *incy); +static void call_lamch(char& cmach_arg, float& eps) +{ + char cmach = cmach_arg; + eps = slamch_(&cmach); +} -void dlasq1_(int *n, double *D_, double *E_, double *rwork_, int *info_arg); -void slasq1_(int *n, float *D_, float *E_, float *rwork_, int *info_arg); +static void call_swap(int& n, + rocblas_complex_num& zx, + int& incx, + rocblas_complex_num& zy, + int& incy) +{ + cswap_(&n, (std::complex*)&zx, &incx, (std::complex*)&zy, &incy); +} -void zlasr_(char *side, char *pivot, char *direct, int *m, int *n, double *c, - double *s, std::complex *A, int *lda); -void clasr_(char *side, char *pivot, char *direct, int *m, int *n, float *c, - float *s, std::complex *A, int *lda); -void slasr_(char *side, char *pivot, char *direct, int *m, int *n, float *c, - float *s, float *A, int *lda); -void dlasr_(char *side, char *pivot, char *direct, 
int *m, int *n, double *c, - double *s, double *A, int *lda); +static void call_swap(int& n, std::complex& zx, int& incx, std::complex& zy, int& incy) +{ + cswap_(&n, &zx, &incx, &zy, &incy); +} -void dlasv2_(double *f, double *g, double *h, double *ssmin, double *ssmax, - double *snr, double *csr, double *snl, double *csl); -void slasv2_(float *f, float *g, float *h, float *ssmin, float *ssmax, - float *snr, float *csr, float *snl, float *csl); +static void call_swap(int& n, + rocblas_complex_num& zx, + int& incx, + rocblas_complex_num& zy, + int& incy) +{ + zswap_(&n, (std::complex*)&zx, &incx, (std::complex*)&zy, &incy); +} -void zdrot_(int *n, std::complex *zx, int *incx, - std::complex *zy, int *incy, double *c, double *s); +static void call_swap(int& n, std::complex& zx, int& incx, std::complex& zy, int& incy) +{ + zswap_(&n, &zx, &incx, &zy, &incy); +} -void csrot_(int *n, std::complex *zx, int *incx, std::complex *zy, - int *incy, float *c, float *s); +static void call_swap(int& n, float& zx, int& incx, float& zy, int& incy) +{ + sswap_(&n, &zx, &incx, &zy, &incy); +} -void drot_(int *n, double *dx, int *incx, double *dy, int *incy, double *c, - double *s); +static void call_swap(int& n, double& zx, int& incx, double& zy, int& incy) +{ + dswap_(&n, &zx, &incx, &zy, &incy); +} -void srot_(int *n, float *dx, int *incx, float *dy, int *incy, float *c, - float *s); +static void call_las2(double& f, double& g, double& h, double& ssmin, double& ssmax) +{ + dlas2_(&f, &g, &h, &ssmin, &ssmax); +} -void zdscal_(int *n, double *da, std::complex *zx, int *incx); -void csscal_(int *n, float *da, std::complex *zx, int *incx); -void dscal_(int *n, double *da, double *zx, int *incx); -void sscal_(int *n, float *da, float *zx, int *incx); +static void call_las2(float& f, float& g, float& h, float& ssmin, float& ssmax) +{ + slas2_(&f, &g, &h, &ssmin, &ssmax); +} -void dlartg_(double *f, double *g, double *c, double *s, double *r); -void slartg_(float *f, float *g, float *c, float *s, float *r); +static void call_lartg(double& f, double& g, double& c, double& s, double& r) +{ + dlartg_(&f, &g, &c, &s, &r); +} -void dlas2_(double *f, double *g, double *h, double *ssmin, double *ssmax); -void slas2_(float *f, float *g, float *h, float *ssmin, float *ssmax); -}; +static void call_lartg(float& f, float& g, float& c, float& s, float& r) +{ + slartg_(&f, &g, &c, &s, &r); +} -static void call_lamch(char &cmach_arg, double &eps) { - char cmach = cmach_arg; - eps = dlamch_(&cmach); +static void call_lartg(std::complex& f, + std::complex& g, + float& c, + std::complex& s, + std::complex& r) +{ + clartg_(&f, &g, &c, &s, &r); } -static void call_lamch(char &cmach_arg, float &eps) { - char cmach = cmach_arg; - eps = slamch_(&cmach); +static void call_lartg(std::complex& f, + std::complex& g, + double& c, + std::complex& s, + std::complex& r) +{ + zlartg_(&f, &g, &c, &s, &r); } -static void call_swap(int &n, std::complex &zx, int &incx, - std::complex &zy, int &incy) { - zswap_(&n, &zx, &incx, &zy, &incy); +static void call_scal(int& n, rocblas_complex_num& da, rocblas_complex_num& zx, int& incx) +{ + cscal_(&n, (std::complex*)&da, (std::complex*)&zx, &incx); } -static void call_swap(int &n, std::complex &zx, int &incx, - std::complex &zy, int &incy) { - cswap_(&n, &zx, &incx, &zy, &incy); +static void call_scal(int& n, std::complex& da, std::complex& zx, int& incx) +{ + cscal_(&n, &da, &zx, &incx); } -static void call_swap(int &n, float &zx, int &incx, float &zy, int &incy) { - sswap_(&n, &zx, &incx, &zy, 
&incy); +static void + call_scal(int& n, rocblas_complex_num& da, rocblas_complex_num& zx, int& incx) +{ + zscal_(&n, (std::complex*)&da, (std::complex*)&zx, &incx); } -static void call_swap(int &n, double &zx, int &incx, double &zy, int &incy) { - dswap_(&n, &zx, &incx, &zy, &incy); +static void call_scal(int& n, std::complex& da, std::complex& zx, int& incx) +{ + zscal_(&n, &da, &zx, &incx); } -static void call_las2(double &f, double &g, double &h, double &ssmin, - double &ssmax) { - dlas2_(&f, &g, &h, &ssmin, &ssmax); +static void call_scal(int& n, double& da, rocblas_complex_num& zx, int& incx) +{ + zdscal_(&n, &da, (std::complex*)&zx, &incx); } -static void call_las2(float &f, float &g, float &h, float &ssmin, - float &ssmax) { - slas2_(&f, &g, &h, &ssmin, &ssmax); +static void call_scal(int& n, double& da, std::complex& zx, int& incx) +{ + zdscal_(&n, &da, &zx, &incx); } -static void call_lartg(double &f, double &g, double &c, double &s, double &r) { - dlartg_(&f, &g, &c, &s, &r); +static void call_scal(int& n, float& da, rocblas_complex_num& zx, int& incx) +{ + csscal_(&n, &da, (std::complex*)&zx, &incx); } -static void call_lartg(float &f, float &g, float &c, float &s, float &r) { - slartg_(&f, &g, &c, &s, &r); +static void call_scal(int& n, float& da, std::complex& zx, int& incx) +{ + csscal_(&n, &da, &zx, &incx); } -static void call_scal(int &n, double &da, std::complex &zx, int &incx) { - zdscal_(&n, &da, &zx, &incx); +static void call_scal(int& n, double& da, double& zx, int& incx) +{ + dscal_(&n, &da, &zx, &incx); } -static void call_scal(int &n, float &da, std::complex &zx, int &incx) { - csscal_(&n, &da, &zx, &incx); +static void call_scal(int& n, float& da, float& zx, int& incx) +{ + sscal_(&n, &da, &zx, &incx); } -static void call_scal(int &n, double &da, double &zx, int &incx) { - dscal_(&n, &da, &zx, &incx); +static void call_rot(int& n, + std::complex& zx, + int& incx, + std::complex& zy, + int& incy, + float& c, + float& s) +{ + csrot_(&n, &zx, &incx, &zy, &incy, &c, &s); } -static void call_scal(int &n, float &da, float &zx, int &incx) { - sscal_(&n, &da, &zx, &incx); +static void call_rot(int& n, + rocblas_complex_num& zx, + int& incx, + rocblas_complex_num& zy, + int& incy, + float& c, + float& s) +{ + csrot_(&n, (std::complex*)&zx, &incx, (std::complex*)&zy, &incy, &c, &s); } -static void call_rot(int &n, std::complex &zx, int &incx, - std::complex &zy, int &incy, double &c, - double &s) { - zdrot_(&n, &zx, &incx, &zy, &incy, &c, &s); +static void call_rot(int& n, + std::complex& zx, + int& incx, + std::complex& zy, + int& incy, + double& c, + double& s) +{ + zdrot_(&n, &zx, &incx, &zy, &incy, &c, &s); } -static void call_rot(int &n, std::complex &zx, int &incx, - std::complex &zy, int &incy, float &c, float &s) { - csrot_(&n, &zx, &incx, &zy, &incy, &c, &s); +static void call_rot(int& n, + rocblas_complex_num& zx, + int& incx, + rocblas_complex_num& zy, + int& incy, + double& c, + double& s) +{ + zdrot_(&n, (std::complex*)&zx, &incx, (std::complex*)&zy, &incy, &c, &s); } -static void call_rot(int &n, double &dx, int &incx, double &dy, int &incy, - double &c, double &s) { - drot_(&n, &dx, &incx, &dy, &incy, &c, &s); +static void call_rot(int& n, double& dx, int& incx, double& dy, int& incy, double& c, double& s) +{ + drot_(&n, &dx, &incx, &dy, &incy, &c, &s); } -static void call_rot(int &n, float &dx, int &incx, float &dy, int &incy, - float &c, float &s) { - srot_(&n, &dx, &incx, &dy, &incy, &c, &s); +static void call_rot(int& n, float& dx, int& incx, float& dy, 
int& incy, float& c, float& s) +{ + srot_(&n, &dx, &incx, &dy, &incy, &c, &s); } -static void call_lasv2(double &f, double &g, double &h, double &ssmin, - double &ssmax, double &snr, double &csr, double &snl, - double &csl) { - dlasv2_(&f, &g, &h, &ssmin, &ssmax, &snr, &csr, &snl, &csl); +static void call_lasv2(double& f, + double& g, + double& h, + double& ssmin, + double& ssmax, + double& snr, + double& csr, + double& snl, + double& csl) +{ + dlasv2_(&f, &g, &h, &ssmin, &ssmax, &snr, &csr, &snl, &csl); } -static void call_lasv2(float &f, float &g, float &h, float &ssmin, float &ssmax, - float &snr, float &csr, float &snl, float &csl) { - slasv2_(&f, &g, &h, &ssmin, &ssmax, &snr, &csr, &snl, &csl); +static void call_lasv2(float& f, + float& g, + float& h, + float& ssmin, + float& ssmax, + float& snr, + float& csr, + float& snl, + float& csl) +{ + slasv2_(&f, &g, &h, &ssmin, &ssmax, &snr, &csr, &snl, &csl); } -static void call_lasq1(int &n, double &D_, double &E_, double &rwork_, - int &info_arg) { - dlasq1_(&n, &D_, &E_, &rwork_, &info_arg); +static void call_lasq1(int& n, double& D_, double& E_, double& rwork_, int& info_arg) +{ + dlasq1_(&n, &D_, &E_, &rwork_, &info_arg); +}; + +static void call_lasq1(int& n, float& D_, float& E_, float& rwork_, int& info_arg) +{ + slasq1_(&n, &D_, &E_, &rwork_, &info_arg); +}; + +static void call_lasr(char& side, + char& pivot, + char& direct, + int& m, + int& n, + float& c, + float& s, + rocblas_complex_num& A, + int& lda) +{ + clasr_(&side, &pivot, &direct, &m, &n, &c, &s, (std::complex*)&A, &lda); }; -static void call_lasq1(int &n, float &D_, float &E_, float &rwork_, - int &info_arg) { - slasq1_(&n, &D_, &E_, &rwork_, &info_arg); +static void call_lasr(char& side, + char& pivot, + char& direct, + int& m, + int& n, + double& c, + double& s, + rocblas_complex_num& A, + int& lda) +{ + zlasr_(&side, &pivot, &direct, &m, &n, &c, &s, (std::complex*)&A, &lda); }; -static void call_lasr(char &side, char &pivot, char &direct, int &m, int &n, - double &c, double &s, std::complex &A, int &lda) { - zlasr_(&side, &pivot, &direct, &m, &n, &c, &s, &A, &lda); +static void call_lasr(char& side, + char& pivot, + char& direct, + int& m, + int& n, + double& c, + double& s, + std::complex& A, + int& lda) +{ + zlasr_(&side, &pivot, &direct, &m, &n, &c, &s, &A, &lda); }; -static void call_lasr(char &side, char &pivot, char &direct, int &m, int &n, - float &c, float &s, std::complex &A, int &lda) { - clasr_(&side, &pivot, &direct, &m, &n, &c, &s, &A, &lda); +static void call_lasr(char& side, + char& pivot, + char& direct, + int& m, + int& n, + float& c, + float& s, + std::complex& A, + int& lda) +{ + clasr_(&side, &pivot, &direct, &m, &n, &c, &s, &A, &lda); }; -static void call_lasr(char &side, char &pivot, char &direct, int &m, int &n, - float &c, float &s, float &A, int &lda) { - slasr_(&side, &pivot, &direct, &m, &n, &c, &s, &A, &lda); +static void + call_lasr(char& side, char& pivot, char& direct, int& m, int& n, float& c, float& s, float& A, int& lda) +{ + slasr_(&side, &pivot, &direct, &m, &n, &c, &s, &A, &lda); }; -static void call_lasr(char &side, char &pivot, char &direct, int &m, int &n, - double &c, double &s, double &A, int &lda) { - dlasr_(&side, &pivot, &direct, &m, &n, &c, &s, &A, &lda); +static void call_lasr(char& side, + char& pivot, + char& direct, + int& m, + int& n, + double& c, + double& s, + double& A, + int& lda) +{ + dlasr_(&side, &pivot, &direct, &m, &n, &c, &s, &A, &lda); }; template -static void bdsqr_single_template(char uplo, I n, +static 
void bdsqr_single_template(char uplo, + I n, #ifdef USE_VT - I ncvt, + I ncvt, #else - I ncv, + I ncv, #endif - I nru, I ncc, T *d_, - T *e_, + I nru, + I ncc, + S* d_, + S* e_, #ifdef USE_VT - T *vt_, I ldvt, + T* vt_, + I ldvt, #else - T *v_, I ldv, + T* v_, + I ldv, #endif - T *u_, I ldu, T *c_, - int ldc, S *work_, I &info, - S *dwork = nullptr, hipStream_t stream = 0) { - - bool const use_gpu = (dwork != nullptr); - - S const zero = 0; - S const one = 1; - S negone = -1; - S const hndrd = 100; - S const hndrth = one / hndrd; - S const ten = 10; - S const eight = 8; - S const meight = -one / eight; - I const maxitr = 6; - I ione = 1; - - I nrv = n; - - bool const lower = (uplo == 'L') || (uplo == 'l'); - bool const upper = (uplo == 'U') || (uplo == 'u'); - /* + T* u_, + I ldu, + T* c_, + int ldc, + S* work_, + I& info, + S* dwork = nullptr, + hipStream_t stream = 0) +{ + bool const use_gpu = (dwork != nullptr); + bool constexpr need_sort = false; + + S const zero = 0; + S const one = 1; + S negone = -1; + S const hndrd = 100; + S const hndrth = one / hndrd; + S const ten = 10; + S const eight = 8; + S const meight = -one / eight; + I const maxitr = 6; + I ione = 1; + + I nrv = n; + + bool const lower = (uplo == 'L') || (uplo == 'l'); + bool const upper = (uplo == 'U') || (uplo == 'u'); + /* * rotate is true if any singular vectors desired, false otherwise */ #ifdef USE_VT - bool const rotate = (ncvt > 0) || (nru > 0) || (ncc > 0); + bool const rotate = (ncvt > 0) || (nru > 0) || (ncc > 0); #else - bool const rotate = (nrv > 0) || (nru > 0) || (ncc > 0); + bool const rotate = (nrv > 0) || (nru > 0) || (ncc > 0); #endif - I i = 0, idir = 0, isub = 0, iter = 0, iterdivn = 0, j = 0, ll = 0, lll = 0, - m = 0, maxitdivn = 0, nm1 = 0, nm12 = 0, nm13 = 0, oldll = 0, oldm = 0; + I i = 0, idir = 0, isub = 0, iter = 0, iterdivn = 0, j = 0, ll = 0, lll = 0, m = 0, + maxitdivn = 0, nm1 = 0, nm12 = 0, nm13 = 0, oldll = 0, oldm = 0; - I const nrc = n; // number of rows in C matrix - I const nrvt = n; // number of rows in VT matrix - I const ncu = n; // number of columns in U matrix + I const nrc = n; // number of rows in C matrix + I const nrvt = n; // number of rows in VT matrix + I const ncu = n; // number of columns in U matrix - S abse = 0, abss = 0, cosl = 0, cosr = 0, cs = 0, eps = 0, f = 0, g = 0, - h = 0, mu = 0, oldcs = 0, oldsn = 0, r = 0, shift = 0, sigmn = 0, sigmx = 0, - sinl = 0, sinr = 0, sll = 0, smax = 0, smin = 0, sminl = 0, sminoa = 0, - sn = 0, thresh = 0, tol = 0, tolmul = 0, unfl = 0; + S abse = 0, abss = 0, cosl = 0, cosr = 0, cs = 0, eps = 0, f = 0, g = 0, h = 0, mu = 0, + oldcs = 0, oldsn = 0, r = 0, shift = 0, sigmn = 0, sigmx = 0, sinl = 0, sinr = 0, sll = 0, + smax = 0, smin = 0, sminl = 0, sminoa = 0, sn = 0, thresh = 0, tol = 0, tolmul = 0, unfl = 0; - /* .. + /* .. * .. external functions .. 
logical lsame double precision dlamch @@ -873,210 +1160,217 @@ static void bdsqr_single_template(char uplo, I n, intrinsic abs, dble, max, min, sign, sqrt */ - auto call_swap_gpu = [=](I n, T &x, I incx, T &y, I incy) { - swap_template(n, &x, incx, &y, incy, stream); - }; + auto call_swap_gpu = [=](I n, T& x, I incx, T& y, I incy) { + swap_template(n, &x, incx, &y, incy, stream); + }; + + auto call_rot_gpu = [=](I n, T& x, I incx, T& y, I incy, S cosl, S sinl) { + rot_template(n, &x, incx, &y, incy, cosl, sinl, stream); + }; - auto call_rot_gpu = [=](I n, T &x, I incx, T &y, I incy, S cosl, S sinl) { - rot_template(n, &x, incx, &y, incy, cosl, sinl, stream); - }; + auto call_scal_gpu + = [=](I n, auto da, T& x, I incx) { scal_template(n, da, &x, incx, stream); }; + + auto call_lasr_gpu + = [=](char const side, char const pivot, char const direct, I const m, I const n, S& c, + S& s, T& A, I const lda, S* const dwork, hipStream_t stream) { + bool const is_left_side = (side == 'L') || (side == 'l'); + auto const mn = (is_left_side) ? m : n; + auto const mn_m1 = (mn - 1); + S* const dc = dwork; + S* const ds = dwork + mn_m1; + HIP_CHECK(hipMemcpyAsync(dc, &c, sizeof(S) * mn_m1, hipMemcpyHostToDevice, stream)); + HIP_CHECK(hipMemcpyAsync(ds, &s, sizeof(S) * mn_m1, hipMemcpyHostToDevice, stream)); + + lasr_template_gpu(side, pivot, direct, m, n, dc, ds, &A, lda, stream); + HIP_CHECK(hipStreamSynchronize(stream)); + }; - auto call_scal_gpu = [=](I n, auto da, T &x, I incx) { - scal_template(n, da, &x, incx, stream); - }; + auto abs = [](auto x) { return ((x >= 0) ? x : (-x)); }; - auto call_lasr_gpu = [=](char const side, char const pivot, char const direct, - I const m, I const n, S &c, S &s, T &A, I const lda, - S *const dwork, hipStream_t stream) { - bool const is_left_side = (side == 'L') || (side == 'l'); - auto const mn = (is_left_side) ? m : n; - auto const mn_m1 = (mn - 1); - S *const dc = dwork; - S *const ds = dwork + mn_m1; - HIP_CHECK(hipMemcpyAsync(dc, &c, sizeof(S) * mn_m1, hipMemcpyHostToDevice, - stream)); - HIP_CHECK(hipMemcpyAsync(ds, &s, sizeof(S) * mn_m1, hipMemcpyHostToDevice, - stream)); - - lasr_template_gpu(side, pivot, direct, m, n, dc, ds, &A, lda, stream); - HIP_CHECK(hipStreamSynchronize(stream)); - }; - - auto abs = [](auto x) { return ((x >= 0) ? 
x : (-x)); }; - - auto indx2f = [](auto i, auto j, auto ld) -> int64_t { - assert((1 <= i) && (i <= ld)); - assert((1 <= j)); - return ((i - 1) + (j - 1) * int64_t(ld)); - }; - - auto d = [=](auto i) -> T & { - assert((1 <= i) && (i <= n)); - return (d_[i - 1]); - }; - - auto e = [=](auto i) -> T & { - assert((1 <= i) && (i <= (n - 1))); - return (e_[i - 1]); - }; - auto work = [=](auto i) -> S & { return (work_[i - 1]); }; - - auto c = [=](auto i, auto j) -> T & { - assert((1 <= i) && (i <= nrc) && (nrc <= ldc)); - assert((1 <= j) && (j <= ncc)); - return (c_[indx2f(i, j, ldc)]); - }; - - auto u = [=](auto i, auto j) -> T & { - assert((1 <= i) && (i <= nru) && (nru <= ldu)); - assert((1 <= j) && (j <= ncu)); - return (u_[indx2f(i, j, ldu)]); - }; + auto indx2f = [](auto i, auto j, auto ld) -> int64_t { + assert((1 <= i) && (i <= ld)); + assert((1 <= j)); + return ((i - 1) + (j - 1) * int64_t(ld)); + }; + + auto d = [=](auto i) -> S& { + assert((1 <= i) && (i <= n)); + return (d_[i - 1]); + }; + + auto e = [=](auto i) -> S& { + assert((1 <= i) && (i <= (n - 1))); + return (e_[i - 1]); + }; + auto work = [=](auto i) -> S& { return (work_[i - 1]); }; + + auto c = [=](auto i, auto j) -> T& { + assert((1 <= i) && (i <= nrc) && (nrc <= ldc)); + assert((1 <= j) && (j <= ncc)); + return (c_[indx2f(i, j, ldc)]); + }; + + auto u = [=](auto i, auto j) -> T& { + assert((1 <= i) && (i <= nru) && (nru <= ldu)); + assert((1 <= j) && (j <= ncu)); + return (u_[indx2f(i, j, ldu)]); + }; #ifdef USE_VT - auto vt = [=](auto i, auto j) -> T & { - assert((1 <= i) && (i <= nrvt) && (nrvt <= ldvt)); - assert((1 <= j) && (j <= ncvt)); - return (vt_[indx2f(i, j, ldvt)]); - }; + auto vt = [=](auto i, auto j) -> T& { + assert((1 <= i) && (i <= nrvt) && (nrvt <= ldvt)); + assert((1 <= j) && (j <= ncvt)); + return (vt_[indx2f(i, j, ldvt)]); + }; #else - auto v = [=](auto i, auto j) -> T & { - assert((1 <= i) && (i <= nrv) && (nrv <= ldv)); - assert((1 <= j) && (j <= ncv)); - return (v_[indx2f(i, j, ldv)]); - }; + auto v = [=](auto i, auto j) -> T& { + assert((1 <= i) && (i <= nrv) && (nrv <= ldv)); + assert((1 <= j) && (j <= ncv)); + return (v_[indx2f(i, j, ldv)]); + }; #endif - // --------------------------- - // emulate Fortran intrinsics - // --------------------------- - auto sign = [](auto a, auto b) { - auto const abs_a = std::abs(a); - return ((b >= 0) ? abs_a : -abs_a); - }; + // --------------------------- + // emulate Fortran intrinsics + // --------------------------- + auto sign = [](auto a, auto b) { + auto const abs_a = std::abs(a); + return ((b >= 0) ? abs_a : -abs_a); + }; - auto dble = [](auto x) { return (static_cast(x)); }; + auto dble = [](auto x) { return (static_cast(x)); }; - auto max = [](auto a, auto b) { return ((a > b) ? a : b); }; + auto max = [](auto a, auto b) { return ((a > b) ? a : b); }; - auto min = [](auto a, auto b) { return ((a < b) ? a : b); }; + auto min = [](auto a, auto b) { return ((a < b) ? a : b); }; - auto sqrt = [](auto x) { return (std::sqrt(x)); }; + auto sqrt = [](auto x) { return (std::sqrt(x)); }; - /* .. + /* .. * .. executable statements .. * * test the input parameters. * */ - info = (!upper) && (!lower) ? -1 - : (n < 0) ? -2 + info = (!upper) && (!lower) ? -1 + : (n < 0) ? -2 #ifdef USE_VT - : (ncvt < 0) ? -3 + : (ncvt < 0) ? -3 #else - : (nrv < 0) ? -3 + : (nrv < 0) ? -3 #endif - : (nru < 0) ? -4 - : (ncc < 0) ? -5 - : (ldu < max(1, nru)) ? -11 - : 0; - - if (info != 0) - return; - - if (n == 0) - return; - if (n == 1) - goto L160; - /* + : (nru < 0) ? 
-4 + : (ncc < 0) ? -5 + : (ldu < max(1, nru)) ? -11 + : 0; + + if(info != 0) + return; + + if(n == 0) + return; + if(n == 1) + goto L160; + /* * if no singular vectors desired, use qd algorithm */ - if (!rotate) { - call_lasq1(n, d(1), e(1), work(1), info); - /* + if(!rotate) + { + call_lasq1(n, d(1), e(1), work(1), info); + /* * if info equals 2, dqds didn't finish, try to finish */ - if (info != 2) - return; - info = 0; - } + if(info != 2) + return; + info = 0; + } - nm1 = n - 1; - nm12 = nm1 + nm1; - nm13 = nm12 + nm1; - idir = 0; - /* + nm1 = n - 1; + nm12 = nm1 + nm1; + nm13 = nm12 + nm1; + idir = 0; + /* * get machine constants * */ - { - char cmach_eps = 'E'; - char cmach_unfl = 'S'; - call_lamch(cmach_eps, eps); - call_lamch(cmach_unfl, unfl); - } - /* + { + char cmach_eps = 'E'; + char cmach_unfl = 'S'; + call_lamch(cmach_eps, eps); + call_lamch(cmach_unfl, unfl); + } + /* * if matrix lower bidiagonal, rotate to be upper bidiagonal * by applying givens rotations on the left */ - if (lower) { - // do 10 i = 1, n - 1 - for (i = 1; i <= (n - 1); i++) { - call_lartg(d(i), e(i), cs, sn, r); - d(i) = r; - e(i) = sn * d(i + 1); - d(i + 1) = cs * d(i + 1); - work(i) = cs; - work(nm1 + i) = sn; - } - L10: + if(lower) + { + // do 10 i = 1, n - 1 + for(i = 1; i <= (n - 1); i++) + { + call_lartg(d(i), e(i), cs, sn, r); + d(i) = r; + e(i) = sn * d(i + 1); + d(i + 1) = cs * d(i + 1); + work(i) = cs; + work(nm1 + i) = sn; + } + L10: - /* + /* * update singular vectors if desired */ - if (nru > 0) { - // call_lasr( 'r', 'v', 'f', nru, n, work( 1 ), work( n ), u, ldu ); - char side = 'R'; - char pivot = 'V'; - char direct = 'F'; - if (use_gpu) { - - call_lasr_gpu(side, pivot, direct, nru, n, work(1), work(n), u(1, 1), - ldu, dwork, stream); - } else { - call_lasr(side, pivot, direct, nru, n, work(1), work(n), u(1, 1), ldu); - } - } - if (ncc > 0) { - // call_lasr( 'l', 'v', 'f', n, ncc, work( 1 ), work( n ), c, ldc ); - char side = 'L'; - char pivot = 'V'; - char direct = 'F'; - if (use_gpu) { - call_lasr_gpu(side, pivot, direct, n, ncc, work(1), work(n), c(1, 1), - ldc, dwork, stream); - } else { - call_lasr(side, pivot, direct, n, ncc, work(1), work(n), c(1, 1), ldc); - } + if(nru > 0) + { + // call_lasr( 'r', 'v', 'f', nru, n, work( 1 ), work( n ), u, ldu ); + char side = 'R'; + char pivot = 'V'; + char direct = 'F'; + if(use_gpu) + { + call_lasr_gpu(side, pivot, direct, nru, n, work(1), work(n), u(1, 1), ldu, dwork, + stream); + } + else + { + call_lasr(side, pivot, direct, nru, n, work(1), work(n), u(1, 1), ldu); + } + } + if(ncc > 0) + { + // call_lasr( 'l', 'v', 'f', n, ncc, work( 1 ), work( n ), c, ldc ); + char side = 'L'; + char pivot = 'V'; + char direct = 'F'; + if(use_gpu) + { + call_lasr_gpu(side, pivot, direct, n, ncc, work(1), work(n), c(1, 1), ldc, dwork, + stream); + } + else + { + call_lasr(side, pivot, direct, n, ncc, work(1), work(n), c(1, 1), ldc); + } + } } - } - /* + /* * compute singular values to relative accuracy tol * (by setting tol to be negative, algorithm will compute * singular values to absolute accuracy abs(tol)*norm(input matrix)) */ - tolmul = max(ten, min(hndrd, pow(eps, meight))); - tol = tolmul * eps; + tolmul = max(ten, min(hndrd, pow(eps, meight))); + tol = tolmul * eps; - /* + /* * compute approximate maximum, minimum singular values */ - /* + /* smax = zero do 20 i = 1, n smax = max( smax, abs( d( i ) ) ) @@ -1085,993 +1379,1151 @@ static void bdsqr_single_template(char uplo, I n, smax = max( smax, abs( e( i ) ) ) L30: */ - smax = zero; - // do 20 i = 1, 
n - for (i = 1; i <= n; i++) { - smax = max(smax, abs(d(i))); - } + smax = zero; + // do 20 i = 1, n + for(i = 1; i <= n; i++) + { + smax = max(smax, abs(d(i))); + } L20: - // do 30 i = 1, n - 1 - for (i = 1; i <= (n - 1); i++) { - smax = max(smax, abs(e(i))); - } + // do 30 i = 1, n - 1 + for(i = 1; i <= (n - 1); i++) + { + smax = max(smax, abs(e(i))); + } L30: - sminl = zero; - if (tol >= zero) { - /* + sminl = zero; + if(tol >= zero) + { + /* * relative accuracy desired */ - sminoa = abs(d(1)); - if (sminoa == zero) - goto L50; - mu = sminoa; - // do 40 i = 2, n - for (i = 2; i <= n; i++) { - mu = abs(d(i)) * (mu / (mu + abs(e(i - 1)))); - sminoa = min(sminoa, mu); - if (sminoa == zero) - goto L50; - } - L40: - L50: + sminoa = abs(d(1)); + if(sminoa == zero) + goto L50; + mu = sminoa; + // do 40 i = 2, n + for(i = 2; i <= n; i++) + { + mu = abs(d(i)) * (mu / (mu + abs(e(i - 1)))); + sminoa = min(sminoa, mu); + if(sminoa == zero) + goto L50; + } + L40: + L50: - sminoa = sminoa / sqrt(dble(n)); - thresh = max(tol * sminoa, maxitr * (n * (n * unfl))); - } else { - /* + sminoa = sminoa / sqrt(dble(n)); + thresh = max(tol * sminoa, maxitr * (n * (n * unfl))); + } + else + { + /* * absolute accuracy desired */ - thresh = max(abs(tol) * smax, maxitr * (n * (n * unfl))); - } - /* + thresh = max(abs(tol) * smax, maxitr * (n * (n * unfl))); + } + /* * prepare for main iteration loop for the singular values * (maxit is the maximum number of passes through the inner * loop permitted before nonconvergence signalled.) */ - maxitdivn = maxitr * n; - iterdivn = 0; - iter = -1; - oldll = -1; - oldm = -1; - /* + maxitdivn = maxitr * n; + iterdivn = 0; + iter = -1; + oldll = -1; + oldm = -1; + /* * m points to last element of unconverged part of matrix */ - m = n; - /* + m = n; + /* * begin main iteration loop */ L60: - /* + /* * check for convergence or exceeding iteration count */ - if (m <= 1) - goto L160; - - if (iter >= n) { - iter = iter - n; - iterdivn = iterdivn + 1; - if (iterdivn >= maxitdivn) - goto L200; - } - /* + if(m <= 1) + goto L160; + + if(iter >= n) + { + iter = iter - n; + iterdivn = iterdivn + 1; + if(iterdivn >= maxitdivn) + goto L200; + } + /* * find diagonal block of matrix to work on */ - if (tol < zero && abs(d(m)) <= thresh) - d(m) = zero; - - smax = abs(d(m)); - smin = smax; - // do 70 lll = 1, m - 1 - for (lll = 1; lll <= (m - 1); lll++) { - ll = m - lll; - abss = abs(d(ll)); - abse = abs(e(ll)); - if (tol < zero && abss <= thresh) - d(ll) = zero; - if (abse <= thresh) - goto L80; - smin = min(smin, abss); - smax = max(smax, max(abss, abse)); - } + if(tol < zero && abs(d(m)) <= thresh) + d(m) = zero; + + smax = abs(d(m)); + smin = smax; + // do 70 lll = 1, m - 1 + for(lll = 1; lll <= (m - 1); lll++) + { + ll = m - lll; + abss = abs(d(ll)); + abse = abs(e(ll)); + if(tol < zero && abss <= thresh) + d(ll) = zero; + if(abse <= thresh) + goto L80; + smin = min(smin, abss); + smax = max(smax, max(abss, abse)); + } L70: - ll = 0; - goto L90; + ll = 0; + goto L90; L80: - e(ll) = zero; - /* + e(ll) = zero; + /* * matrix splits since e(ll) = 0 */ - if (ll == m - 1) { - /* + if(ll == m - 1) + { + /* * convergence of bottom singular value, return to top of loop */ - m = m - 1; - goto L60; - } + m = m - 1; + goto L60; + } L90: - ll = ll + 1; - /* + ll = ll + 1; + /* * e(ll) through e(m-1) are nonzero, e(ll-1) is zero */ - if (ll == m - 1) { - /* + if(ll == m - 1) + { + /* * 2 by 2 block, handle separately */ - call_lasv2(d(m - 1), e(m - 1), d(m), sigmn, sigmx, sinr, cosr, sinl, cosl); - 
d(m - 1) = sigmx; - e(m - 1) = zero; - d(m) = sigmn; - /* + call_lasv2(d(m - 1), e(m - 1), d(m), sigmn, sigmx, sinr, cosr, sinl, cosl); + d(m - 1) = sigmx; + e(m - 1) = zero; + d(m) = sigmn; + /* * compute singular vectors, if desired */ #ifdef USE_VT - if (ncvt > 0) { - if (use_gpu) { - call_rot_gpu(ncvt, vt(m - 1, 1), ldvt, vt(m, 1), ldvt, cosr, sinr); - } else { - call_rot(ncvt, vt(m - 1, 1), ldvt, vt(m, 1), ldvt, cosr, sinr); - } - } + if(ncvt > 0) + { + if(use_gpu) + { + call_rot_gpu(ncvt, vt(m - 1, 1), ldvt, vt(m, 1), ldvt, cosr, sinr); + } + else + { + call_rot(ncvt, vt(m - 1, 1), ldvt, vt(m, 1), ldvt, cosr, sinr); + } + } #else - if (nrv > 0) { - if (use_gpu) { - call_rot_gpu(nrv, v(1,m - 1), 1, v(1,m), 1, cosr, sinr); - } else { - call_rot(nrv, v(1,m - 1), 1, v(1,m), 1, cosr, sinr); - } - } + if(nrv > 0) + { + if(use_gpu) + { + call_rot_gpu(nrv, v(1, m - 1), 1, v(1, m), 1, cosr, sinr); + } + else + { + call_rot(nrv, v(1, m - 1), 1, v(1, m), 1, cosr, sinr); + } + } #endif - if (nru > 0) { - if (use_gpu) { - call_rot_gpu(nru, u(1, m - 1), ione, u(1, m), ione, cosl, sinl); - } else { - call_rot(nru, u(1, m - 1), ione, u(1, m), ione, cosl, sinl); - } - } - if (ncc > 0) { - if (use_gpu) { - call_rot_gpu(ncc, c(m - 1, 1), ldc, c(m, 1), ldc, cosl, sinl); - } else { - call_rot(ncc, c(m - 1, 1), ldc, c(m, 1), ldc, cosl, sinl); - } + if(nru > 0) + { + if(use_gpu) + { + call_rot_gpu(nru, u(1, m - 1), ione, u(1, m), ione, cosl, sinl); + } + else + { + call_rot(nru, u(1, m - 1), ione, u(1, m), ione, cosl, sinl); + } + } + if(ncc > 0) + { + if(use_gpu) + { + call_rot_gpu(ncc, c(m - 1, 1), ldc, c(m, 1), ldc, cosl, sinl); + } + else + { + call_rot(ncc, c(m - 1, 1), ldc, c(m, 1), ldc, cosl, sinl); + } + } + m = m - 2; + goto L60; } - m = m - 2; - goto L60; - } - /* + /* * if working on new submatrix, choose shift direction * (from larger end diagonal element towards smaller) */ - if (ll > oldm || m < oldll) { - if (abs(d(ll)) >= abs(d(m))) { - /* + if(ll > oldm || m < oldll) + { + if(abs(d(ll)) >= abs(d(m))) + { + /* * chase bulge from top (big end) to bottom (small end) */ - idir = 1; - } else { - /* + idir = 1; + } + else + { + /* * chase bulge from bottom (big end) to top (small end) */ - idir = 2; + idir = 2; + } } - } - /* + /* * apply convergence tests */ - if (idir == 1) { - /* + if(idir == 1) + { + /* * run convergence test in forward direction * first apply standard test to bottom of matrix */ - if (abs(e(m - 1)) <= abs(tol) * abs(d(m)) || - (tol < zero && abs(e(m - 1)) <= thresh)) { - e(m - 1) = zero; - goto L60; - } + if(abs(e(m - 1)) <= abs(tol) * abs(d(m)) || (tol < zero && abs(e(m - 1)) <= thresh)) + { + e(m - 1) = zero; + goto L60; + } - if (tol >= zero) { - /* + if(tol >= zero) + { + /* * if relative accuracy desired, * apply convergence criterion forward */ - mu = abs(d(ll)); - sminl = mu; - // do 100 lll = ll, m - 1 - for (lll = ll; lll <= (m - 1); lll++) { - if (abs(e(lll)) <= tol * mu) { - e(lll) = zero; - goto L60; + mu = abs(d(ll)); + sminl = mu; + // do 100 lll = ll, m - 1 + for(lll = ll; lll <= (m - 1); lll++) + { + if(abs(e(lll)) <= tol * mu) + { + e(lll) = zero; + goto L60; + } + mu = abs(d(lll + 1)) * (mu / (mu + abs(e(lll)))); + sminl = min(sminl, mu); + } + // L100: } - mu = abs(d(lll + 1)) * (mu / (mu + abs(e(lll)))); - sminl = min(sminl, mu); - } - // L100: } - - } else { - /* + else + { + /* * run convergence test in backward direction * first apply standard test to top of matrix */ - if (abs(e(ll)) <= abs(tol) * abs(d(ll)) || - (tol < zero && abs(e(ll)) <= thresh)) 
{ - e(ll) = zero; - goto L60; - } + if(abs(e(ll)) <= abs(tol) * abs(d(ll)) || (tol < zero && abs(e(ll)) <= thresh)) + { + e(ll) = zero; + goto L60; + } - if (tol >= zero) { - /* + if(tol >= zero) + { + /* * if relative accuracy desired, * apply convergence criterion backward */ - mu = abs(d(m)); - sminl = mu; - // do 110 lll = m - 1, ll, -1 - for (lll = (m - 1); lll >= ll; lll--) { - if (abs(e(lll)) <= tol * mu) { - e(lll) = zero; - goto L60; + mu = abs(d(m)); + sminl = mu; + // do 110 lll = m - 1, ll, -1 + for(lll = (m - 1); lll >= ll; lll--) + { + if(abs(e(lll)) <= tol * mu) + { + e(lll) = zero; + goto L60; + } + mu = abs(d(lll)) * (mu / (mu + abs(e(lll)))); + sminl = min(sminl, mu); + } + // L110: } - mu = abs(d(lll)) * (mu / (mu + abs(e(lll)))); - sminl = min(sminl, mu); - } - // L110: } - } - oldll = ll; - oldm = m; - /* + oldll = ll; + oldm = m; + /* * compute shift. first, test if shifting would ruin relative * accuracy, and if so set the shift to zero. */ - if (tol >= zero && n * tol * (sminl / smax) <= max(eps, hndrth * tol)) { - /* + if(tol >= zero && n * tol * (sminl / smax) <= max(eps, hndrth * tol)) + { + /* * use a zero shift to avoid loss of relative accuracy */ - shift = zero; - } else { - /* + shift = zero; + } + else + { + /* * compute the shift from 2-by-2 block at end of matrix */ - if (idir == 1) { - sll = abs(d(ll)); - call_las2(d(m - 1), e(m - 1), d(m), shift, r); - } else { - sll = abs(d(m)); - call_las2(d(ll), e(ll), d(ll + 1), shift, r); - } - /* + if(idir == 1) + { + sll = abs(d(ll)); + call_las2(d(m - 1), e(m - 1), d(m), shift, r); + } + else + { + sll = abs(d(m)); + call_las2(d(ll), e(ll), d(ll + 1), shift, r); + } + /* * test if shift negligible, and if so set to zero */ - if (sll > zero) { - if ((shift / sll) * (shift / sll) < eps) - shift = zero; + if(sll > zero) + { + if((shift / sll) * (shift / sll) < eps) + shift = zero; + } } - } - /* + /* * increment iteration count */ - iter = iter + m - ll; - /* + iter = iter + m - ll; + /* * if shift = 0, do simplified qr iteration */ - if (shift == zero) { - if (idir == 1) { - /* + if(shift == zero) + { + if(idir == 1) + { + /* * chase bulge from top to bottom * save cosines and sines for later singular vector updates */ - cs = one; - oldcs = one; - // do 120 i = ll, m - 1 - for (i = ll; i <= (m - 1); i++) { - auto di_cs = d(i) * cs; - call_lartg(di_cs, e(i), cs, sn, r); - if (i > ll) - e(i - 1) = oldsn * r; - auto oldcs_r = oldcs * r; - auto dip1_sn = d(i + 1) * sn; - call_lartg(oldcs_r, dip1_sn, oldcs, oldsn, d(i)); - work(i - ll + 1) = cs; - work(i - ll + 1 + nm1) = sn; - work(i - ll + 1 + nm12) = oldcs; - work(i - ll + 1 + nm13) = oldsn; - } - L120: - h = d(m) * cs; - d(m) = h * oldcs; - e(m - 1) = h * oldsn; - /* + cs = one; + oldcs = one; + // do 120 i = ll, m - 1 + for(i = ll; i <= (m - 1); i++) + { + auto di_cs = d(i) * cs; + call_lartg(di_cs, e(i), cs, sn, r); + if(i > ll) + e(i - 1) = oldsn * r; + auto oldcs_r = oldcs * r; + auto dip1_sn = d(i + 1) * sn; + call_lartg(oldcs_r, dip1_sn, oldcs, oldsn, d(i)); + work(i - ll + 1) = cs; + work(i - ll + 1 + nm1) = sn; + work(i - ll + 1 + nm12) = oldcs; + work(i - ll + 1 + nm13) = oldsn; + } + L120: + h = d(m) * cs; + d(m) = h * oldcs; + e(m - 1) = h * oldsn; + /* * update singular vectors */ #ifdef USE_VT - if (ncvt > 0) { - // call_lasr( 'l', 'v', 'f', m-ll+1, ncvt, work( 1 ), work( n ), vt( - // ll, 1 ), ldvt ) - char side = 'L'; - char pivot = 'V'; - char direct = 'F'; - auto mm = m - ll + 1; - if (use_gpu) { - call_lasr_gpu(side, pivot, direct, mm, ncvt, 
work(1), work(n), - vt(ll, 1), ldvt, dwork, stream); - } else { - call_lasr(side, pivot, direct, mm, ncvt, work(1), work(n), vt(ll, 1), - ldvt); - } - } + if(ncvt > 0) + { + // call_lasr( 'l', 'v', 'f', m-ll+1, ncvt, work( 1 ), work( n ), vt( + // ll, 1 ), ldvt ) + char side = 'L'; + char pivot = 'V'; + char direct = 'F'; + auto mm = m - ll + 1; + if(use_gpu) + { + call_lasr_gpu(side, pivot, direct, mm, ncvt, work(1), work(n), vt(ll, 1), ldvt, + dwork, stream); + } + else + { + call_lasr(side, pivot, direct, mm, ncvt, work(1), work(n), vt(ll, 1), ldvt); + } + } #else - if (nrv > 0) { - // call_lasr( 'r', 'v', 'f', nrv, m-ll+1, work( 1 ), work( n ), - // v(1,ll ), ldv) - char side = 'R'; - char pivot = 'V'; - char direct = 'F'; - auto mm = m - ll + 1; - if (use_gpu) { - call_lasr_gpu(side, pivot, direct, nrv, mm, work(1), work(n), - v(1,ll), ldv, dwork, stream); - } else { - call_lasr(side, pivot, direct, nrv, mm, work(1), work(n), v(1,ll), - ldv); - } - } + if(nrv > 0) + { + // call_lasr( 'r', 'v', 'f', nrv, m-ll+1, work( 1 ), work( n ), + // v(1,ll ), ldv) + char side = 'R'; + char pivot = 'V'; + char direct = 'F'; + auto mm = m - ll + 1; + if(use_gpu) + { + call_lasr_gpu(side, pivot, direct, nrv, mm, work(1), work(n), v(1, ll), ldv, + dwork, stream); + } + else + { + call_lasr(side, pivot, direct, nrv, mm, work(1), work(n), v(1, ll), ldv); + } + } #endif - if (nru > 0) { - // call_lasr( 'r', 'v', 'f', nru, m-ll+1, work( nm12+1 ), work( nm13+1 - // ), u( 1, ll ), ldu ) - char side = 'R'; - char pivot = 'V'; - char direct = 'F'; - auto mm = m - ll + 1; - if (use_gpu) { - call_lasr_gpu(side, pivot, direct, nru, mm, work(nm12 + 1), - work(nm13 + 1), u(1, ll), ldu, dwork, stream); - } else { - call_lasr(side, pivot, direct, nru, mm, work(nm12 + 1), - work(nm13 + 1), u(1, ll), ldu); - } - } - if (ncc > 0) { - // call_lasr( 'l', 'v', 'f', m-ll+1, ncc, work( nm12+1 ), work( nm13+1 - // ), c( ll, 1 ), ldc ) - char side = 'L'; - char pivot = 'V'; - char direct = 'F'; - auto mm = m - ll + 1; - if (use_gpu) { - call_lasr_gpu(side, pivot, direct, mm, ncc, work(nm12 + 1), - work(nm13 + 1), c(ll, 1), ldc, dwork, stream); - } else { - call_lasr(side, pivot, direct, mm, ncc, work(nm12 + 1), - work(nm13 + 1), c(ll, 1), ldc); - } - } - /* + if(nru > 0) + { + // call_lasr( 'r', 'v', 'f', nru, m-ll+1, work( nm12+1 ), work( nm13+1 + // ), u( 1, ll ), ldu ) + char side = 'R'; + char pivot = 'V'; + char direct = 'F'; + auto mm = m - ll + 1; + if(use_gpu) + { + call_lasr_gpu(side, pivot, direct, nru, mm, work(nm12 + 1), work(nm13 + 1), + u(1, ll), ldu, dwork, stream); + } + else + { + call_lasr(side, pivot, direct, nru, mm, work(nm12 + 1), work(nm13 + 1), + u(1, ll), ldu); + } + } + if(ncc > 0) + { + // call_lasr( 'l', 'v', 'f', m-ll+1, ncc, work( nm12+1 ), work( nm13+1 + // ), c( ll, 1 ), ldc ) + char side = 'L'; + char pivot = 'V'; + char direct = 'F'; + auto mm = m - ll + 1; + if(use_gpu) + { + call_lasr_gpu(side, pivot, direct, mm, ncc, work(nm12 + 1), work(nm13 + 1), + c(ll, 1), ldc, dwork, stream); + } + else + { + call_lasr(side, pivot, direct, mm, ncc, work(nm12 + 1), work(nm13 + 1), + c(ll, 1), ldc); + } + } + /* * test convergence */ - if (abs(e(m - 1)) <= thresh) - e(m - 1) = zero; - - } else { - /* + if(abs(e(m - 1)) <= thresh) + e(m - 1) = zero; + } + else + { + /* * chase bulge from bottom to top * save cosines and sines for later singular vector updates */ - cs = one; - oldcs = one; - // do 130 i = m, ll + 1, -1 - for (i = m; i >= (ll + 1); i--) { - auto di_cs = d(i) * cs; - call_lartg(di_cs, e(i 
- 1), cs, sn, r); - - if (i < m) - e(i) = oldsn * r; - - auto oldcs_r = oldcs * r; - auto dim1_sn = d(i - 1) * sn; - call_lartg(oldcs_r, dim1_sn, oldcs, oldsn, d(i)); - - work(i - ll) = cs; - work(i - ll + nm1) = -sn; - work(i - ll + nm12) = oldcs; - work(i - ll + nm13) = -oldsn; - } - L130: - h = d(ll) * cs; - d(ll) = h * oldcs; - e(ll) = h * oldsn; - /* + cs = one; + oldcs = one; + // do 130 i = m, ll + 1, -1 + for(i = m; i >= (ll + 1); i--) + { + auto di_cs = d(i) * cs; + call_lartg(di_cs, e(i - 1), cs, sn, r); + + if(i < m) + e(i) = oldsn * r; + + auto oldcs_r = oldcs * r; + auto dim1_sn = d(i - 1) * sn; + call_lartg(oldcs_r, dim1_sn, oldcs, oldsn, d(i)); + + work(i - ll) = cs; + work(i - ll + nm1) = -sn; + work(i - ll + nm12) = oldcs; + work(i - ll + nm13) = -oldsn; + } + L130: + h = d(ll) * cs; + d(ll) = h * oldcs; + e(ll) = h * oldsn; + /* * update singular vectors */ #ifdef USE_VT - if (ncvt > 0) { - // call_lasr( 'l', 'v', 'b', m-ll+1, ncvt, work( nm12+1 ), work( - // nm13+1 - // ), vt( ll, 1 ), ldvt ); - char side = 'L'; - char pivot = 'V'; - char direct = 'B'; - auto mm = m - ll + 1; - if (use_gpu) { - call_lasr_gpu(side, pivot, direct, mm, ncvt, work(nm12 + 1), - work(nm13 + 1), vt(ll, 1), ldvt, dwork, stream); - } else { - call_lasr(side, pivot, direct, mm, ncvt, work(nm12 + 1), - work(nm13 + 1), vt(ll, 1), ldvt); - } - } + if(ncvt > 0) + { + // call_lasr( 'l', 'v', 'b', m-ll+1, ncvt, work( nm12+1 ), work( + // nm13+1 + // ), vt( ll, 1 ), ldvt ); + char side = 'L'; + char pivot = 'V'; + char direct = 'B'; + auto mm = m - ll + 1; + if(use_gpu) + { + call_lasr_gpu(side, pivot, direct, mm, ncvt, work(nm12 + 1), work(nm13 + 1), + vt(ll, 1), ldvt, dwork, stream); + } + else + { + call_lasr(side, pivot, direct, mm, ncvt, work(nm12 + 1), work(nm13 + 1), + vt(ll, 1), ldvt); + } + } #else - if (nrv > 0) { - // call_lasr( 'r', 'v', 'b', nrv, m-ll+1, work( nm12+1 ), work( nm13+1 ), v( 1,ll ), ldv ); - char side = 'R'; - char pivot = 'V'; - char direct = 'B'; - auto mm = m - ll + 1; - if (use_gpu) { - call_lasr_gpu(side, pivot, direct, nrv, mm, work(nm12 + 1), - work(nm13 + 1), v(1,ll), ldv, dwork, stream); - } else { - call_lasr(side, pivot, direct, nrv, mm, work(nm12 + 1), - work(nm13 + 1), v(1,ll), ldv); - } - } - + if(nrv > 0) + { + // call_lasr( 'r', 'v', 'b', nrv, m-ll+1, work( nm12+1 ), work( nm13+1 ), v( 1,ll ), ldv ); + char side = 'R'; + char pivot = 'V'; + char direct = 'B'; + auto mm = m - ll + 1; + if(use_gpu) + { + call_lasr_gpu(side, pivot, direct, nrv, mm, work(nm12 + 1), work(nm13 + 1), + v(1, ll), ldv, dwork, stream); + } + else + { + call_lasr(side, pivot, direct, nrv, mm, work(nm12 + 1), work(nm13 + 1), + v(1, ll), ldv); + } + } #endif - if (nru > 0) { - // call_lasr( 'r', 'v', 'b', nru, m-ll+1, work( 1 ), work( n ), u( 1, - // ll - // ), ldu ) - char side = 'R'; - char pivot = 'V'; - char direct = 'B'; - auto mm = m - ll + 1; - if (use_gpu) { - call_lasr_gpu(side, pivot, direct, nru, mm, work(1), work(n), - u(1, ll), ldu, dwork, stream); - } else { - call_lasr(side, pivot, direct, nru, mm, work(1), work(n), u(1, ll), - ldu); - } - } - if (ncc > 0) { - // call_lasr( 'l', 'v', 'b', m-ll+1, ncc, work( 1 ), work( n ), c( ll, - // 1 - // ), ldc ) - char side = 'L'; - char pivot = 'V'; - char direct = 'B'; - auto mm = m - ll + 1; - if (use_gpu) { - call_lasr_gpu(side, pivot, direct, mm, ncc, work(1), work(n), - c(ll, 1), ldc, dwork, stream); - } else { - call_lasr(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), - ldc); - } - } - /* + if(nru > 0) + { + // 
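// For reference: call_lartg(f, g, cs, sn, r) generates the plane rotation used throughout the
// bulge chase above, i.e. values satisfying  cs*f + sn*g = r  and  -sn*f + cs*g = 0.
// A minimal sketch (assuming <cmath>; without the scaling and sign conventions of the real
// LAPACK xLARTG, and the helper name is illustrative):
template <typename S>
static void lartg_sketch(S f, S g, S& cs, S& sn, S& r)
{
    if(g == S(0))
    {
        cs = S(1);
        sn = S(0);
        r = f;
    }
    else
    {
        r = std::hypot(f, g);
        cs = f / r;
        sn = g / r;
    }
}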
call_lasr( 'r', 'v', 'b', nru, m-ll+1, work( 1 ), work( n ), u( 1, + // ll + // ), ldu ) + char side = 'R'; + char pivot = 'V'; + char direct = 'B'; + auto mm = m - ll + 1; + if(use_gpu) + { + call_lasr_gpu(side, pivot, direct, nru, mm, work(1), work(n), u(1, ll), ldu, + dwork, stream); + } + else + { + call_lasr(side, pivot, direct, nru, mm, work(1), work(n), u(1, ll), ldu); + } + } + if(ncc > 0) + { + // call_lasr( 'l', 'v', 'b', m-ll+1, ncc, work( 1 ), work( n ), c( ll, + // 1 + // ), ldc ) + char side = 'L'; + char pivot = 'V'; + char direct = 'B'; + auto mm = m - ll + 1; + if(use_gpu) + { + call_lasr_gpu(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), ldc, + dwork, stream); + } + else + { + call_lasr(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), ldc); + } + } + /* * test convergence */ - if (abs(e(ll)) <= thresh) - e(ll) = zero; + if(abs(e(ll)) <= thresh) + e(ll) = zero; + } } - } else { - /* + else + { + /* * use nonzero shift */ - if (idir == 1) { - /* + if(idir == 1) + { + /* * chase bulge from top to bottom * save cosines and sines for later singular vector updates */ - f = (abs(d(ll)) - shift) * (sign(one, d(ll)) + shift / d(ll)); - g = e(ll); - // do 140 i = ll, m - 1 - for (i = ll; i <= (m - 1); i++) { - call_lartg(f, g, cosr, sinr, r); - if (i > ll) - e(i - 1) = r; - f = cosr * d(i) + sinr * e(i); - e(i) = cosr * e(i) - sinr * d(i); - g = sinr * d(i + 1); - d(i + 1) = cosr * d(i + 1); - call_lartg(f, g, cosl, sinl, r); - d(i) = r; - f = cosl * e(i) + sinl * d(i + 1); - d(i + 1) = cosl * d(i + 1) - sinl * e(i); - if (i < m - 1) { - g = sinl * e(i + 1); - e(i + 1) = cosl * e(i + 1); - } - work(i - ll + 1) = cosr; - work(i - ll + 1 + nm1) = sinr; - work(i - ll + 1 + nm12) = cosl; - work(i - ll + 1 + nm13) = sinl; - } - L140: - e(m - 1) = f; - /* + f = (abs(d(ll)) - shift) * (sign(one, d(ll)) + shift / d(ll)); + g = e(ll); + // do 140 i = ll, m - 1 + for(i = ll; i <= (m - 1); i++) + { + call_lartg(f, g, cosr, sinr, r); + if(i > ll) + e(i - 1) = r; + f = cosr * d(i) + sinr * e(i); + e(i) = cosr * e(i) - sinr * d(i); + g = sinr * d(i + 1); + d(i + 1) = cosr * d(i + 1); + call_lartg(f, g, cosl, sinl, r); + d(i) = r; + f = cosl * e(i) + sinl * d(i + 1); + d(i + 1) = cosl * d(i + 1) - sinl * e(i); + if(i < m - 1) + { + g = sinl * e(i + 1); + e(i + 1) = cosl * e(i + 1); + } + work(i - ll + 1) = cosr; + work(i - ll + 1 + nm1) = sinr; + work(i - ll + 1 + nm12) = cosl; + work(i - ll + 1 + nm13) = sinl; + } + L140: + e(m - 1) = f; + /* * update singular vectors */ #ifdef USE_VT - if (ncvt > 0) { - // call_lasr( 'l', 'v', 'f', m-ll+1, ncvt, work( 1 ), work( n ), vt( - // ll, 1 ), ldvt ) - char side = 'L'; - char pivot = 'V'; - char direct = 'F'; - auto mm = m - ll + 1; - if (use_gpu) { - call_lasr_gpu(side, pivot, direct, mm, ncvt, work(1), work(n), - vt(ll, 1), ldvt, dwork, stream); - } else { - call_lasr(side, pivot, direct, mm, ncvt, work(1), work(n), vt(ll, 1), - ldvt); - } - } + if(ncvt > 0) + { + // call_lasr( 'l', 'v', 'f', m-ll+1, ncvt, work( 1 ), work( n ), vt( + // ll, 1 ), ldvt ) + char side = 'L'; + char pivot = 'V'; + char direct = 'F'; + auto mm = m - ll + 1; + if(use_gpu) + { + call_lasr_gpu(side, pivot, direct, mm, ncvt, work(1), work(n), vt(ll, 1), ldvt, + dwork, stream); + } + else + { + call_lasr(side, pivot, direct, mm, ncvt, work(1), work(n), vt(ll, 1), ldvt); + } + } #else - if (nrv > 0) { - // call_lasr( 'r', 'v', 'f', nrv, m-ll+1, work( 1 ), work( n ), v( 1, ll ), ldv ) - char side = 'R'; - char pivot = 'V'; - char direct = 'F'; - auto mm = m - 
ll + 1; - if (use_gpu) { - call_lasr_gpu(side, pivot, direct, nrv, mm, work(1), work(n), - v(1,ll), ldv, dwork, stream); - } else { - call_lasr(side, pivot, direct, nrv, mm, work(1), work(n), v(1,ll), - ldv); - } - } + if(nrv > 0) + { + // call_lasr( 'r', 'v', 'f', nrv, m-ll+1, work( 1 ), work( n ), v( 1, ll ), ldv ) + char side = 'R'; + char pivot = 'V'; + char direct = 'F'; + auto mm = m - ll + 1; + if(use_gpu) + { + call_lasr_gpu(side, pivot, direct, nrv, mm, work(1), work(n), v(1, ll), ldv, + dwork, stream); + } + else + { + call_lasr(side, pivot, direct, nrv, mm, work(1), work(n), v(1, ll), ldv); + } + } #endif - if (nru > 0) { - // call_lasr( 'r', 'v', 'f', nru, m-ll+1, work( nm12+1 ), work( nm13+1 - // ), u( 1, ll ), ldu ) - char side = 'R'; - char pivot = 'V'; - char direct = 'F'; - auto mm = m - ll + 1; - if (use_gpu) { - call_lasr_gpu(side, pivot, direct, nru, mm, work(nm12 + 1), - work(nm13 + 1), u(1, ll), ldu, dwork, stream); - } else { - call_lasr(side, pivot, direct, nru, mm, work(nm12 + 1), - work(nm13 + 1), u(1, ll), ldu); - } - } - if (ncc > 0) { - // call_lasr( 'l', 'v', 'f', m-ll+1, ncc, work( nm12+1 ), work( nm13+1 - // ), c( ll, 1 ), ldc ) - char side = 'L'; - char pivot = 'V'; - char direct = 'F'; - auto mm = m - ll + 1; - if (use_gpu) { - call_lasr_gpu(side, pivot, direct, mm, ncc, work(nm12 + 1), - work(nm13 + 1), c(ll, 1), ldc, dwork, stream); - } else { - call_lasr(side, pivot, direct, mm, ncc, work(nm12 + 1), - work(nm13 + 1), c(ll, 1), ldc); - } - } - /* + if(nru > 0) + { + // call_lasr( 'r', 'v', 'f', nru, m-ll+1, work( nm12+1 ), work( nm13+1 + // ), u( 1, ll ), ldu ) + char side = 'R'; + char pivot = 'V'; + char direct = 'F'; + auto mm = m - ll + 1; + if(use_gpu) + { + call_lasr_gpu(side, pivot, direct, nru, mm, work(nm12 + 1), work(nm13 + 1), + u(1, ll), ldu, dwork, stream); + } + else + { + call_lasr(side, pivot, direct, nru, mm, work(nm12 + 1), work(nm13 + 1), + u(1, ll), ldu); + } + } + if(ncc > 0) + { + // call_lasr( 'l', 'v', 'f', m-ll+1, ncc, work( nm12+1 ), work( nm13+1 + // ), c( ll, 1 ), ldc ) + char side = 'L'; + char pivot = 'V'; + char direct = 'F'; + auto mm = m - ll + 1; + if(use_gpu) + { + call_lasr_gpu(side, pivot, direct, mm, ncc, work(nm12 + 1), work(nm13 + 1), + c(ll, 1), ldc, dwork, stream); + } + else + { + call_lasr(side, pivot, direct, mm, ncc, work(nm12 + 1), work(nm13 + 1), + c(ll, 1), ldc); + } + } + /* * test convergence */ - if (abs(e(m - 1)) <= thresh) - e(m - 1) = zero; - - } else { - /* + if(abs(e(m - 1)) <= thresh) + e(m - 1) = zero; + } + else + { + /* * chase bulge from bottom to top * save cosines and sines for later singular vector updates */ - f = (abs(d(m)) - shift) * (sign(one, d(m)) + shift / d(m)); - g = e(m - 1); - // do 150 i = m, ll + 1, -1 - for (i = m; i >= (ll + 1); i--) { - call_lartg(f, g, cosr, sinr, r); - if (i < m) - e(i) = r; - f = cosr * d(i) + sinr * e(i - 1); - e(i - 1) = cosr * e(i - 1) - sinr * d(i); - g = sinr * d(i - 1); - d(i - 1) = cosr * d(i - 1); - call_lartg(f, g, cosl, sinl, r); - d(i) = r; - f = cosl * e(i - 1) + sinl * d(i - 1); - d(i - 1) = cosl * d(i - 1) - sinl * e(i - 1); - if (i > ll + 1) { - g = sinl * e(i - 2); - e(i - 2) = cosl * e(i - 2); - } - work(i - ll) = cosr; - work(i - ll + nm1) = -sinr; - work(i - ll + nm12) = cosl; - work(i - ll + nm13) = -sinl; - } - L150: - e(ll) = f; - /* + f = (abs(d(m)) - shift) * (sign(one, d(m)) + shift / d(m)); + g = e(m - 1); + // do 150 i = m, ll + 1, -1 + for(i = m; i >= (ll + 1); i--) + { + call_lartg(f, g, cosr, sinr, r); + if(i < m) + e(i) = r; 
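// For reference: the lead-off value used just above,
//   f = (|d| - shift) * (sign(1, d) + shift / d),
// is algebraically identical to (d*d - shift*shift) / d; the factored form is used so that d
// is never squared.  A direct, overflow-prone equivalent (illustrative helper, assumes d != 0):
template <typename S>
static S shifted_leadoff_naive(S d, S shift)
{
    return (d * d - shift * shift) / d;
}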
+ f = cosr * d(i) + sinr * e(i - 1); + e(i - 1) = cosr * e(i - 1) - sinr * d(i); + g = sinr * d(i - 1); + d(i - 1) = cosr * d(i - 1); + call_lartg(f, g, cosl, sinl, r); + d(i) = r; + f = cosl * e(i - 1) + sinl * d(i - 1); + d(i - 1) = cosl * d(i - 1) - sinl * e(i - 1); + if(i > ll + 1) + { + g = sinl * e(i - 2); + e(i - 2) = cosl * e(i - 2); + } + work(i - ll) = cosr; + work(i - ll + nm1) = -sinr; + work(i - ll + nm12) = cosl; + work(i - ll + nm13) = -sinl; + } + L150: + e(ll) = f; + /* * test convergence */ - if (abs(e(ll)) <= thresh) - e(ll) = zero; - /* + if(abs(e(ll)) <= thresh) + e(ll) = zero; + /* * update singular vectors if desired */ #ifdef USE_VT - if (ncvt > 0) { - // call_lasr( 'l', 'v', 'b', m-ll+1, ncvt, work( nm12+1 ), work( - // nm13+1 - // ), vt( ll, 1 ), ldvt ) - char side = 'L'; - char pivot = 'V'; - char direct = 'B'; - auto mm = m - ll + 1; - if (use_gpu) { - call_lasr_gpu(side, pivot, direct, mm, ncvt, work(nm12 + 1), - work(nm13 + 1), vt(ll, 1), ldvt, dwork, stream); - } else { - call_lasr(side, pivot, direct, mm, ncvt, work(nm12 + 1), - work(nm13 + 1), vt(ll, 1), ldvt); - } - } + if(ncvt > 0) + { + // call_lasr( 'l', 'v', 'b', m-ll+1, ncvt, work( nm12+1 ), work( + // nm13+1 + // ), vt( ll, 1 ), ldvt ) + char side = 'L'; + char pivot = 'V'; + char direct = 'B'; + auto mm = m - ll + 1; + if(use_gpu) + { + call_lasr_gpu(side, pivot, direct, mm, ncvt, work(nm12 + 1), work(nm13 + 1), + vt(ll, 1), ldvt, dwork, stream); + } + else + { + call_lasr(side, pivot, direct, mm, ncvt, work(nm12 + 1), work(nm13 + 1), + vt(ll, 1), ldvt); + } + } #else - if (nrv > 0) { - // call_lasr( 'r', 'v', 'b', nrv, m-ll+1, work( nm12+1 ), work( nm13+1 ), v( 1,ll ), ldv ) - char side = 'L'; - char pivot = 'V'; - char direct = 'B'; - auto mm = m - ll + 1; - if (use_gpu) { - call_lasr_gpu(side, pivot, direct, nrv, mm, work(nm12 + 1), - work(nm13 + 1), v(1,ll), ldv, dwork, stream); - } else { - call_lasr(side, pivot, direct, nrv, mm, work(nm12 + 1), - work(nm13 + 1), v(1,ll), ldv); - } - } + if(nrv > 0) + { + // call_lasr( 'r', 'v', 'b', nrv, m-ll+1, work( nm12+1 ), work( nm13+1 ), v( 1,ll ), ldv ) + char side = 'L'; + char pivot = 'V'; + char direct = 'B'; + auto mm = m - ll + 1; + if(use_gpu) + { + call_lasr_gpu(side, pivot, direct, nrv, mm, work(nm12 + 1), work(nm13 + 1), + v(1, ll), ldv, dwork, stream); + } + else + { + call_lasr(side, pivot, direct, nrv, mm, work(nm12 + 1), work(nm13 + 1), + v(1, ll), ldv); + } + } #endif - if (nru > 0) { - // call_lasr( 'r', 'v', 'b', nru, m-ll+1, work( 1 ), work( n ), u( 1, - // ll - // ), ldu ) - char side = 'R'; - char pivot = 'V'; - char direct = 'B'; - auto mm = m - ll + 1; - if (use_gpu) { - call_lasr_gpu(side, pivot, direct, nru, mm, work(1), work(n), - u(1, ll), ldu, dwork, stream); - } else { - call_lasr(side, pivot, direct, nru, mm, work(1), work(n), u(1, ll), - ldu); - } - } - if (ncc > 0) { - // call_lasr( 'l', 'v', 'b', m-ll+1, ncc, work( 1 ), work( n ), c( ll, - // 1 - // ), ldc ) - char side = 'L'; - char pivot = 'V'; - char direct = 'B'; - auto mm = m - ll + 1; - if (use_gpu) { - call_lasr_gpu(side, pivot, direct, mm, ncc, work(1), work(n), - c(ll, 1), ldc, dwork, stream); - } else { - call_lasr(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), - ldc); + if(nru > 0) + { + // call_lasr( 'r', 'v', 'b', nru, m-ll+1, work( 1 ), work( n ), u( 1, + // ll + // ), ldu ) + char side = 'R'; + char pivot = 'V'; + char direct = 'B'; + auto mm = m - ll + 1; + if(use_gpu) + { + call_lasr_gpu(side, pivot, direct, nru, mm, work(1), work(n), u(1, ll), 
ldu, + dwork, stream); + } + else + { + call_lasr(side, pivot, direct, nru, mm, work(1), work(n), u(1, ll), ldu); + } + } + if(ncc > 0) + { + // call_lasr( 'l', 'v', 'b', m-ll+1, ncc, work( 1 ), work( n ), c( ll, + // 1 + // ), ldc ) + char side = 'L'; + char pivot = 'V'; + char direct = 'B'; + auto mm = m - ll + 1; + if(use_gpu) + { + call_lasr_gpu(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), ldc, + dwork, stream); + } + else + { + call_lasr(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), ldc); + } + } } - } } - } - /* + /* * qr iteration finished, go back and check convergence */ - goto L60; + goto L60; /* * all singular values converged, so make them positive */ L160: - // do 170 i = 1, n - for (i = 1; i <= n; i++) { - if (d(i) < zero) { - d(i) = -d(i); - /* + // do 170 i = 1, n + for(i = 1; i <= n; i++) + { + if(d(i) < zero) + { + d(i) = -d(i); + /* * change sign of singular vectors, if desired */ #ifdef USE_VT - if (ncvt > 0) { - if (use_gpu) { - call_scal_gpu(ncvt, negone, vt(i, 1), ldvt); - } else { - call_scal(ncvt, negone, vt(i, 1), ldvt); - } - } + if(ncvt > 0) + { + if(use_gpu) + { + call_scal_gpu(ncvt, negone, vt(i, 1), ldvt); + } + else + { + call_scal(ncvt, negone, vt(i, 1), ldvt); + } + } #else - if (nrv > 0) { - if (use_gpu) { - call_scal_gpu(nrv, negone, v(1,i), 1); - } else { - call_scal(nrv, negone, v(1,i), 1); - } - } + if(nrv > 0) + { + if(use_gpu) + { + call_scal_gpu(nrv, negone, v(1, i), 1); + } + else + { + call_scal(nrv, negone, v(1, i), 1); + } + } #endif + } } - } L170: - bool const need_sort = false; - if (need_sort) { - /* + if(need_sort) + { + /* * sort the singular values into decreasing order (insertion sort on * singular values, but only one transposition per singular vector) */ - // do 190 i = 1, n - 1 - for (i = 1; i <= (n - 1); i++) { - /* + // do 190 i = 1, n - 1 + for(i = 1; i <= (n - 1); i++) + { + /* * scan for smallest d(i) */ - isub = 1; - smin = d(1); - // do 180 j = 2, n + 1 - i - for (j = 2; j <= (n + 1 - i); j++) { - if (d(j) <= smin) { - isub = j; - smin = d(j); - } - } - L180: - if (isub != n + 1 - i) { - /* + isub = 1; + smin = d(1); + // do 180 j = 2, n + 1 - i + for(j = 2; j <= (n + 1 - i); j++) + { + if(d(j) <= smin) + { + isub = j; + smin = d(j); + } + } + L180: + if(isub != n + 1 - i) + { + /* * swap singular values and vectors */ - d(isub) = d(n + 1 - i); - d(n + 1 - i) = smin; + d(isub) = d(n + 1 - i); + d(n + 1 - i) = smin; #ifdef USE_VT - if (ncvt > 0) { - if (use_gpu) { - call_swap_gpu(ncvt, vt(isub, 1), ldvt, vt(n + 1 - i, 1), ldvt); - } else { - call_swap(ncvt, vt(isub, 1), ldvt, vt(n + 1 - i, 1), ldvt); - } - } + if(ncvt > 0) + { + if(use_gpu) + { + call_swap_gpu(ncvt, vt(isub, 1), ldvt, vt(n + 1 - i, 1), ldvt); + } + else + { + call_swap(ncvt, vt(isub, 1), ldvt, vt(n + 1 - i, 1), ldvt); + } + } #else - if (nrv > 0) { - if (use_gpu) { - call_swap_gpu(nrv, v(1,isub), 1, v(1,n + 1 - i), 1); - } else { - call_swap(nrv, v(1,isub), 1, v(1,n + 1 - i), 1); - } - } + if(nrv > 0) + { + if(use_gpu) + { + call_swap_gpu(nrv, v(1, isub), ione, v(1, n + 1 - i), ione); + } + else + { + call_swap(nrv, v(1, isub), ione, v(1, n + 1 - i), ione); + } + } #endif - if (nru > 0) { - if (use_gpu) { - call_swap_gpu(nru, u(1, isub), ione, u(1, n + 1 - i), ione); - } else { - call_swap(nru, u(1, isub), ione, u(1, n + 1 - i), ione); - } - } - if (ncc > 0) { - if (use_gpu) { - call_swap_gpu(ncc, c(isub, 1), ldc, c(n + 1 - i, 1), ldc); - } else { - call_swap(ncc, c(isub, 1), ldc, c(n + 1 - i, 1), ldc); + if(nru > 0) + { + if(use_gpu) + 
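// For reference: in the clean-up pass above a negative d(i) is repaired by negating d(i) and
// only the i-th row of V**T, because  u_i * sigma_i * v_i^T == u_i * (-sigma_i) * (-v_i)^T,
// so U and C need no change.  A plain-loop sketch of that row negation for a column-major
// matrix with leading dimension ldvt (illustrative helper, not the scal call used here):
template <typename T, typename I>
static void negate_vt_row(I ncvt, T* vt_row_i, I ldvt)
{
    // element j of the row sits at offset j*ldvt from the first entry of the row
    for(I j = 0; j < ncvt; j++)
        vt_row_i[size_t(j) * ldvt] = -vt_row_i[size_t(j) * ldvt];
}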
{ + call_swap_gpu(nru, u(1, isub), ione, u(1, n + 1 - i), ione); + } + else + { + call_swap(nru, u(1, isub), ione, u(1, n + 1 - i), ione); + } + } + if(ncc > 0) + { + if(use_gpu) + { + call_swap_gpu(ncc, c(isub, 1), ldc, c(n + 1 - i, 1), ldc); + } + else + { + call_swap(ncc, c(isub, 1), ldc, c(n + 1 - i, 1), ldc); + } + } + } } - } - } - } - } // end if (need_sort) + } // end if (need_sort) L190: - goto L220; + goto L220; /* * maximum number of iterations exceeded, failure to converge */ L200: - info = 0; - // do 210 i = 1, n - 1 - for (i = 1; i <= (n - 1); i++) { - if (e(i) != zero) - info = info + 1; - } + info = 0; + // do 210 i = 1, n - 1 + for(i = 1; i <= (n - 1); i++) + { + if(e(i) != zero) + info = info + 1; + } L210: L220: - return; - /* + return; + /* * end of dbdsqr */ } - - - -template +template rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, - const rocblas_fill uplo_in, - const rocblas_int n, - const rocblas_int nv, - const rocblas_int nu, - const rocblas_int nc, - S* D, - const rocblas_stride strideD, - S* E, - const rocblas_stride strideE, - W1 V, - const rocblas_int shiftV, - const rocblas_int ldv, - const rocblas_stride strideV, - W2 U, - const rocblas_int shiftU, - const rocblas_int ldu, - const rocblas_stride strideU, - W3 C, - const rocblas_int shiftC, - const rocblas_int ldc, - const rocblas_stride strideC, - rocblas_int* info_array, - const rocblas_int batch_count, - rocblas_int* splits_map, - S* work) + const rocblas_fill uplo_in, + const I n, + const I nv, + const I nu, + const I nc, + S* D, + const rocblas_stride strideD, + S* E, + const rocblas_stride strideE, + W1 V, + const I shiftV, + const I ldv, + const rocblas_stride strideV, + W2 U, + const I shiftU, + const I ldu, + const rocblas_stride strideU, + W3 C, + const I shiftC, + const I ldc, + const rocblas_stride strideC, + I* info_array, + const I batch_count, + I* splits_map, + S* work) { + // ------------------------- + // copy D into hD, E into hE + // ------------------------- - // ------------------------- - // copy D into hD, E into hE - // ------------------------- - -hipStream_t stream; -rocblas_get_stream(handle, &stream ); - - + hipStream_t stream; + rocblas_get_stream(handle, &stream); -S * hD = nullptr; -S * hE = nullptr; -HIP_CHECK( hipHostMalloc( &hD, sizeof(S) * n * batch_count )); -HIP_CHECK( hipHostMalloc( &hE, sizeof(S) * (n-1) * batch_count )); + S* hD = nullptr; + S* hE = nullptr; + HIP_CHECK(hipHostMalloc(&hD, sizeof(S) * n * batch_count)); -bool const use_single_copy_for_D = (strideD == n); -if (use_single_copy_for_D) { - void * const dst = (void *) &(hD[0]); - void * const src = (void *) D; - size_t const sizeBytes = sizeof(S) * n * batch_count; - hipMemcpyKind const kind = hipMemcpyDeviceToHost; + // ------------------------------------------------------------------ + // Need to double checking whether array E is length n or length (n-1) + // ------------------------------------------------------------------ + HIP_CHECK(hipHostMalloc(&hE, sizeof(S) * (n - 1) * batch_count)); - HIP_CHECK( hipMemcpyAsync( dst, src, sizeBytes, kind, stream ) ); -} -else { - for(rocblas_int bid = 0; bid < batch_count; bid++) { - void * const dst = (void *) &(hD[ bid * n ]); - void * const src = (void *) D+ bid * strideD; - size_t const sizeBytes = sizeof(S) * n; - hipMemcpyKind const kind = hipMemcpyDeviceToHost; - HIP_CHECK( hipMemcpyAsync( dst, src, sizeBytes, kind, stream ) ); - } -} - -bool const use_single_copy_for_E = (strideE == (n-1)); -if (use_single_copy_for_E) { - void * 
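// For reference: the staging of the D and E arrays in this routine follows a common batched-copy
// pattern -- when the batch stride equals the per-matrix length the whole batch is one contiguous
// block and a single transfer suffices, otherwise each batch member is copied separately.
// A generic sketch of that decision (the helper and its names are illustrative):
template <typename S, typename I>
static hipError_t copy_batch_to_host(S* hdst, const S* dsrc, I len,
                                     rocblas_stride stride, I batch_count,
                                     hipStream_t stream)
{
    if(stride == len)
        return hipMemcpyAsync(hdst, dsrc, sizeof(S) * size_t(len) * batch_count,
                              hipMemcpyDeviceToHost, stream);

    for(I bid = 0; bid < batch_count; bid++)
    {
        hipError_t const istat
            = hipMemcpyAsync(hdst + size_t(bid) * len, dsrc + bid * stride,
                             sizeof(S) * len, hipMemcpyDeviceToHost, stream);
        if(istat != hipSuccess)
            return istat;
    }
    return hipSuccess;
}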
const dst = (void *) &(hE[0]); - void * const src = (void *) E; - size_t const sizeBytes = sizeof(S) * (n-1) * batch_count; - hipMemcpyKind const kind = hipMemcpyDeviceToHost; + // ------------------------------------------------- + // transfer arrays D(:) and E(:) from Device to Host + // ------------------------------------------------- - HIP_CHECK( hipMemcpyAsync( dst, src, sizeBytes, kind, stream ) ); -} -else { - for(rocblas_int bid = 0; bid < batch_count; bid++) { - void * const dst = (void *) &(hE[ bid * (n-1) ]); - void * const src = (void *) E+ bid * strideE; - size_t const sizeBytes = sizeof(S) * (n-1); - hipMemcpyKind const kind = hipMemcpyDeviceToHost; - HIP_CHECK( hipMemcpyAsync( dst, src, sizeBytes, kind, stream ) ); - } - -} + bool const use_single_copy_for_D = (strideD == n); + if(use_single_copy_for_D) + { + void* const dst = (void*)&(hD[0]); + void* const src = (void*)&(D[0]); + size_t const sizeBytes = sizeof(S) * n * batch_count; + hipMemcpyKind const kind = hipMemcpyDeviceToHost; + HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); + } + else + { + for(I bid = 0; bid < batch_count; bid++) + { + void* const dst = (void*)&(hD[bid * n]); + void* const src = (void*)&(D[bid * strideD]); + size_t const sizeBytes = sizeof(S) * n; + hipMemcpyKind const kind = hipMemcpyDeviceToHost; + HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); + } + } + bool const use_single_copy_for_E = (strideE == (n - 1)); + if(use_single_copy_for_E) + { + void* const dst = (void*)&(hE[0]); + void* const src = (void*)&(E[0]); + size_t const sizeBytes = sizeof(S) * (n - 1) * batch_count; + hipMemcpyKind const kind = hipMemcpyDeviceToHost; + HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); + } + else + { + for(I bid = 0; bid < batch_count; bid++) + { + void* const dst = (void*)&(hE[bid * (n - 1)]); + void* const src = (void*)&(E[bid * strideE]); + size_t const sizeBytes = sizeof(S) * (n - 1); + hipMemcpyKind const kind = hipMemcpyDeviceToHost; + HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); + } + } - for(rocblas_int bid = 0; bid < batch_count; bid++) { - std::vector hwork( 4 * n ); + for(I bid = 0; bid < batch_count; bid++) + { + std::vector hwork(4 * n); - char uplo = (uplo_in == rocblas_fill_lower) ? 'L' : 'U'; - T *d_ = &(hD[ bid * n ] ); - T *e_ = &(hE[ bid * (n-1) ]); + char uplo = (uplo_in == rocblas_fill_lower) ? 'L' : 'U'; + S* d_ = &(hD[bid * n]); + S* e_ = &(hE[bid * (n - 1)]); - T *v_ = (nv > 0) ? load_ptr_batch(V, bid, shiftV, strideV) : nullptr; - T *u_ = (nu > 0) ? load_ptr_batch(U, bid, shiftU, strideU) : nullptr; - T *c_ = (nc > 0) ? load_ptr_batch(C, bid, shiftC, strideC) : nullptr; - S * work_ = &(hwork[0]); - S * dwork = work; - I info = 0; + T* v_ = (nv > 0) ? load_ptr_batch(V, bid, shiftV, strideV) : nullptr; + T* u_ = (nu > 0) ? load_ptr_batch(U, bid, shiftU, strideU) : nullptr; + T* c_ = (nc > 0) ? 
load_ptr_batch(C, bid, shiftC, strideC) : nullptr; + S* work_ = &(hwork[0]); + S* dwork = work; + I info = 0; - I ncv = nv; - I nru = nu; - I ncc = nc; + I nru = nu; + I ncc = nc; #ifdef USE_VT - T *vt_ = v_; - I ldvt = ldv; - - I nrv = n; - I ncvt = nrv; - bdsqr_single_template( uplo, n, ncvt, nru, ncc, d_, e_, - vt_,ldvt, - u_,ldu, c_,ldc, work_, info, dwork, stream ); + // ------------------------------------------------------- + // NOTE: lapack dbdsqr() accepts "VT" and "ldvt" for transpose of V + // as input variable + // However, rocsolver bdsqrt accepts variable called "V" and "ldv" + // but is actually holding "VT" + // ------------------------------------------------------- + T* vt_ = v_; + I ldvt = ldv; + + I nrv = n; + I ncvt = nv; + bdsqr_single_template(uplo, n, ncvt, nru, ncc, d_, e_, vt_, ldvt, u_, ldu, c_, ldc, + work_, info, dwork, stream); #else + I ncv = nv; - bdsqr_single_template( uplo, n, ncv, nru, ncc, d_, e_, - v_,ldv, - u_,ldu, c_,ldc, work_, info, dwork, stream ); + bdsqr_single_template(uplo, n, ncv, nru, ncc, d_, e_, v_, ldv, u_, ldu, c_, ldc, + work_, info, dwork, stream); #endif - - - info_array[bid] = info; - } // end for bid + info_array[bid] = info; + } // end for bid + // ------------------------------------------------- + // transfer arrays D(:) and E(:) from host to device + // ------------------------------------------------- -if (use_single_copy_for_D) { - void * const src = (void *) &(hD[0]); - void * const dst = (void *) D; - size_t const sizeBytes = sizeof(S) * n * batch_count; - hipMemcpyKind const kind = hipMemcpyHostToDevice; - - HIP_CHECK( hipMemcpyAsync( dst, src, sizeBytes, kind, stream ) ); -} -else { - for(rocblas_int bid = 0; bid < batch_count; bid++) { - void * const src = (void *) &(hD[ bid * n ]); - void * const dst = (void *) D+ bid * strideD; - size_t const sizeBytes = sizeof(S) * n; - hipMemcpyKind const kind = hipMemcpyHostToDevice; - HIP_CHECK( hipMemcpyAsync( dst, src, sizeBytes, kind, stream ) ); - } -} - - -if (use_single_copy_for_E) { - void * const src = (void *) &(hE[0]); - void * const dst = (void *) E; - size_t const sizeBytes = sizeof(S) * (n-1) * batch_count; - hipMemcpyKind const kind = hipMemcpyHostToDevice; + if(use_single_copy_for_D) + { + void* const src = (void*)&(hD[0]); + void* const dst = (void*)D; + size_t const sizeBytes = sizeof(S) * n * batch_count; + hipMemcpyKind const kind = hipMemcpyHostToDevice; - HIP_CHECK( hipMemcpyAsync( dst, src, sizeBytes, kind, stream ) ); -} -else { - for(rocblas_int bid = 0; bid < batch_count; bid++) { - void * const src = (void *) &(hE[ bid * (n-1) ]); - void * const dst = (void *) E+ bid * strideE; - size_t const sizeBytes = sizeof(S) * (n-1); - hipMemcpyKind const kind = hipMemcpyHostToDevice; - HIP_CHECK( hipMemcpyAsync( dst, src, sizeBytes, kind, stream ) ); - } + HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); + } + else + { + for(I bid = 0; bid < batch_count; bid++) + { + void* const src = (void*)&(hD[bid * n]); + void* const dst = (void*)(D + bid * strideD); + size_t const sizeBytes = sizeof(S) * n; + hipMemcpyKind const kind = hipMemcpyHostToDevice; + HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); + } + } -} + if(use_single_copy_for_E) + { + void* const src = (void*)&(hE[0]); + void* const dst = (void*)E; + size_t const sizeBytes = sizeof(S) * (n - 1) * batch_count; + hipMemcpyKind const kind = hipMemcpyHostToDevice; -HIP_CHECK( hipHostFree( hD ) ); hD = nullptr; -HIP_CHECK( hipHostFree( hE ) ); hE = nullptr; + HIP_CHECK(hipMemcpyAsync(dst, 
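// For reference: v_ can be handed to the LAPACK-style routine as vt_ above because this argument
// is assumed to already store V**T, whose rows are the right singular vectors; the row updates
// applied to vt_ touch exactly the data of the corresponding columns of V, so no transpose or
// copy is needed.  Illustrative row-combining helper for a column-major matrix A with leading
// dimension ld (0-based rows i and j; not a routine from this file):
template <typename T, typename I>
static void rotate_rows(I ncols, T* A, I ld, I i, I j, T c, T s)
{
    for(I col = 0; col < ncols; col++)
    {
        T const ai = A[i + size_t(col) * ld];
        T const aj = A[j + size_t(col) * ld];
        A[i + size_t(col) * ld] = c * ai + s * aj; // row i <- c*row_i + s*row_j
        A[j + size_t(col) * ld] = c * aj - s * ai; // row j <- c*row_j - s*row_i
    }
}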
src, sizeBytes, kind, stream)); + } + else + { + for(I bid = 0; bid < batch_count; bid++) + { + void* const src = (void*)&(hE[bid * (n - 1)]); + void* const dst = (void*)(E + bid * strideE); + size_t const sizeBytes = sizeof(S) * (n - 1); + hipMemcpyKind const kind = hipMemcpyHostToDevice; + HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); + } + } -return( rocblas_status_success ); + HIP_CHECK(hipHostFree(hD)); + hD = nullptr; + HIP_CHECK(hipHostFree(hE)); + hE = nullptr; + return (rocblas_status_success); } - ROCSOLVER_END_NAMESPACE diff --git a/library/src/auxiliary/rocauxiliary_bdsqr.hpp b/library/src/auxiliary/rocauxiliary_bdsqr.hpp index 340dfa32a..deb38adb9 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr.hpp @@ -38,6 +38,8 @@ #include +#include "bdsqr_host.hpp" + ROCSOLVER_BEGIN_NAMESPACE /************** Kernels and device functions *******************/ @@ -1079,21 +1081,35 @@ rocblas_status rocsolver_bdsqr_template(rocblas_handle handle, ROCSOLVER_LAUNCH_KERNEL((bdsqr_init), grid1, threads1, 0, stream, n, D, strideD, E, strideE, info, maxiter, sfm, tol, splits_map, work, incW, strideW); + bool const use_bdsqr_host = true; + rocblas_status istat = rocblas_status_success; if(n > 1) { - // rotate to upper bidiagonal if necessary - if(uplo == rocblas_fill_lower) + if(use_bdsqr_host) { - ROCSOLVER_LAUNCH_KERNEL((bdsqr_lower2upper), grid1, threads2, 0, stream, n, nu, nc, - D, strideD, E, strideE, U, shiftU, ldu, strideU, C, shiftC, ldc, - strideC, info, work, strideW); + rocblas_fill const uplo_in = uplo; + auto const info_array = info; + istat = rocsolver_bdsqr_host_batch_template( + handle, uplo_in, n, nv, nu, nc, D, strideD, E, strideE, V, shiftV, ldv, strideV, U, + shiftU, ldu, strideU, C, shiftC, ldc, strideC, info_array, batch_count, splits_map, + work); } + else + { + // rotate to upper bidiagonal if necessary + if(uplo == rocblas_fill_lower) + { + ROCSOLVER_LAUNCH_KERNEL((bdsqr_lower2upper), grid1, threads2, 0, stream, n, nu, + nc, D, strideD, E, strideE, U, shiftU, ldu, strideU, C, + shiftC, ldc, strideC, info, work, strideW); + } - // main computation of SVD - ROCSOLVER_LAUNCH_KERNEL((bdsqr_kernel), grid2, threads3, 0, stream, n, nv, nu, nc, D, - strideD, E, strideE, V, shiftV, ldv, strideV, U, shiftU, ldu, - strideU, C, shiftC, ldc, strideC, info, maxiter, eps, sfm, tol, - minshift, splits_map, work, incW, strideW); + // main computation of SVD + ROCSOLVER_LAUNCH_KERNEL((bdsqr_kernel), grid2, threads3, 0, stream, n, nv, nu, nc, D, + strideD, E, strideE, V, shiftV, ldv, strideV, U, shiftU, ldu, + strideU, C, shiftC, ldc, strideC, info, maxiter, eps, sfm, tol, + minshift, splits_map, work, incW, strideW); + } } // sort the singular values and vectors @@ -1101,7 +1117,7 @@ rocblas_status rocsolver_bdsqr_template(rocblas_handle handle, E, strideE, V, shiftV, ldv, strideV, U, shiftU, ldu, strideU, C, shiftC, ldc, strideC, info, splits_map); - return rocblas_status_success; + return istat; } ROCSOLVER_END_NAMESPACE From 55d0861126d7bb08f122c79854905c7ce17f4e7d Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Thu, 30 May 2024 15:04:58 -0400 Subject: [PATCH 03/35] debug snapshot --- library/src/auxiliary/bdsqr_host.hpp | 75 ++++++++++++++++---- library/src/auxiliary/rocauxiliary_bdsqr.hpp | 4 +- 2 files changed, 63 insertions(+), 16 deletions(-) diff --git a/library/src/auxiliary/bdsqr_host.hpp b/library/src/auxiliary/bdsqr_host.hpp index 5d3b0f6c9..69b283317 100644 --- a/library/src/auxiliary/bdsqr_host.hpp 
+++ b/library/src/auxiliary/bdsqr_host.hpp @@ -2365,6 +2365,8 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, I* splits_map, S* work) { + int const idebug = 1; + // ------------------------- // copy D into hD, E into hE // ------------------------- @@ -2375,11 +2377,26 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, S* hD = nullptr; S* hE = nullptr; HIP_CHECK(hipHostMalloc(&hD, sizeof(S) * n * batch_count)); - // ------------------------------------------------------------------ // Need to double checking whether array E is length n or length (n-1) // ------------------------------------------------------------------ - HIP_CHECK(hipHostMalloc(&hE, sizeof(S) * (n - 1) * batch_count)); + bool const use_single_copy_for_E = (strideE == (n - 1)) || (strideE == (n)); + size_t E_size = use_single_copy_for_E ? strideE : (n - 1); + HIP_CHECK(hipHostMalloc(&hE, sizeof(S) * E_size * batch_count)); + + // ---------------------------------------------------- + // copy info_array[] on device to linfo_array[] on host + // ---------------------------------------------------- + I* linfo_array = nullptr; + HIP_CHECK(hipHostMalloc(&linfo_array, sizeof(I) * batch_count)); + { + void* dst = &(linfo_array[0]); + void* src = info_array; + size_t nbytes = sizeof(I) * batch_count; + hipMemcpyKind const kind = hipMemcpyDeviceToHost; + + HIP_CHECK(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } // ------------------------------------------------- // transfer arrays D(:) and E(:) from Device to Host @@ -2407,12 +2424,11 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, } } - bool const use_single_copy_for_E = (strideE == (n - 1)); if(use_single_copy_for_E) { void* const dst = (void*)&(hE[0]); void* const src = (void*)&(E[0]); - size_t const sizeBytes = sizeof(S) * (n - 1) * batch_count; + size_t const sizeBytes = sizeof(S) * E_size * batch_count; hipMemcpyKind const kind = hipMemcpyDeviceToHost; HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); @@ -2421,21 +2437,27 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, { for(I bid = 0; bid < batch_count; bid++) { - void* const dst = (void*)&(hE[bid * (n - 1)]); + void* const dst = (void*)&(hE[bid * E_size]); void* const src = (void*)&(E[bid * strideE]); - size_t const sizeBytes = sizeof(S) * (n - 1); + size_t const sizeBytes = sizeof(S) * E_size; hipMemcpyKind const kind = hipMemcpyDeviceToHost; HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); } } + if(idebug >= 1) + { + printf("batch_count = %d\n", batch_count); + printf("n = %d, strideD = %ld, strideE = %ld\n", n, (int64_t)strideD, (int64_t)strideE); + } + for(I bid = 0; bid < batch_count; bid++) { std::vector hwork(4 * n); char uplo = (uplo_in == rocblas_fill_lower) ? 'L' : 'U'; S* d_ = &(hD[bid * n]); - S* e_ = &(hE[bid * (n - 1)]); + S* e_ = &(hE[bid * E_size]); T* v_ = (nv > 0) ? load_ptr_batch(V, bid, shiftV, strideV) : nullptr; T* u_ = (nu > 0) ? 
load_ptr_batch(U, bid, shiftU, strideU) : nullptr; @@ -2451,8 +2473,8 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, // ------------------------------------------------------- // NOTE: lapack dbdsqr() accepts "VT" and "ldvt" for transpose of V // as input variable - // However, rocsolver bdsqrt accepts variable called "V" and "ldv" - // but is actually holding "VT" + // However, rocsolver bdsqr() accepts variable called "V" and "ldv" + // but may be actually holding "VT" // ------------------------------------------------------- T* vt_ = v_; I ldvt = ldv; @@ -2469,7 +2491,10 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, #endif - info_array[bid] = info; + if(linfo_array[bid] == 0) + { + linfo_array[bid] = info; + } } // end for bid // ------------------------------------------------- @@ -2501,7 +2526,7 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, { void* const src = (void*)&(hE[0]); void* const dst = (void*)E; - size_t const sizeBytes = sizeof(S) * (n - 1) * batch_count; + size_t const sizeBytes = sizeof(S) * E_size * batch_count; hipMemcpyKind const kind = hipMemcpyHostToDevice; HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); @@ -2510,9 +2535,9 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, { for(I bid = 0; bid < batch_count; bid++) { - void* const src = (void*)&(hE[bid * (n - 1)]); + void* const src = (void*)&(hE[bid * E_size]); void* const dst = (void*)(E + bid * strideE); - size_t const sizeBytes = sizeof(S) * (n - 1); + size_t const sizeBytes = sizeof(S) * E_size; hipMemcpyKind const kind = hipMemcpyHostToDevice; HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); } @@ -2523,6 +2548,30 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, HIP_CHECK(hipHostFree(hE)); hE = nullptr; + if(idebug >= 1) + { + for(auto bid = 0; bid < batch_count; bid++) + { + printf("linfo_array[%d] = %d\n", bid, linfo_array[bid]); + } + } + + { + // ------------------------------------------------------ + // copy linfo_array[] from host to info_array[] on device + // ------------------------------------------------------ + + void* dst = (void*)info_array; + void* src = (void*)&(linfo_array[0]); + size_t nbytes = sizeof(I) * batch_count; + hipMemcpyKind const kind = hipMemcpyHostToDevice; + + HIP_CHECK(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + HIP_CHECK(hipHostFree(linfo_array)); + linfo_array = nullptr; + return (rocblas_status_success); } diff --git a/library/src/auxiliary/rocauxiliary_bdsqr.hpp b/library/src/auxiliary/rocauxiliary_bdsqr.hpp index deb38adb9..5718f50e0 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr.hpp @@ -1088,11 +1088,9 @@ rocblas_status rocsolver_bdsqr_template(rocblas_handle handle, if(use_bdsqr_host) { rocblas_fill const uplo_in = uplo; - auto const info_array = info; istat = rocsolver_bdsqr_host_batch_template( handle, uplo_in, n, nv, nu, nc, D, strideD, E, strideE, V, shiftV, ldv, strideV, U, - shiftU, ldu, strideU, C, shiftC, ldc, strideC, info_array, batch_count, splits_map, - work); + shiftU, ldu, strideU, C, shiftC, ldc, strideC, info, batch_count, splits_map, work); } else { From db55c9874a565b62527a647e86963c41522d6aaa Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Tue, 4 Jun 2024 08:39:52 -0400 Subject: [PATCH 04/35] debug snapshot --- library/src/auxiliary/bdsqr_host.hpp | 82 +++++++++++++++++++++++++--- 1 file changed, 74 
insertions(+), 8 deletions(-) diff --git a/library/src/auxiliary/bdsqr_host.hpp b/library/src/auxiliary/bdsqr_host.hpp index 69b283317..7bd8cd717 100644 --- a/library/src/auxiliary/bdsqr_host.hpp +++ b/library/src/auxiliary/bdsqr_host.hpp @@ -1,4 +1,4 @@ -#define USE_VT +#undef USE_VT /**************************************************************************** * Derived from the BSD3-licensed @@ -736,6 +736,22 @@ void slasv2_(float* f, float* snl, float* csl); +void crot_(int* n, + std::complex* cx, + int* incx, + std::complex* cy, + int* incy, + float* c, + std::complex* s); + +void zrot_(int* n, + std::complex* cx, + int* incx, + std::complex* cy, + int* incy, + double* c, + std::complex* s); + void zdrot_(int* n, std::complex* zx, int* incx, @@ -1181,6 +1197,7 @@ static void bdsqr_single_template(char uplo, S* const ds = dwork + mn_m1; HIP_CHECK(hipMemcpyAsync(dc, &c, sizeof(S) * mn_m1, hipMemcpyHostToDevice, stream)); HIP_CHECK(hipMemcpyAsync(ds, &s, sizeof(S) * mn_m1, hipMemcpyHostToDevice, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); lasr_template_gpu(side, pivot, direct, m, n, dc, ds, &A, lda, stream); HIP_CHECK(hipStreamSynchronize(stream)); @@ -1253,6 +1270,15 @@ static void bdsqr_single_template(char uplo, * test the input parameters. * */ + int const idebug = 2; + if(idebug >= 2) + { + printf("single entry\n"); + for(auto i = 1; i <= (n - 1); i++) + { + printf("e(%d) = %le\n", i, e(i)); + } + } info = (!upper) && (!lower) ? -1 : (n < 0) ? -2 @@ -1528,11 +1554,11 @@ static void bdsqr_single_template(char uplo, { if(use_gpu) { - call_rot_gpu(nrv, v(1, m - 1), 1, v(1, m), 1, cosr, sinr); + call_rot_gpu(nrv, v(1, m - 1), ione, v(1, m), ione, cosr, sinr); } else { - call_rot(nrv, v(1, m - 1), 1, v(1, m), 1, cosr, sinr); + call_rot(nrv, v(1, m - 1), ione, v(1, m), ione, cosr, sinr); } } #endif @@ -2221,11 +2247,11 @@ static void bdsqr_single_template(char uplo, { if(use_gpu) { - call_scal_gpu(nrv, negone, v(1, i), 1); + call_scal_gpu(nrv, negone, v(1, i), ione); } else { - call_scal(nrv, negone, v(1, i), 1); + call_scal(nrv, negone, v(1, i), ione); } } #endif @@ -2331,6 +2357,14 @@ static void bdsqr_single_template(char uplo, } L210: L220: + if(idebug >= 2) + { + printf("single exit\n"); + for(auto i = 1; i <= (n - 1); i++) + { + printf("e(%d) = %le\n", i, e(i)); + } + } return; /* * end of dbdsqr @@ -2365,7 +2399,7 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, I* splits_map, S* work) { - int const idebug = 1; + int const idebug = 2; // ------------------------- // copy D into hD, E into hE @@ -2380,7 +2414,9 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, // ------------------------------------------------------------------ // Need to double checking whether array E is length n or length (n-1) // ------------------------------------------------------------------ - bool const use_single_copy_for_E = (strideE == (n - 1)) || (strideE == (n)); + // bool const use_single_copy_for_E = (strideE == (n - 1)) || (strideE == (n)); + bool const use_single_copy_for_E = false; + size_t E_size = use_single_copy_for_E ? 
strideE : (n - 1); HIP_CHECK(hipHostMalloc(&hE, sizeof(S) * E_size * batch_count)); @@ -2402,7 +2438,8 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, // transfer arrays D(:) and E(:) from Device to Host // ------------------------------------------------- - bool const use_single_copy_for_D = (strideD == n); + // bool const use_single_copy_for_D = (strideD == n); + bool const use_single_copy_for_D = false; if(use_single_copy_for_D) { void* const dst = (void*)&(hD[0]); @@ -2445,10 +2482,25 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, } } + HIP_CHECK(hipStreamSynchronize(stream)); + if(idebug >= 1) { printf("batch_count = %d\n", batch_count); printf("n = %d, strideD = %ld, strideE = %ld\n", n, (int64_t)strideD, (int64_t)strideE); + printf("nv = %d, nu = %d, nc = %d\n", nv, nu, nc); + } + if(idebug >= 2) + { + printf("on entry\n"); + for(auto i = 0; i < n; i++) + { + printf("hD[%d] = %le\n", i, hD[i]); + } + for(auto i = 0; i < (n - 1); i++) + { + printf("hE[%d] = %le\n", i, hE[i]); + } } for(I bid = 0; bid < batch_count; bid++) @@ -2543,6 +2595,19 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, } } + if(idebug >= 2) + { + printf("on exit\n"); + for(auto i = 0; i < n; i++) + { + printf("hD[%d] = %le\n", i, hD[i]); + } + for(auto i = 0; i < (n - 1); i++) + { + printf("hE[%d] = %le\n", i, hE[i]); + } + } + HIP_CHECK(hipHostFree(hD)); hD = nullptr; HIP_CHECK(hipHostFree(hE)); @@ -2567,6 +2632,7 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, hipMemcpyKind const kind = hipMemcpyHostToDevice; HIP_CHECK(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); } HIP_CHECK(hipHostFree(linfo_array)); From 972ce8cf7e62f9d3a2acc8f361e5aa84d03012db Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Sun, 16 Jun 2024 14:40:06 -0400 Subject: [PATCH 05/35] debug snapshot --- library/src/auxiliary/bdsqr_host.hpp | 61 ++++++++++++++++++---------- 1 file changed, 39 insertions(+), 22 deletions(-) diff --git a/library/src/auxiliary/bdsqr_host.hpp b/library/src/auxiliary/bdsqr_host.hpp index 7bd8cd717..869114641 100644 --- a/library/src/auxiliary/bdsqr_host.hpp +++ b/library/src/auxiliary/bdsqr_host.hpp @@ -1,4 +1,4 @@ -#undef USE_VT +#define USE_VT /**************************************************************************** * Derived from the BSD3-licensed @@ -68,6 +68,7 @@ __global__ static void lasr_kernel(char const side, const auto nthreads_per_block = hipBlockDim_x; const auto nthreads = nblocks * nthreads_per_block; const auto tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + const auto i_start = tid; const auto i_inc = nthreads; const auto ij_nb = nthreads; const auto ij_start = tid; @@ -78,6 +79,7 @@ __global__ static void lasr_kernel(char const side, auto indx2f = [](auto i, auto j, auto lda) -> int64_t { assert((1 <= i)); assert((1 <= lda)); + assert(i <= lda); assert((1 <= j)); return ((i - 1) + (j - 1) * int64_t(lda)); @@ -150,7 +152,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j); if((ctemp != one) || (stemp != zero)) { - for(I i = 1 + tid; i <= n; i += i_inc) + for(I i = 1 + i_start; i <= n; i += i_inc) { const auto temp = A(j + 1, i); A(j + 1, i) = ctemp * temp - stemp * A(j, i); @@ -182,7 +184,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j); if((ctemp != one) || (stemp != zero)) { - for(I i = istart + tid; i <= iend; i += i_inc) + for(I i = istart + i_start; i <= iend; 
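// For reference: the loops being adjusted in these hunks are grid-stride loops -- each thread
// starts at its global index and advances by the total number of launched threads, so the kernel
// is correct for any grid/block configuration.  A minimal stand-alone HIP example of the same
// pattern (illustrative only, not part of this file):
template <typename T, typename I>
__global__ static void scale_kernel(I n, T alpha, T* x)
{
    I const tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
    I const nthreads = hipGridDim_x * hipBlockDim_x;
    for(I i = tid; i < n; i += nthreads)
        x[i] *= alpha;
}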
i += i_inc) { const auto temp = A(j + 1, i); A(j + 1, i) = ctemp * temp - stemp * A(j, i); @@ -206,7 +208,7 @@ __global__ static void lasr_kernel(char const side, { const auto ctemp = c(j - 1); const auto stemp = s(j - 1); - for(I i = 1 + tid; i <= n; i += i_inc) + for(I i = 1 + i_start; i <= n; i += i_inc) { const auto temp = A(j, i); A(j, i) = ctemp * temp - stemp * A(1, i); @@ -237,7 +239,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j - 1); if((ctemp != one) || (stemp != zero)) { - for(I i = istart + tid; i <= iend; i += i_inc) + for(I i = istart + i_start; i <= iend; i += i_inc) { const auto temp = A(j, i); @@ -270,7 +272,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j); if((ctemp != one) || (stemp != zero)) { - for(I i = istart + tid; i <= iend; i += i_inc) + for(I i = istart + i_start; i <= iend; i += i_inc) { const auto temp = A(j, i); A(j, i) = stemp * A(m, i) + ctemp * temp; @@ -302,7 +304,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j); if((ctemp != one) || (stemp != zero)) { - for(I i = istart + tid; i <= iend; i += i_inc) + for(I i = istart + i_start; i <= iend; i += i_inc) { const auto temp = A(j, i); A(j, i) = stemp * A(m, i) + ctemp * temp; @@ -335,7 +337,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j); if((ctemp != one) || (stemp != zero)) { - for(I i = istart + tid; i <= iend; i += i_inc) + for(I i = istart + i_start; i <= iend; i += i_inc) { const auto temp = A(i, j + 1); A(i, j + 1) = ctemp * temp - stemp * A(i, j); @@ -368,7 +370,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j); if((ctemp != one) || (stemp != zero)) { - for(I i = istart + tid; i <= iend; i += i_inc) + for(I i = istart + i_start; i <= iend; i += i_inc) { const auto temp = A(i, j + 1); A(i, j + 1) = ctemp * temp - stemp * A(i, j); @@ -400,7 +402,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j - 1); if((ctemp != one) || (stemp != zero)) { - for(I i = istart + tid; i <= iend; i += i_inc) + for(I i = istart + i_start; i <= iend; i += i_inc) { const auto temp = A(i, j); @@ -434,7 +436,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j - 1); if((ctemp != one) || (stemp != zero)) { - for(I i = istart + tid; i <= iend; i += i_inc) + for(I i = istart + i_start; i <= iend; i += i_inc) { const auto temp = A(i, j); @@ -468,7 +470,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j); if((ctemp != one) || (stemp != zero)) { - for(I i = istart + tid; i <= iend; i += i_inc) + for(I i = istart + i_start; i <= iend; i += i_inc) { const auto temp = A(i, j); @@ -502,7 +504,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j); if((ctemp != one) || (stemp != zero)) { - for(I i = istart + tid; i <= iend; i += i_inc) + for(I i = istart + i_start; i <= iend; i += i_inc) { const auto temp = A(i, j); A(i, j) = stemp * A(i, n) + ctemp * temp; @@ -1197,10 +1199,10 @@ static void bdsqr_single_template(char uplo, S* const ds = dwork + mn_m1; HIP_CHECK(hipMemcpyAsync(dc, &c, sizeof(S) * mn_m1, hipMemcpyHostToDevice, stream)); HIP_CHECK(hipMemcpyAsync(ds, &s, sizeof(S) * mn_m1, hipMemcpyHostToDevice, stream)); - HIP_CHECK(hipStreamSynchronize(stream)); lasr_template_gpu(side, pivot, direct, m, n, dc, ds, &A, lda, stream); - HIP_CHECK(hipStreamSynchronize(stream)); + // HIP_CHECK(hipStreamSynchronize(stream)); + HIP_CHECK(hipDeviceSynchronize()); }; auto abs = 
[](auto x) { return ((x >= 0) ? x : (-x)); }; @@ -1278,6 +1280,7 @@ static void bdsqr_single_template(char uplo, { printf("e(%d) = %le\n", i, e(i)); } + fflush(stdout); } info = (!upper) && (!lower) ? -1 @@ -2121,14 +2124,18 @@ static void bdsqr_single_template(char uplo, } L150: e(ll) = f; - /* - * test convergence - */ + // ---------------- + // test convergence + // ---------------- if(abs(e(ll)) <= thresh) e(ll) = zero; - /* - * update singular vectors if desired - */ + + + + // ---------------------------------- + // update singular vectors if desired + // ---------------------------------- + #ifdef USE_VT if(ncvt > 0) { @@ -2209,6 +2216,7 @@ static void bdsqr_single_template(char uplo, call_lasr(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), ldc); } } + } } /* @@ -2364,6 +2372,7 @@ static void bdsqr_single_template(char uplo, { printf("e(%d) = %le\n", i, e(i)); } + fflush(stdout); } return; /* @@ -2503,6 +2512,9 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, } } + S * dwork = nullptr; + HIP_CHECK( hipMalloc( &dwork, sizeof(S)*(4*n)) ); + for(I bid = 0; bid < batch_count; bid++) { std::vector hwork(4 * n); @@ -2515,7 +2527,9 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, T* u_ = (nu > 0) ? load_ptr_batch(U, bid, shiftU, strideU) : nullptr; T* c_ = (nc > 0) ? load_ptr_batch(C, bid, shiftC, strideC) : nullptr; S* work_ = &(hwork[0]); - S* dwork = work; + // S* dwork = &(work[bid * (4 * n)]); + + I info = 0; I nru = nu; @@ -2613,6 +2627,9 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, HIP_CHECK(hipHostFree(hE)); hE = nullptr; + HIP_CHECK( hipFree( dwork )); + dwork = nullptr; + if(idebug >= 1) { for(auto bid = 0; bid < batch_count; bid++) From 62d6b5abdca0da6f3a60a31c8b73fc54e8ed1930 Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Sun, 16 Jun 2024 21:03:32 -0400 Subject: [PATCH 06/35] debug snapshot --- library/src/auxiliary/bdsqr_host.hpp | 608 ++++++++++++++------------- 1 file changed, 306 insertions(+), 302 deletions(-) diff --git a/library/src/auxiliary/bdsqr_host.hpp b/library/src/auxiliary/bdsqr_host.hpp index 869114641..c368c271d 100644 --- a/library/src/auxiliary/bdsqr_host.hpp +++ b/library/src/auxiliary/bdsqr_host.hpp @@ -1,4 +1,3 @@ -#define USE_VT /**************************************************************************** * Derived from the BSD3-licensed @@ -45,8 +44,8 @@ ROCSOLVER_BEGIN_NAMESPACE -#ifndef HIP_CHECK -#define HIP_CHECK(fcn) \ +#ifndef CHECK_HIP +#define CHECK_HIP(fcn) \ { \ hipError_t const istat = (fcn); \ assert(istat == hipSuccess); \ @@ -68,7 +67,6 @@ __global__ static void lasr_kernel(char const side, const auto nthreads_per_block = hipBlockDim_x; const auto nthreads = nblocks * nthreads_per_block; const auto tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; - const auto i_start = tid; const auto i_inc = nthreads; const auto ij_nb = nthreads; const auto ij_start = tid; @@ -79,7 +77,6 @@ __global__ static void lasr_kernel(char const side, auto indx2f = [](auto i, auto j, auto lda) -> int64_t { assert((1 <= i)); assert((1 <= lda)); - assert(i <= lda); assert((1 <= j)); return ((i - 1) + (j - 1) * int64_t(lda)); @@ -152,7 +149,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j); if((ctemp != one) || (stemp != zero)) { - for(I i = 1 + i_start; i <= n; i += i_inc) + for(I i = 1 + tid; i <= n; i += i_inc) { const auto temp = A(j + 1, i); A(j + 1, i) = ctemp * temp - stemp * A(j, i); @@ -184,7 +181,7 @@ __global__ static 
void lasr_kernel(char const side, const auto stemp = s(j); if((ctemp != one) || (stemp != zero)) { - for(I i = istart + i_start; i <= iend; i += i_inc) + for(I i = istart + tid; i <= iend; i += i_inc) { const auto temp = A(j + 1, i); A(j + 1, i) = ctemp * temp - stemp * A(j, i); @@ -208,7 +205,7 @@ __global__ static void lasr_kernel(char const side, { const auto ctemp = c(j - 1); const auto stemp = s(j - 1); - for(I i = 1 + i_start; i <= n; i += i_inc) + for(I i = 1 + tid; i <= n; i += i_inc) { const auto temp = A(j, i); A(j, i) = ctemp * temp - stemp * A(1, i); @@ -239,7 +236,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j - 1); if((ctemp != one) || (stemp != zero)) { - for(I i = istart + i_start; i <= iend; i += i_inc) + for(I i = istart + tid; i <= iend; i += i_inc) { const auto temp = A(j, i); @@ -272,7 +269,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j); if((ctemp != one) || (stemp != zero)) { - for(I i = istart + i_start; i <= iend; i += i_inc) + for(I i = istart + tid; i <= iend; i += i_inc) { const auto temp = A(j, i); A(j, i) = stemp * A(m, i) + ctemp * temp; @@ -304,7 +301,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j); if((ctemp != one) || (stemp != zero)) { - for(I i = istart + i_start; i <= iend; i += i_inc) + for(I i = istart + tid; i <= iend; i += i_inc) { const auto temp = A(j, i); A(j, i) = stemp * A(m, i) + ctemp * temp; @@ -337,7 +334,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j); if((ctemp != one) || (stemp != zero)) { - for(I i = istart + i_start; i <= iend; i += i_inc) + for(I i = istart + tid; i <= iend; i += i_inc) { const auto temp = A(i, j + 1); A(i, j + 1) = ctemp * temp - stemp * A(i, j); @@ -370,7 +367,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j); if((ctemp != one) || (stemp != zero)) { - for(I i = istart + i_start; i <= iend; i += i_inc) + for(I i = istart + tid; i <= iend; i += i_inc) { const auto temp = A(i, j + 1); A(i, j + 1) = ctemp * temp - stemp * A(i, j); @@ -402,7 +399,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j - 1); if((ctemp != one) || (stemp != zero)) { - for(I i = istart + i_start; i <= iend; i += i_inc) + for(I i = istart + tid; i <= iend; i += i_inc) { const auto temp = A(i, j); @@ -436,7 +433,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j - 1); if((ctemp != one) || (stemp != zero)) { - for(I i = istart + i_start; i <= iend; i += i_inc) + for(I i = istart + tid; i <= iend; i += i_inc) { const auto temp = A(i, j); @@ -470,7 +467,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j); if((ctemp != one) || (stemp != zero)) { - for(I i = istart + i_start; i <= iend; i += i_inc) + for(I i = istart + tid; i <= iend; i += i_inc) { const auto temp = A(i, j); @@ -504,7 +501,7 @@ __global__ static void lasr_kernel(char const side, const auto stemp = s(j); if((ctemp != one) || (stemp != zero)) { - for(I i = istart + i_start; i <= iend; i += i_inc) + for(I i = istart + tid; i <= iend; i += i_inc) { const auto temp = A(i, j); A(i, j) = stemp * A(i, n) + ctemp * temp; @@ -519,6 +516,7 @@ __global__ static void lasr_kernel(char const side, return; } + template static void lasr_template_gpu(char const side, char const pivot, @@ -738,22 +736,6 @@ void slasv2_(float* f, float* snl, float* csl); -void crot_(int* n, - std::complex* cx, - int* incx, - std::complex* cy, - int* incy, - float* c, - 
std::complex* s); - -void zrot_(int* n, - std::complex* cx, - int* incx, - std::complex* cy, - int* incy, - double* c, - std::complex* s); - void zdrot_(int* n, std::complex* zx, int* incx, @@ -783,6 +765,7 @@ void sscal_(int* n, float* da, float* zx, int* incx); void dlartg_(double* f, double* g, double* c, double* s, double* r); void slartg_(float* f, float* g, float* c, float* s, float* r); + void zlartg_(std::complex* f, std::complex* g, double* c, @@ -798,6 +781,191 @@ void dlas2_(double* f, double* g, double* h, double* ssmin, double* ssmax); void slas2_(float* f, float* g, float* h, float* ssmin, float* ssmax); }; +extern "C" { + +void cbdsqr_(char* uplo, + int* n, + int* ncvt, + int* nru, + int* ncc, + float* d, + float* e, + std::complex* vt, + int* ldvt, + std::complex* u, + int* ldu, + std::complex* c, + int* ldc, + float* rwork, + int* info); + +void zbdsqr_(char* uplo, + int* n, + int* ncvt, + int* nru, + int* ncc, + double* d, + double* e, + std::complex* vt, + int* ldvt, + std::complex* u, + int* ldu, + std::complex* c, + int* ldc, + double* rwork, + int* info); + +void sbdsqr_(char* uplo, + int* n, + int* ncvt, + int* nru, + int* ncc, + float* d, + float* e, + float* vt, + int* ldvt, + float* u, + int* ldu, + float* c, + int* ldc, + float* rwork, + int* info); + +void dbdsqr_(char* uplo, + int* n, + int* ncvt, + int* nru, + int* ncc, + double* d, + double* e, + double* vt, + int* ldvt, + double* u, + int* ldu, + double* c, + int* ldc, + double* rwork, + int* info); +}; + +static void call_bdsqr(char& uplo, + int& n, + int& ncvt, + int& nru, + int& ncc, + double& d, + double& e, + std::complex& vt, + int& ldvt, + std::complex& u, + int& ldu, + std::complex& c, + int& ldc, + double& rwork, + int& info) +{ + zbdsqr_(&uplo, &n, &ncvt, &nru, &ncc, &d, &e, (std::complex*)&vt, &ldvt, + (std::complex*)&u, &ldu, (std::complex*)&c, &ldc, &rwork, &info); +} + +static void call_bdsqr(char& uplo, + int& n, + int& ncvt, + int& nru, + int& ncc, + double& d, + double& e, + rocblas_complex_num& vt, + int& ldvt, + rocblas_complex_num& u, + int& ldu, + rocblas_complex_num& c, + int& ldc, + double& rwork, + int& info) +{ + zbdsqr_(&uplo, &n, &ncvt, &nru, &ncc, &d, &e, (std::complex*)&vt, &ldvt, + (std::complex*)&u, &ldu, (std::complex*)&c, &ldc, &rwork, &info); +} + +static void call_bdsqr(char& uplo, + int& n, + int& ncvt, + int& nru, + int& ncc, + float& d, + float& e, + std::complex& vt, + int& ldvt, + std::complex& u, + int& ldu, + std::complex& c, + int& ldc, + float& rwork, + int& info) +{ + cbdsqr_(&uplo, &n, &ncvt, &nru, &ncc, &d, &e, (std::complex*)&vt, &ldvt, + (std::complex*)&u, &ldu, (std::complex*)&c, &ldc, &rwork, &info); +} + +static void call_bdsqr(char& uplo, + int& n, + int& ncvt, + int& nru, + int& ncc, + float& d, + float& e, + float& vt, + int& ldvt, + float& u, + int& ldu, + float& c, + int& ldc, + float& rwork, + int& info) +{ + sbdsqr_(&uplo, &n, &ncvt, &nru, &ncc, &d, &e, &vt, &ldvt, &u, &ldu, &c, &ldc, &rwork, &info); +} + +static void call_bdsqr(char& uplo, + int& n, + int& ncvt, + int& nru, + int& ncc, + float& d, + float& e, + rocblas_complex_num& vt, + int& ldvt, + rocblas_complex_num& u, + int& ldu, + rocblas_complex_num& c, + int& ldc, + float& rwork, + int& info) +{ + cbdsqr_(&uplo, &n, &ncvt, &nru, &ncc, &d, &e, (std::complex*)&vt, &ldvt, + (std::complex*)&u, &ldu, (std::complex*)&c, &ldc, &rwork, &info); +} + +static void call_bdsqr(char& uplo, + int& n, + int& ncvt, + int& nru, + int& ncc, + double& d, + double& e, + double& vt, + int& ldvt, + double& u, + 
int& ldu, + double& c, + int& ldc, + double& rwork, + int& info) +{ + dbdsqr_(&uplo, &n, &ncvt, &nru, &ncc, &d, &e, &vt, &ldvt, &u, &ldu, &c, &ldc, &rwork, &info); +} + static void call_lamch(char& cmach_arg, double& eps) { char cmach = cmach_arg; @@ -1101,33 +1269,26 @@ static void call_lasr(char& side, template static void bdsqr_single_template(char uplo, I n, -#ifdef USE_VT I ncvt, -#else - I ncv, -#endif I nru, I ncc, + S* d_, S* e_, -#ifdef USE_VT + T* vt_, I ldvt, -#else - T* v_, - I ldv, -#endif T* u_, I ldu, T* c_, - int ldc, + I ldc, + S* work_, I& info, S* dwork = nullptr, hipStream_t stream = 0) { bool const use_gpu = (dwork != nullptr); - bool constexpr need_sort = false; S const zero = 0; S const one = 1; @@ -1140,18 +1301,12 @@ static void bdsqr_single_template(char uplo, I const maxitr = 6; I ione = 1; - I nrv = n; - bool const lower = (uplo == 'L') || (uplo == 'l'); bool const upper = (uplo == 'U') || (uplo == 'u'); /* * rotate is true if any singular vectors desired, false otherwise */ -#ifdef USE_VT bool const rotate = (ncvt > 0) || (nru > 0) || (ncc > 0); -#else - bool const rotate = (nrv > 0) || (nru > 0) || (ncc > 0); -#endif I i = 0, idir = 0, isub = 0, iter = 0, iterdivn = 0, j = 0, ll = 0, lll = 0, m = 0, maxitdivn = 0, nm1 = 0, nm12 = 0, nm13 = 0, oldll = 0, oldm = 0; @@ -1197,12 +1352,11 @@ static void bdsqr_single_template(char uplo, auto const mn_m1 = (mn - 1); S* const dc = dwork; S* const ds = dwork + mn_m1; - HIP_CHECK(hipMemcpyAsync(dc, &c, sizeof(S) * mn_m1, hipMemcpyHostToDevice, stream)); - HIP_CHECK(hipMemcpyAsync(ds, &s, sizeof(S) * mn_m1, hipMemcpyHostToDevice, stream)); + CHECK_HIP(hipMemcpyAsync(dc, &c, sizeof(S) * mn_m1, hipMemcpyHostToDevice, stream)); + CHECK_HIP(hipMemcpyAsync(ds, &s, sizeof(S) * mn_m1, hipMemcpyHostToDevice, stream)); lasr_template_gpu(side, pivot, direct, m, n, dc, ds, &A, lda, stream); - // HIP_CHECK(hipStreamSynchronize(stream)); - HIP_CHECK(hipDeviceSynchronize()); + CHECK_HIP(hipStreamSynchronize(stream)); }; auto abs = [](auto x) { return ((x >= 0) ? x : (-x)); }; @@ -1236,19 +1390,11 @@ static void bdsqr_single_template(char uplo, return (u_[indx2f(i, j, ldu)]); }; -#ifdef USE_VT auto vt = [=](auto i, auto j) -> T& { assert((1 <= i) && (i <= nrvt) && (nrvt <= ldvt)); assert((1 <= j) && (j <= ncvt)); return (vt_[indx2f(i, j, ldvt)]); }; -#else - auto v = [=](auto i, auto j) -> T& { - assert((1 <= i) && (i <= nrv) && (nrv <= ldv)); - assert((1 <= j) && (j <= ncv)); - return (v_[indx2f(i, j, ldv)]); - }; -#endif // --------------------------- // emulate Fortran intrinsics @@ -1272,28 +1418,16 @@ static void bdsqr_single_template(char uplo, * test the input parameters. * */ - int const idebug = 2; - if(idebug >= 2) - { - printf("single entry\n"); - for(auto i = 1; i <= (n - 1); i++) - { - printf("e(%d) = %le\n", i, e(i)); - } - fflush(stdout); - } info = (!upper) && (!lower) ? -1 : (n < 0) ? -2 -#ifdef USE_VT - : (ncvt < 0) ? -3 -#else - : (nrv < 0) ? -3 -#endif - : (nru < 0) ? -4 - : (ncc < 0) ? -5 - : (ldu < max(1, nru)) ? -11 - : 0; + : (ncvt < 0) ? -3 + : (nru < 0) ? -4 + : (ncc < 0) ? -5 + : ((ncvt == 0) && (ldvt < 1)) || ((ncvt > 0) && (ldvt < max(1, n)) ? -9 : (ldu < max(1, nru))) + ? -11 + : ((ncc == 0) && (ldc < 1)) || ((ncc > 0) && (ldc < max(1, n))) ? 
-13 + : 0; if(info != 0) return; @@ -1540,7 +1674,6 @@ static void bdsqr_single_template(char uplo, /* * compute singular vectors, if desired */ -#ifdef USE_VT if(ncvt > 0) { if(use_gpu) @@ -1552,19 +1685,6 @@ static void bdsqr_single_template(char uplo, call_rot(ncvt, vt(m - 1, 1), ldvt, vt(m, 1), ldvt, cosr, sinr); } } -#else - if(nrv > 0) - { - if(use_gpu) - { - call_rot_gpu(nrv, v(1, m - 1), ione, v(1, m), ione, cosr, sinr); - } - else - { - call_rot(nrv, v(1, m - 1), ione, v(1, m), ione, cosr, sinr); - } - } -#endif if(nru > 0) { if(use_gpu) @@ -1758,7 +1878,6 @@ static void bdsqr_single_template(char uplo, /* * update singular vectors */ -#ifdef USE_VT if(ncvt > 0) { // call_lasr( 'l', 'v', 'f', m-ll+1, ncvt, work( 1 ), work( n ), vt( @@ -1777,27 +1896,6 @@ static void bdsqr_single_template(char uplo, call_lasr(side, pivot, direct, mm, ncvt, work(1), work(n), vt(ll, 1), ldvt); } } -#else - - if(nrv > 0) - { - // call_lasr( 'r', 'v', 'f', nrv, m-ll+1, work( 1 ), work( n ), - // v(1,ll ), ldv) - char side = 'R'; - char pivot = 'V'; - char direct = 'F'; - auto mm = m - ll + 1; - if(use_gpu) - { - call_lasr_gpu(side, pivot, direct, nrv, mm, work(1), work(n), v(1, ll), ldv, - dwork, stream); - } - else - { - call_lasr(side, pivot, direct, nrv, mm, work(1), work(n), v(1, ll), ldv); - } - } -#endif if(nru > 0) { // call_lasr( 'r', 'v', 'f', nru, m-ll+1, work( nm12+1 ), work( nm13+1 @@ -1875,7 +1973,6 @@ static void bdsqr_single_template(char uplo, /* * update singular vectors */ -#ifdef USE_VT if(ncvt > 0) { // call_lasr( 'l', 'v', 'b', m-ll+1, ncvt, work( nm12+1 ), work( @@ -1896,29 +1993,6 @@ static void bdsqr_single_template(char uplo, vt(ll, 1), ldvt); } } -#else - - if(nrv > 0) - { - // call_lasr( 'r', 'v', 'b', nrv, m-ll+1, work( nm12+1 ), work( nm13+1 ), v( 1,ll ), ldv ); - char side = 'R'; - char pivot = 'V'; - char direct = 'B'; - auto mm = m - ll + 1; - if(use_gpu) - { - call_lasr_gpu(side, pivot, direct, nrv, mm, work(nm12 + 1), work(nm13 + 1), - v(1, ll), ldv, dwork, stream); - } - else - { - call_lasr(side, pivot, direct, nrv, mm, work(nm12 + 1), work(nm13 + 1), - v(1, ll), ldv); - } - } - -#endif - if(nru > 0) { // call_lasr( 'r', 'v', 'b', nru, m-ll+1, work( 1 ), work( n ), u( 1, @@ -2006,7 +2080,6 @@ static void bdsqr_single_template(char uplo, /* * update singular vectors */ -#ifdef USE_VT if(ncvt > 0) { // call_lasr( 'l', 'v', 'f', m-ll+1, ncvt, work( 1 ), work( n ), vt( @@ -2025,26 +2098,6 @@ static void bdsqr_single_template(char uplo, call_lasr(side, pivot, direct, mm, ncvt, work(1), work(n), vt(ll, 1), ldvt); } } -#else - - if(nrv > 0) - { - // call_lasr( 'r', 'v', 'f', nrv, m-ll+1, work( 1 ), work( n ), v( 1, ll ), ldv ) - char side = 'R'; - char pivot = 'V'; - char direct = 'F'; - auto mm = m - ll + 1; - if(use_gpu) - { - call_lasr_gpu(side, pivot, direct, nrv, mm, work(1), work(n), v(1, ll), ldv, - dwork, stream); - } - else - { - call_lasr(side, pivot, direct, nrv, mm, work(1), work(n), v(1, ll), ldv); - } - } -#endif if(nru > 0) { @@ -2124,19 +2177,14 @@ static void bdsqr_single_template(char uplo, } L150: e(ll) = f; - // ---------------- - // test convergence - // ---------------- + /* + * test convergence + */ if(abs(e(ll)) <= thresh) e(ll) = zero; - - - - // ---------------------------------- - // update singular vectors if desired - // ---------------------------------- - -#ifdef USE_VT + /* + * update singular vectors if desired + */ if(ncvt > 0) { // call_lasr( 'l', 'v', 'b', m-ll+1, ncvt, work( nm12+1 ), work( @@ -2157,27 +2205,6 @@ static void 
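// A hedged sketch, not part of the patch: the argument validation rewritten
// near the start of this hunk folds every LAPACK xBDSQR check into a single
// nested conditional, which is easy to mis-parenthesize.  The same checks,
// written out sequentially with illustrative names (error codes follow the
// xBDSQR convention this routine assumes):
template <typename I>
static I bdsqr_check_args(bool upper, bool lower, I n, I ncvt, I nru, I ncc, I ldvt, I ldu, I ldc)
{
    auto max = [](I x, I y) { return (x > y) ? x : y; };

    if(!upper && !lower) return -1;    // uplo is neither upper nor lower bidiagonal
    if(n < 0)    return -2;
    if(ncvt < 0) return -3;
    if(nru < 0)  return -4;
    if(ncc < 0)  return -5;
    if(((ncvt == 0) && (ldvt < 1)) || ((ncvt > 0) && (ldvt < max(I(1), n))))
        return -9;                     // bad ldvt
    if(ldu < max(I(1), nru))
        return -11;                    // bad ldu
    if(((ncc == 0) && (ldc < 1)) || ((ncc > 0) && (ldc < max(I(1), n))))
        return -13;                    // bad ldc
    return 0;                          // arguments are consistent
}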
bdsqr_single_template(char uplo, vt(ll, 1), ldvt); } } -#else - - if(nrv > 0) - { - // call_lasr( 'r', 'v', 'b', nrv, m-ll+1, work( nm12+1 ), work( nm13+1 ), v( 1,ll ), ldv ) - char side = 'L'; - char pivot = 'V'; - char direct = 'B'; - auto mm = m - ll + 1; - if(use_gpu) - { - call_lasr_gpu(side, pivot, direct, nrv, mm, work(nm12 + 1), work(nm13 + 1), - v(1, ll), ldv, dwork, stream); - } - else - { - call_lasr(side, pivot, direct, nrv, mm, work(nm12 + 1), work(nm13 + 1), - v(1, ll), ldv); - } - } -#endif if(nru > 0) { // call_lasr( 'r', 'v', 'b', nru, m-ll+1, work( 1 ), work( n ), u( 1, @@ -2216,7 +2243,6 @@ static void bdsqr_single_template(char uplo, call_lasr(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), ldc); } } - } } /* @@ -2237,7 +2263,6 @@ static void bdsqr_single_template(char uplo, /* * change sign of singular vectors, if desired */ -#ifdef USE_VT if(ncvt > 0) { if(use_gpu) @@ -2249,107 +2274,73 @@ static void bdsqr_single_template(char uplo, call_scal(ncvt, negone, vt(i, 1), ldvt); } } -#else - - if(nrv > 0) - { - if(use_gpu) - { - call_scal_gpu(nrv, negone, v(1, i), ione); - } - else - { - call_scal(nrv, negone, v(1, i), ione); - } - } -#endif } } L170: - - if(need_sort) - { - /* + /* * sort the singular values into decreasing order (insertion sort on * singular values, but only one transposition per singular vector) */ - // do 190 i = 1, n - 1 - for(i = 1; i <= (n - 1); i++) - { - /* + // do 190 i = 1, n - 1 + for(i = 1; i <= (n - 1); i++) + { + /* * scan for smallest d(i) */ - isub = 1; - smin = d(1); - // do 180 j = 2, n + 1 - i - for(j = 2; j <= (n + 1 - i); j++) + isub = 1; + smin = d(1); + // do 180 j = 2, n + 1 - i + for(j = 2; j <= (n + 1 - i); j++) + { + if(d(j) <= smin) { - if(d(j) <= smin) + isub = j; + smin = d(j); + } + } + L180: + if(isub != n + 1 - i) + { + /* + * swap singular values and vectors + */ + d(isub) = d(n + 1 - i); + d(n + 1 - i) = smin; + if(ncvt > 0) + { + if(use_gpu) { - isub = j; - smin = d(j); + call_swap_gpu(ncvt, vt(isub, 1), ldvt, vt(n + 1 - i, 1), ldvt); + } + else + { + call_swap(ncvt, vt(isub, 1), ldvt, vt(n + 1 - i, 1), ldvt); } } - L180: - if(isub != n + 1 - i) + if(nru > 0) { - /* - * swap singular values and vectors - */ - d(isub) = d(n + 1 - i); - d(n + 1 - i) = smin; -#ifdef USE_VT - if(ncvt > 0) + if(use_gpu) { - if(use_gpu) - { - call_swap_gpu(ncvt, vt(isub, 1), ldvt, vt(n + 1 - i, 1), ldvt); - } - else - { - call_swap(ncvt, vt(isub, 1), ldvt, vt(n + 1 - i, 1), ldvt); - } + call_swap_gpu(nru, u(1, isub), ione, u(1, n + 1 - i), ione); } -#else - - if(nrv > 0) + else { - if(use_gpu) - { - call_swap_gpu(nrv, v(1, isub), ione, v(1, n + 1 - i), ione); - } - else - { - call_swap(nrv, v(1, isub), ione, v(1, n + 1 - i), ione); - } + call_swap(nru, u(1, isub), ione, u(1, n + 1 - i), ione); } -#endif - if(nru > 0) + } + if(ncc > 0) + { + if(use_gpu) { - if(use_gpu) - { - call_swap_gpu(nru, u(1, isub), ione, u(1, n + 1 - i), ione); - } - else - { - call_swap(nru, u(1, isub), ione, u(1, n + 1 - i), ione); - } + call_swap_gpu(ncc, c(isub, 1), ldc, c(n + 1 - i, 1), ldc); } - if(ncc > 0) + else { - if(use_gpu) - { - call_swap_gpu(ncc, c(isub, 1), ldc, c(n + 1 - i, 1), ldc); - } - else - { - call_swap(ncc, c(isub, 1), ldc, c(n + 1 - i, 1), ldc); - } + call_swap(ncc, c(isub, 1), ldc, c(n + 1 - i, 1), ldc); } } } - } // end if (need_sort) - + } L190: goto L220; /* @@ -2365,15 +2356,6 @@ static void bdsqr_single_template(char uplo, } L210: L220: - if(idebug >= 2) - { - printf("single exit\n"); - for(auto i = 1; i <= (n - 1); i++) - { - 
printf("e(%d) = %le\n", i, e(i)); - } - fflush(stdout); - } return; /* * end of dbdsqr @@ -2419,15 +2401,14 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, S* hD = nullptr; S* hE = nullptr; - HIP_CHECK(hipHostMalloc(&hD, sizeof(S) * n * batch_count)); + HIP_CHECK(hipHostMalloc(&hD, (sizeof(S) * std::max(1, batch_count)) * n)); // ------------------------------------------------------------------ // Need to double checking whether array E is length n or length (n-1) // ------------------------------------------------------------------ - // bool const use_single_copy_for_E = (strideE == (n - 1)) || (strideE == (n)); - bool const use_single_copy_for_E = false; + bool const use_single_copy_for_E = (batch_count == 1) || (strideE == (n - 1)) || (strideE == (n)); - size_t E_size = use_single_copy_for_E ? strideE : (n - 1); - HIP_CHECK(hipHostMalloc(&hE, sizeof(S) * E_size * batch_count)); + size_t E_size = (batch_count == 1) || (strideE == (n - 1)) ? (n - 1) : n; + HIP_CHECK(hipHostMalloc(&hE, (sizeof(S) * std::max(1, batch_count)) * E_size)); // ---------------------------------------------------- // copy info_array[] on device to linfo_array[] on host @@ -2447,8 +2428,7 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, // transfer arrays D(:) and E(:) from Device to Host // ------------------------------------------------- - // bool const use_single_copy_for_D = (strideD == n); - bool const use_single_copy_for_D = false; + bool const use_single_copy_for_D = (batch_count == 1) || (strideD == n); if(use_single_copy_for_D) { void* const dst = (void*)&(hD[0]); @@ -2512,8 +2492,8 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, } } - S * dwork = nullptr; - HIP_CHECK( hipMalloc( &dwork, sizeof(S)*(4*n)) ); + S* dwork = nullptr; + HIP_CHECK(hipMalloc(&dwork, sizeof(S) * (4 * n))); for(I bid = 0; bid < batch_count; bid++) { @@ -2529,13 +2509,11 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, S* work_ = &(hwork[0]); // S* dwork = &(work[bid * (4 * n)]); - I info = 0; I nru = nu; I ncc = nc; -#ifdef USE_VT // ------------------------------------------------------- // NOTE: lapack dbdsqr() accepts "VT" and "ldvt" for transpose of V // as input variable @@ -2547,15 +2525,39 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, I nrv = n; I ncvt = nv; - bdsqr_single_template(uplo, n, ncvt, nru, ncc, d_, e_, vt_, ldvt, u_, ldu, c_, ldc, - work_, info, dwork, stream); -#else - I ncv = nv; + bool const values_only = (ncvt == 0) && (nru == 0) && (ncc == 0); + if(values_only) + { + // -------------------------------- + // call the lapack version of bdsqr + // -------------------------------- + auto ln = n; + auto lncvt = ncvt; + auto lnru = nru; + auto lncc = ncc; + S& d_arg = d_[0]; + S& e_arg = e_[0]; + T& vt_arg = vt_[0]; + T& u_arg = u_[0]; + T& c_arg = c_[0]; + S& work_arg = work_[0]; + auto ldvt_arg = ldvt; + auto ldu_arg = ldu; + auto ldc_arg = ldc; + + call_bdsqr(uplo, ln, lncvt, lnru, lncc, d_arg, e_arg, vt_arg, ldvt_arg, u_arg, ldu_arg, + c_arg, ldu_arg, work_arg, info); + } + else + { + bdsqr_single_template(uplo, n, ncvt, nru, ncc, - bdsqr_single_template(uplo, n, ncv, nru, ncc, d_, e_, v_, ldv, u_, ldu, c_, ldc, - work_, info, dwork, stream); + d_, e_, -#endif + vt_, ldvt, u_, ldu, c_, ldc, + + work_, info, dwork, stream); + } if(linfo_array[bid] == 0) { @@ -2609,6 +2611,8 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, } } + 
HIP_CHECK(hipStreamSynchronize(stream)); + if(idebug >= 2) { printf("on exit\n"); @@ -2627,7 +2631,7 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, HIP_CHECK(hipHostFree(hE)); hE = nullptr; - HIP_CHECK( hipFree( dwork )); + HIP_CHECK(hipFree(dwork)); dwork = nullptr; if(idebug >= 1) From 3fc89f708bebc0e44d3fa3a1b40d364fd94d7c03 Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Mon, 17 Jun 2024 15:53:27 -0400 Subject: [PATCH 07/35] debug snapshot --- library/src/auxiliary/bdsqr_host.hpp | 111 +++++++++++++++------------ 1 file changed, 60 insertions(+), 51 deletions(-) diff --git a/library/src/auxiliary/bdsqr_host.hpp b/library/src/auxiliary/bdsqr_host.hpp index c368c271d..8ee126a9a 100644 --- a/library/src/auxiliary/bdsqr_host.hpp +++ b/library/src/auxiliary/bdsqr_host.hpp @@ -2401,29 +2401,28 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, S* hD = nullptr; S* hE = nullptr; - HIP_CHECK(hipHostMalloc(&hD, (sizeof(S) * std::max(1, batch_count)) * n)); - // ------------------------------------------------------------------ - // Need to double checking whether array E is length n or length (n-1) - // ------------------------------------------------------------------ - bool const use_single_copy_for_E = (batch_count == 1) || (strideE == (n - 1)) || (strideE == (n)); - size_t E_size = (batch_count == 1) || (strideE == (n - 1)) ? (n - 1) : n; + size_t const E_size = (n - 1); + HIP_CHECK(hipHostMalloc(&hD, (sizeof(S) * std::max(1, batch_count)) * n)); HIP_CHECK(hipHostMalloc(&hE, (sizeof(S) * std::max(1, batch_count)) * E_size)); // ---------------------------------------------------- // copy info_array[] on device to linfo_array[] on host // ---------------------------------------------------- I* linfo_array = nullptr; - HIP_CHECK(hipHostMalloc(&linfo_array, sizeof(I) * batch_count)); + HIP_CHECK(hipHostMalloc(&linfo_array, sizeof(I) * std::max(1, batch_count))); { - void* dst = &(linfo_array[0]); - void* src = info_array; - size_t nbytes = sizeof(I) * batch_count; + void* const dst = &(linfo_array[0]); + void* const src = &(info_array[0]); + size_t const nbytes = sizeof(I) * batch_count; hipMemcpyKind const kind = hipMemcpyDeviceToHost; HIP_CHECK(hipMemcpyAsync(dst, src, nbytes, kind, stream)); } + S* hwork = nullptr; + HIP_CHECK(hipHostMalloc(&hwork, sizeof(S) * (4 * n))); + // ------------------------------------------------- // transfer arrays D(:) and E(:) from Device to Host // ------------------------------------------------- @@ -2450,16 +2449,6 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, } } - if(use_single_copy_for_E) - { - void* const dst = (void*)&(hE[0]); - void* const src = (void*)&(E[0]); - size_t const sizeBytes = sizeof(S) * E_size * batch_count; - hipMemcpyKind const kind = hipMemcpyDeviceToHost; - - HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); - } - else { for(I bid = 0; bid < batch_count; bid++) { @@ -2497,7 +2486,17 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, for(I bid = 0; bid < batch_count; bid++) { - std::vector hwork(4 * n); + if(idebug >= 1) + { + printf("on entry: linfo_array[%d] = %d\n", bid, linfo_array[bid]); + } + + if(linfo_array[bid] != 0) + { + continue; + }; + + // std::vector hwork(4 * n); char uplo = (uplo_in == rocblas_fill_lower) ? 
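// A hedged sketch, not part of the patch: the loops above stage the strided
// device arrays D and E into the pinned host buffers hD/hE one batch instance
// at a time before the host-side iteration runs.  The pattern they follow,
// with illustrative names (HIP_CHECK is the error-checking macro already used
// in this file):
template <typename S, typename I>
static void copy_strided_to_host(const S* dA, rocblas_stride strideA, S* hA,
                                 I len, I batch_count, hipStream_t stream)
{
    for(I bid = 0; bid < batch_count; bid++)
    {
        const void* const src = dA + bid * strideA;   // start of instance bid on the device
        void* const dst = hA + bid * size_t(len);     // packed contiguously on the host
        HIP_CHECK(hipMemcpyAsync(dst, src, sizeof(S) * len, hipMemcpyDeviceToHost, stream));
    }
    // all copies are ordered on the same stream; synchronize once before the
    // host-side code reads hA
    HIP_CHECK(hipStreamSynchronize(stream));
}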
'L' : 'U'; S* d_ = &(hD[bid * n]); @@ -2559,6 +2558,18 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, work_, info, dwork, stream); } + if(info == 0) + { + // ---------------------------- + // explicitly zero out "E" array + // ---------------------------- + S const zero = S(0); + for(I i = 0; i < (n - 1); i++) + { + e_[i] = zero; + } + } + if(linfo_array[bid] == 0) { linfo_array[bid] = info; @@ -2590,28 +2601,32 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, } } - if(use_single_copy_for_E) - { - void* const src = (void*)&(hE[0]); - void* const dst = (void*)E; - size_t const sizeBytes = sizeof(S) * E_size * batch_count; - hipMemcpyKind const kind = hipMemcpyHostToDevice; - - HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); - } - else { for(I bid = 0; bid < batch_count; bid++) { void* const src = (void*)&(hE[bid * E_size]); - void* const dst = (void*)(E + bid * strideE); + void* const dst = (void*)&(E[bid * strideE]); size_t const sizeBytes = sizeof(S) * E_size; hipMemcpyKind const kind = hipMemcpyHostToDevice; HIP_CHECK(hipMemcpyAsync(dst, src, sizeBytes, kind, stream)); } } + { + // ------------------------------------------------------ + // copy linfo_array[] from host to info_array[] on device + // ------------------------------------------------------ + + void* const src = (void*)&(linfo_array[0]); + void* const dst = (void*)&(info_array[0]); + size_t const nbytes = sizeof(I) * batch_count; + hipMemcpyKind const kind = hipMemcpyHostToDevice; + + HIP_CHECK(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + HIP_CHECK(hipStreamSynchronize(stream)); + HIP_CHECK(hipDeviceSynchronize()); if(idebug >= 2) { @@ -2626,14 +2641,6 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, } } - HIP_CHECK(hipHostFree(hD)); - hD = nullptr; - HIP_CHECK(hipHostFree(hE)); - hE = nullptr; - - HIP_CHECK(hipFree(dwork)); - dwork = nullptr; - if(idebug >= 1) { for(auto bid = 0; bid < batch_count; bid++) @@ -2642,19 +2649,21 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, } } - { - // ------------------------------------------------------ - // copy linfo_array[] from host to info_array[] on device - // ------------------------------------------------------ + // ---------------------- + // free allocated storage + // ---------------------- - void* dst = (void*)info_array; - void* src = (void*)&(linfo_array[0]); - size_t nbytes = sizeof(I) * batch_count; - hipMemcpyKind const kind = hipMemcpyHostToDevice; + HIP_CHECK(hipHostFree(hwork)); + hwork = nullptr; - HIP_CHECK(hipMemcpyAsync(dst, src, nbytes, kind, stream)); - HIP_CHECK(hipStreamSynchronize(stream)); - } + HIP_CHECK(hipHostFree(hD)); + hD = nullptr; + + HIP_CHECK(hipHostFree(hE)); + hE = nullptr; + + HIP_CHECK(hipFree(dwork)); + dwork = nullptr; HIP_CHECK(hipHostFree(linfo_array)); linfo_array = nullptr; From 3381870b6c729aaa3bb271d04795559be846625f Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Mon, 17 Jun 2024 16:12:56 -0400 Subject: [PATCH 08/35] initial working version --- library/src/auxiliary/bdsqr_host.hpp | 48 ---------------------------- 1 file changed, 48 deletions(-) diff --git a/library/src/auxiliary/bdsqr_host.hpp b/library/src/auxiliary/bdsqr_host.hpp index 8ee126a9a..0ec3436b1 100644 --- a/library/src/auxiliary/bdsqr_host.hpp +++ b/library/src/auxiliary/bdsqr_host.hpp @@ -2390,8 +2390,6 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, I* splits_map, S* work) { - int const idebug = 
2; - // ------------------------- // copy D into hD, E into hE // ------------------------- @@ -2462,35 +2460,11 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, HIP_CHECK(hipStreamSynchronize(stream)); - if(idebug >= 1) - { - printf("batch_count = %d\n", batch_count); - printf("n = %d, strideD = %ld, strideE = %ld\n", n, (int64_t)strideD, (int64_t)strideE); - printf("nv = %d, nu = %d, nc = %d\n", nv, nu, nc); - } - if(idebug >= 2) - { - printf("on entry\n"); - for(auto i = 0; i < n; i++) - { - printf("hD[%d] = %le\n", i, hD[i]); - } - for(auto i = 0; i < (n - 1); i++) - { - printf("hE[%d] = %le\n", i, hE[i]); - } - } - S* dwork = nullptr; HIP_CHECK(hipMalloc(&dwork, sizeof(S) * (4 * n))); for(I bid = 0; bid < batch_count; bid++) { - if(idebug >= 1) - { - printf("on entry: linfo_array[%d] = %d\n", bid, linfo_array[bid]); - } - if(linfo_array[bid] != 0) { continue; @@ -2626,28 +2600,6 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, } HIP_CHECK(hipStreamSynchronize(stream)); - HIP_CHECK(hipDeviceSynchronize()); - - if(idebug >= 2) - { - printf("on exit\n"); - for(auto i = 0; i < n; i++) - { - printf("hD[%d] = %le\n", i, hD[i]); - } - for(auto i = 0; i < (n - 1); i++) - { - printf("hE[%d] = %le\n", i, hE[i]); - } - } - - if(idebug >= 1) - { - for(auto bid = 0; bid < batch_count; bid++) - { - printf("linfo_array[%d] = %d\n", bid, linfo_array[bid]); - } - } // ---------------------- // free allocated storage From 915c972a18a194b8747f16322b3136e26ec2ca94 Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Mon, 17 Jun 2024 16:19:52 -0400 Subject: [PATCH 09/35] rename bdsqr_host.hpp to rocauxiliary_bdsqr_host.hpp --- library/src/auxiliary/rocauxiliary_bdsqr.hpp | 2 +- .../auxiliary/{bdsqr_host.hpp => rocauxiliary_bdsqr_host.hpp} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename library/src/auxiliary/{bdsqr_host.hpp => rocauxiliary_bdsqr_host.hpp} (100%) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr.hpp b/library/src/auxiliary/rocauxiliary_bdsqr.hpp index 5718f50e0..937a52f1a 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr.hpp @@ -38,7 +38,7 @@ #include -#include "bdsqr_host.hpp" +#include "rocauxiliary_bdsqr_host.hpp" ROCSOLVER_BEGIN_NAMESPACE diff --git a/library/src/auxiliary/bdsqr_host.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp similarity index 100% rename from library/src/auxiliary/bdsqr_host.hpp rename to library/src/auxiliary/rocauxiliary_bdsqr_host.hpp From d8b24bec0e6b93fc2a825a971dee7cf422baaa3e Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Tue, 18 Jun 2024 10:50:39 -0400 Subject: [PATCH 10/35] use more cache by using more thread blocks --- library/src/auxiliary/rocauxiliary_bdsqr_host.hpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp index 0ec3436b1..99a5340be 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp @@ -74,12 +74,13 @@ __global__ static void lasr_kernel(char const side, auto max = [](auto x, auto y) { return ((x > y) ? x : y); }; auto min = [](auto x, auto y) { return ((x < y) ? 
x : y); }; - auto indx2f = [](auto i, auto j, auto lda) -> int64_t { + auto indx2f = [](auto i, auto j, auto lda) { assert((1 <= i)); assert((1 <= lda)); assert((1 <= j)); - return ((i - 1) + (j - 1) * int64_t(lda)); + // return ((i - 1) + (j - 1) * int64_t(lda)); + return (i + j * lda - (1 + lda)); }; auto indx1f = [](auto i) -> int64_t { @@ -529,7 +530,7 @@ static void lasr_template_gpu(char const side, I const lda, hipStream_t stream = 0) { - auto const nthreads = 2 * warpSize; + auto const nthreads = warpSize; bool const is_left_side = (side == 'L') || (side == 'l'); auto const mn = (is_left_side) ? n : m; From f8cf1bd3b6bb95e8d4e8449473ea70d9b7f63fe4 Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Tue, 18 Jun 2024 12:07:15 -0400 Subject: [PATCH 11/35] development snapshot --- .../src/auxiliary/rocauxiliary_bdsqr_host.hpp | 300 ++++++++++++++---- 1 file changed, 246 insertions(+), 54 deletions(-) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp index 99a5340be..d144b31a8 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp @@ -1286,10 +1286,10 @@ static void bdsqr_single_template(char uplo, S* work_, I& info, - S* dwork = nullptr, + S* dwork_ = nullptr, hipStream_t stream = 0) { - bool const use_gpu = (dwork != nullptr); + bool const use_gpu = (dwork_ != nullptr); S const zero = 0; S const one = 1; @@ -1345,14 +1345,25 @@ static void bdsqr_single_template(char uplo, auto call_scal_gpu = [=](I n, auto da, T& x, I incx) { scal_template(n, da, &x, incx, stream); }; + auto call_lasr_gpu_nocopy = [=](char const side, char const pivot, char const direct, I const m, + I const n, S& dc, S& ds, T& A, I const lda, hipStream_t stream) { + bool const is_left_side = (side == 'L') || (side == 'l'); + auto const mn = (is_left_side) ? m : n; + auto const mn_m1 = (mn - 1); + + lasr_template_gpu(side, pivot, direct, m, n, &dc, &ds, &A, lda, stream); + }; + auto call_lasr_gpu = [=](char const side, char const pivot, char const direct, I const m, I const n, S& c, - S& s, T& A, I const lda, S* const dwork, hipStream_t stream) { + S& s, T& A, I const lda, S* const dwork_, hipStream_t stream) { bool const is_left_side = (side == 'L') || (side == 'l'); auto const mn = (is_left_side) ? 
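// A hedged sketch, not part of the patch: the folded indx2f above relies on
// the identity (i - 1) + (j - 1) * lda == i + j * lda - (1 + lda), so both
// forms address the same element.  The original promoted the column term to
// int64_t; a version of the folded form that keeps the 64-bit arithmetic
// (the function name is illustrative):
static inline int64_t indx2f_wide(int64_t i, int64_t j, int64_t lda)
{
    // same element as (i - 1) + (j - 1) * lda, computed entirely in 64 bits
    return i + j * lda - (1 + lda);
}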
m : n; auto const mn_m1 = (mn - 1); - S* const dc = dwork; - S* const ds = dwork + mn_m1; + S* const dc = dwork_; + S* const ds = dwork_ + mn_m1; + CHECK_HIP(hipStreamSynchronize(stream)); + CHECK_HIP(hipMemcpyAsync(dc, &c, sizeof(S) * mn_m1, hipMemcpyHostToDevice, stream)); CHECK_HIP(hipMemcpyAsync(ds, &s, sizeof(S) * mn_m1, hipMemcpyHostToDevice, stream)); @@ -1378,6 +1389,7 @@ static void bdsqr_single_template(char uplo, return (e_[i - 1]); }; auto work = [=](auto i) -> S& { return (work_[i - 1]); }; + auto dwork = [=](auto i) -> S& { return (dwork_[i - 1]); }; auto c = [=](auto i, auto j) -> T& { assert((1 <= i) && (i <= nrc) && (nrc <= ldc)); @@ -1435,6 +1447,10 @@ static void bdsqr_single_template(char uplo, if(n == 0) return; + + bool const need_update_singular_vectors = (nru > 0) || (ncc > 0); + bool constexpr use_lasr_gpu_nocopy = true; + if(n == 1) goto L160; /* @@ -1484,9 +1500,32 @@ static void bdsqr_single_template(char uplo, } L10: - /* - * update singular vectors if desired - */ + // ---------------------------------- + // update singular vectors if desired + // ---------------------------------- + CHECK_HIP(hipStreamSynchronize(stream)); + + if(need_update_singular_vectors) + { + // -------------- + // copy rotations + // -------------- + size_t const nbytes = sizeof(S) * (n - 1); + hipMemcpyKind const kind = hipMemcpyHostToDevice; + + { + void* const src = (void*)&(work(1)); + void* const dst = (void*)&(dwork(1)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(n)); + void* const dst = (void*)&(dwork(n)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + } + CHECK_HIP(hipStreamSynchronize(stream)); if(nru > 0) { @@ -1496,8 +1535,16 @@ static void bdsqr_single_template(char uplo, char direct = 'F'; if(use_gpu) { - call_lasr_gpu(side, pivot, direct, nru, n, work(1), work(n), u(1, 1), ldu, dwork, - stream); + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, nru, n, dwork(1), dwork(n), u(1, 1), + ldu, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, nru, n, work(1), work(n), u(1, 1), ldu, + dwork_, stream); + } } else { @@ -1512,8 +1559,16 @@ static void bdsqr_single_template(char uplo, char direct = 'F'; if(use_gpu) { - call_lasr_gpu(side, pivot, direct, n, ncc, work(1), work(n), c(1, 1), ldc, dwork, - stream); + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, n, ncc, dwork(1), dwork(n), c(1, 1), + ldc, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, n, ncc, work(1), work(n), c(1, 1), ldc, + dwork_, stream); + } } else { @@ -1876,9 +1931,54 @@ static void bdsqr_single_template(char uplo, h = d(m) * cs; d(m) = h * oldcs; e(m - 1) = h * oldsn; - /* - * update singular vectors - */ + // + // ----------------------- + // update singular vectors + // ----------------------- + + CHECK_HIP(hipStreamSynchronize(stream)); + + if(rotate) + { + // -------------- + // copy rotations + // -------------- + size_t const nbytes = sizeof(S) * (n - 1); + hipMemcpyKind const kind = hipMemcpyHostToDevice; + + if(ncvt > 0) + { + { + void* const src = (void*)&(work(1)); + void* const dst = (void*)&(dwork(1)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(n)); + void* const dst = (void*)&(dwork(n)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + } + + if((nru > 0) || (ncc > 0)) + { + { + void* const src = (void*)&(work(nm12)); + void* const dst = (void*)&(dwork(nm12)); + 
CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(nm13)); + void* const dst = (void*)&(dwork(nm13)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + } + } + + CHECK_HIP(hipStreamSynchronize(stream)); + if(ncvt > 0) { // call_lasr( 'l', 'v', 'f', m-ll+1, ncvt, work( 1 ), work( n ), vt( @@ -1889,8 +1989,16 @@ static void bdsqr_single_template(char uplo, auto mm = m - ll + 1; if(use_gpu) { - call_lasr_gpu(side, pivot, direct, mm, ncvt, work(1), work(n), vt(ll, 1), ldvt, - dwork, stream); + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, mm, ncvt, dwork(1), dwork(n), + vt(ll, 1), ldvt, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, mm, ncvt, work(1), work(n), vt(ll, 1), + ldvt, dwork_, stream); + } } else { @@ -1907,8 +2015,16 @@ static void bdsqr_single_template(char uplo, auto mm = m - ll + 1; if(use_gpu) { - call_lasr_gpu(side, pivot, direct, nru, mm, work(nm12 + 1), work(nm13 + 1), - u(1, ll), ldu, dwork, stream); + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, nru, mm, dwork(nm12 + 1), + dwork(nm13 + 1), u(1, ll), ldu, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, nru, mm, work(nm12 + 1), work(nm13 + 1), + u(1, ll), ldu, dwork_, stream); + } } else { @@ -1926,8 +2042,16 @@ static void bdsqr_single_template(char uplo, auto mm = m - ll + 1; if(use_gpu) { - call_lasr_gpu(side, pivot, direct, mm, ncc, work(nm12 + 1), work(nm13 + 1), - c(ll, 1), ldc, dwork, stream); + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, mm, ncc, dwork(nm12 + 1), + dwork(nm13 + 1), c(ll, 1), ldc, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, mm, ncc, work(nm12 + 1), work(nm13 + 1), + c(ll, 1), ldc, dwork_, stream); + } } else { @@ -1935,9 +2059,9 @@ static void bdsqr_single_template(char uplo, c(ll, 1), ldc); } } - /* - * test convergence - */ + // + // test convergence + // if(abs(e(m - 1)) <= thresh) e(m - 1) = zero; } @@ -1971,9 +2095,53 @@ static void bdsqr_single_template(char uplo, h = d(ll) * cs; d(ll) = h * oldcs; e(ll) = h * oldsn; - /* - * update singular vectors - */ + // + // update singular vectors + // + + CHECK_HIP(hipStreamSynchronize(stream)); + + if(rotate) + { + // -------------- + // copy rotations + // -------------- + size_t const nbytes = sizeof(S) * (n - 1); + hipMemcpyKind const kind = hipMemcpyHostToDevice; + + if((nru > 0) || (ncc > 0)) + { + { + void* const src = (void*)&(work(1)); + void* const dst = (void*)&(dwork(1)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(n)); + void* const dst = (void*)&(dwork(n)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + } + + if(ncvt > 0) + { + { + void* const src = (void*)&(work(nm12)); + void* const dst = (void*)&(dwork(nm12)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(nm13)); + void* const dst = (void*)&(dwork(nm13)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + } + } + + CHECK_HIP(hipStreamSynchronize(stream)); + if(ncvt > 0) { // call_lasr( 'l', 'v', 'b', m-ll+1, ncvt, work( nm12+1 ), work( @@ -1985,8 +2153,16 @@ static void bdsqr_single_template(char uplo, auto mm = m - ll + 1; if(use_gpu) { - call_lasr_gpu(side, pivot, direct, mm, ncvt, work(nm12 + 1), work(nm13 + 1), - vt(ll, 1), ldvt, dwork, stream); + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, mm, ncvt, dwork(nm12 + 
1), + dwork(nm13 + 1), vt(ll, 1), ldvt, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, mm, ncvt, work(nm12 + 1), work(nm13 + 1), + vt(ll, 1), ldvt, dwork_, stream); + } } else { @@ -2005,8 +2181,16 @@ static void bdsqr_single_template(char uplo, auto mm = m - ll + 1; if(use_gpu) { - call_lasr_gpu(side, pivot, direct, nru, mm, work(1), work(n), u(1, ll), ldu, - dwork, stream); + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, nru, mm, dwork(1), dwork(n), + u(1, ll), ldu, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, nru, mm, work(1), work(n), u(1, ll), ldu, + dwork_, stream); + } } else { @@ -2024,32 +2208,40 @@ static void bdsqr_single_template(char uplo, auto mm = m - ll + 1; if(use_gpu) { - call_lasr_gpu(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), ldc, - dwork, stream); + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, mm, ncc, dwork(1), dwork(n), + c(ll, 1), ldc, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), ldc, + dwork_, stream); + } } else { call_lasr(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), ldc); } } - /* - * test convergence - */ + // + // test convergence + // if(abs(e(ll)) <= thresh) e(ll) = zero; } } else { - /* - * use nonzero shift - */ + // + // use nonzero shift + // if(idir == 1) { - /* - * chase bulge from top to bottom - * save cosines and sines for later singular vector updates - */ + // + // chase bulge from top to bottom + // save cosines and sines for later singular vector updates + // f = (abs(d(ll)) - shift) * (sign(one, d(ll)) + shift / d(ll)); g = e(ll); // do 140 i = ll, m - 1 @@ -2092,7 +2284,7 @@ static void bdsqr_single_template(char uplo, if(use_gpu) { call_lasr_gpu(side, pivot, direct, mm, ncvt, work(1), work(n), vt(ll, 1), ldvt, - dwork, stream); + dwork_, stream); } else { @@ -2111,7 +2303,7 @@ static void bdsqr_single_template(char uplo, if(use_gpu) { call_lasr_gpu(side, pivot, direct, nru, mm, work(nm12 + 1), work(nm13 + 1), - u(1, ll), ldu, dwork, stream); + u(1, ll), ldu, dwork_, stream); } else { @@ -2130,7 +2322,7 @@ static void bdsqr_single_template(char uplo, if(use_gpu) { call_lasr_gpu(side, pivot, direct, mm, ncc, work(nm12 + 1), work(nm13 + 1), - c(ll, 1), ldc, dwork, stream); + c(ll, 1), ldc, dwork_, stream); } else { @@ -2198,7 +2390,7 @@ static void bdsqr_single_template(char uplo, if(use_gpu) { call_lasr_gpu(side, pivot, direct, mm, ncvt, work(nm12 + 1), work(nm13 + 1), - vt(ll, 1), ldvt, dwork, stream); + vt(ll, 1), ldvt, dwork_, stream); } else { @@ -2218,7 +2410,7 @@ static void bdsqr_single_template(char uplo, if(use_gpu) { call_lasr_gpu(side, pivot, direct, nru, mm, work(1), work(n), u(1, ll), ldu, - dwork, stream); + dwork_, stream); } else { @@ -2237,7 +2429,7 @@ static void bdsqr_single_template(char uplo, if(use_gpu) { call_lasr_gpu(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), ldc, - dwork, stream); + dwork_, stream); } else { @@ -2461,8 +2653,8 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, HIP_CHECK(hipStreamSynchronize(stream)); - S* dwork = nullptr; - HIP_CHECK(hipMalloc(&dwork, sizeof(S) * (4 * n))); + S* dwork_ = nullptr; + HIP_CHECK(hipMalloc(&dwork_, sizeof(S) * (4 * n))); for(I bid = 0; bid < batch_count; bid++) { @@ -2530,7 +2722,7 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, vt_, ldvt, u_, ldu, c_, ldc, - work_, info, dwork, stream); + work_, info, dwork_, stream); } if(info == 0) @@ -2615,8 
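// A hedged sketch, not part of the patch: the call_lasr_gpu path above stages
// the host-side cosine/sine rotation arrays into the device scratch buffer
// dwork_ and then launches the GPU LASR kernel with device pointers only.
// The essential pattern, with an illustrative helper name (CHECK_HIP is the
// macro defined earlier in this header):
template <typename S>
static void stage_rotations_to_device(const S* host_c, const S* host_s,
                                      S* dev_c, S* dev_s, size_t count,
                                      hipStream_t stream)
{
    size_t const nbytes = sizeof(S) * count;
    CHECK_HIP(hipMemcpyAsync(dev_c, host_c, nbytes, hipMemcpyHostToDevice, stream));
    CHECK_HIP(hipMemcpyAsync(dev_s, host_s, nbytes, hipMemcpyHostToDevice, stream));
    // the copies and the subsequent kernel launch are ordered on the same
    // stream, so the kernel sees the staged rotations without an extra sync
}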
+2807,8 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, HIP_CHECK(hipHostFree(hE)); hE = nullptr; - HIP_CHECK(hipFree(dwork)); - dwork = nullptr; + HIP_CHECK(hipFree(dwork_)); + dwork_ = nullptr; HIP_CHECK(hipHostFree(linfo_array)); linfo_array = nullptr; From 5ddbbf85c124f48849c8267b44d987604a3076ea Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Tue, 18 Jun 2024 12:15:35 -0400 Subject: [PATCH 12/35] development snapshot --- .../src/auxiliary/rocauxiliary_bdsqr_host.hpp | 86 +++++++++++++++++-- 1 file changed, 77 insertions(+), 9 deletions(-) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp index d144b31a8..48f12dc8c 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp @@ -2270,9 +2270,53 @@ static void bdsqr_single_template(char uplo, } L140: e(m - 1) = f; - /* - * update singular vectors - */ + // + // update singular vectors + // + + CHECK_HIP(hipStreamSynchronize(stream)); + + if(rotate) + { + // -------------- + // copy rotations + // -------------- + size_t const nbytes = sizeof(S) * (n - 1); + hipMemcpyKind const kind = hipMemcpyHostToDevice; + + if(ncvt > 0) + { + { + void* const src = (void*)&(work(1)); + void* const dst = (void*)&(dwork(1)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(n)); + void* const dst = (void*)&(dwork(n)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + } + + if((nru > 0) || (ncc > 0)) + { + { + void* const src = (void*)&(work(nm12)); + void* const dst = (void*)&(dwork(nm12)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(nm13)); + void* const dst = (void*)&(dwork(nm13)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + } + } + + CHECK_HIP(hipStreamSynchronize(stream)); + if(ncvt > 0) { // call_lasr( 'l', 'v', 'f', m-ll+1, ncvt, work( 1 ), work( n ), vt( @@ -2283,8 +2327,16 @@ static void bdsqr_single_template(char uplo, auto mm = m - ll + 1; if(use_gpu) { - call_lasr_gpu(side, pivot, direct, mm, ncvt, work(1), work(n), vt(ll, 1), ldvt, - dwork_, stream); + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, mm, ncvt, dwork(1), dwork(n), + vt(ll, 1), ldvt, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, mm, ncvt, work(1), work(n), vt(ll, 1), + ldvt, dwork_, stream); + } } else { @@ -2302,8 +2354,16 @@ static void bdsqr_single_template(char uplo, auto mm = m - ll + 1; if(use_gpu) { - call_lasr_gpu(side, pivot, direct, nru, mm, work(nm12 + 1), work(nm13 + 1), - u(1, ll), ldu, dwork_, stream); + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, nru, mm, dwork(nm12 + 1), + dwork(nm13 + 1), u(1, ll), ldu, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, nru, mm, work(nm12 + 1), work(nm13 + 1), + u(1, ll), ldu, dwork_, stream); + } } else { @@ -2321,8 +2381,16 @@ static void bdsqr_single_template(char uplo, auto mm = m - ll + 1; if(use_gpu) { - call_lasr_gpu(side, pivot, direct, mm, ncc, work(nm12 + 1), work(nm13 + 1), - c(ll, 1), ldc, dwork_, stream); + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, mm, ncc, dwork(nm12 + 1), + dwork(nm13 + 1), c(ll, 1), ldc, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, mm, ncc, work(nm12 + 1), work(nm13 + 1), + c(ll, 1), ldc, dwork_, stream); + } } else { From 129c3a6e78adfaac69be99f57049f7d9c8437d7e 
Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Tue, 18 Jun 2024 13:45:23 -0400 Subject: [PATCH 13/35] development snapshot --- .../src/auxiliary/rocauxiliary_bdsqr_host.hpp | 364 +++++++++++------- 1 file changed, 224 insertions(+), 140 deletions(-) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp index 48f12dc8c..316834dbd 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp @@ -1503,29 +1503,33 @@ static void bdsqr_single_template(char uplo, // ---------------------------------- // update singular vectors if desired // ---------------------------------- - CHECK_HIP(hipStreamSynchronize(stream)); - if(need_update_singular_vectors) + if(use_lasr_gpu_nocopy) { - // -------------- - // copy rotations - // -------------- - size_t const nbytes = sizeof(S) * (n - 1); - hipMemcpyKind const kind = hipMemcpyHostToDevice; + CHECK_HIP(hipStreamSynchronize(stream)); + if(need_update_singular_vectors) { - void* const src = (void*)&(work(1)); - void* const dst = (void*)&(dwork(1)); - CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); - } + // -------------- + // copy rotations + // -------------- + size_t const nbytes = sizeof(S) * (n - 1); + hipMemcpyKind const kind = hipMemcpyHostToDevice; - { - void* const src = (void*)&(work(n)); - void* const dst = (void*)&(dwork(n)); - CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + { + void* const src = (void*)&(work(1)); + void* const dst = (void*)&(dwork(1)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(n)); + void* const dst = (void*)&(dwork(n)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } } + CHECK_HIP(hipStreamSynchronize(stream)); } - CHECK_HIP(hipStreamSynchronize(stream)); if(nru > 0) { @@ -1936,48 +1940,51 @@ static void bdsqr_single_template(char uplo, // update singular vectors // ----------------------- - CHECK_HIP(hipStreamSynchronize(stream)); - - if(rotate) + if(use_lasr_gpu_nocopy) { - // -------------- - // copy rotations - // -------------- - size_t const nbytes = sizeof(S) * (n - 1); - hipMemcpyKind const kind = hipMemcpyHostToDevice; + CHECK_HIP(hipStreamSynchronize(stream)); - if(ncvt > 0) + if(rotate) { - { - void* const src = (void*)&(work(1)); - void* const dst = (void*)&(dwork(1)); - CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); - } + // -------------- + // copy rotations + // -------------- + size_t const nbytes = sizeof(S) * (n - 1); + hipMemcpyKind const kind = hipMemcpyHostToDevice; + if(ncvt > 0) { - void* const src = (void*)&(work(n)); - void* const dst = (void*)&(dwork(n)); - CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + { + void* const src = (void*)&(work(1)); + void* const dst = (void*)&(dwork(1)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(n)); + void* const dst = (void*)&(dwork(n)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } } - } - if((nru > 0) || (ncc > 0)) - { + if((nru > 0) || (ncc > 0)) { - void* const src = (void*)&(work(nm12)); - void* const dst = (void*)&(dwork(nm12)); - CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); - } - - { - void* const src = (void*)&(work(nm13)); - void* const dst = (void*)&(dwork(nm13)); - CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + { + void* const src = (void*)&(work(nm12)); + void* const dst = (void*)&(dwork(nm12)); + 
CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(nm13)); + void* const dst = (void*)&(dwork(nm13)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } } } - } - CHECK_HIP(hipStreamSynchronize(stream)); + CHECK_HIP(hipStreamSynchronize(stream)); + } if(ncvt > 0) { @@ -2099,48 +2106,51 @@ static void bdsqr_single_template(char uplo, // update singular vectors // - CHECK_HIP(hipStreamSynchronize(stream)); - - if(rotate) + if(use_lasr_gpu_nocopy) { - // -------------- - // copy rotations - // -------------- - size_t const nbytes = sizeof(S) * (n - 1); - hipMemcpyKind const kind = hipMemcpyHostToDevice; + CHECK_HIP(hipStreamSynchronize(stream)); - if((nru > 0) || (ncc > 0)) + if(rotate) { - { - void* const src = (void*)&(work(1)); - void* const dst = (void*)&(dwork(1)); - CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); - } - - { - void* const src = (void*)&(work(n)); - void* const dst = (void*)&(dwork(n)); - CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); - } - } + // -------------- + // copy rotations + // -------------- + size_t const nbytes = sizeof(S) * (n - 1); + hipMemcpyKind const kind = hipMemcpyHostToDevice; - if(ncvt > 0) - { + if((nru > 0) || (ncc > 0)) { - void* const src = (void*)&(work(nm12)); - void* const dst = (void*)&(dwork(nm12)); - CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + { + void* const src = (void*)&(work(1)); + void* const dst = (void*)&(dwork(1)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(n)); + void* const dst = (void*)&(dwork(n)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } } + if(ncvt > 0) { - void* const src = (void*)&(work(nm13)); - void* const dst = (void*)&(dwork(nm13)); - CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + { + void* const src = (void*)&(work(nm12)); + void* const dst = (void*)&(dwork(nm12)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(nm13)); + void* const dst = (void*)&(dwork(nm13)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } } } - } - CHECK_HIP(hipStreamSynchronize(stream)); + CHECK_HIP(hipStreamSynchronize(stream)); + } if(ncvt > 0) { @@ -2274,48 +2284,51 @@ static void bdsqr_single_template(char uplo, // update singular vectors // - CHECK_HIP(hipStreamSynchronize(stream)); - - if(rotate) + if(use_lasr_gpu_nocopy) { - // -------------- - // copy rotations - // -------------- - size_t const nbytes = sizeof(S) * (n - 1); - hipMemcpyKind const kind = hipMemcpyHostToDevice; + CHECK_HIP(hipStreamSynchronize(stream)); - if(ncvt > 0) + if(rotate) { - { - void* const src = (void*)&(work(1)); - void* const dst = (void*)&(dwork(1)); - CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); - } + // -------------- + // copy rotations + // -------------- + size_t const nbytes = sizeof(S) * (n - 1); + hipMemcpyKind const kind = hipMemcpyHostToDevice; + if(ncvt > 0) { - void* const src = (void*)&(work(n)); - void* const dst = (void*)&(dwork(n)); - CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + { + void* const src = (void*)&(work(1)); + void* const dst = (void*)&(dwork(1)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(n)); + void* const dst = (void*)&(dwork(n)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } } - } - if((nru > 0) || (ncc > 0)) - { + if((nru > 0) || (ncc > 0)) { - void* 
const src = (void*)&(work(nm12)); - void* const dst = (void*)&(dwork(nm12)); - CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); - } - - { - void* const src = (void*)&(work(nm13)); - void* const dst = (void*)&(dwork(nm13)); - CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + { + void* const src = (void*)&(work(nm12)); + void* const dst = (void*)&(dwork(nm12)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(nm13)); + void* const dst = (void*)&(dwork(nm13)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } } } - } - CHECK_HIP(hipStreamSynchronize(stream)); + CHECK_HIP(hipStreamSynchronize(stream)); + } if(ncvt > 0) { @@ -2438,14 +2451,61 @@ static void bdsqr_single_template(char uplo, } L150: e(ll) = f; - /* - * test convergence - */ + // + // test convergence + // if(abs(e(ll)) <= thresh) e(ll) = zero; - /* - * update singular vectors if desired - */ + // + // update singular vectors if desired + // + + if(use_lasr_gpu_nocopy) + { + CHECK_HIP(hipStreamSynchronize(stream)); + + if(rotate) + { + // -------------- + // copy rotations + // -------------- + size_t const nbytes = sizeof(S) * (n - 1); + hipMemcpyKind const kind = hipMemcpyHostToDevice; + + if((nru > 0) || (ncc > 0)) + { + { + void* const src = (void*)&(work(1)); + void* const dst = (void*)&(dwork(1)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(n)); + void* const dst = (void*)&(dwork(n)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + } + + if(ncvt > 0) + { + { + void* const src = (void*)&(work(nm12)); + void* const dst = (void*)&(dwork(nm12)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + + { + void* const src = (void*)&(work(nm13)); + void* const dst = (void*)&(dwork(nm13)); + CHECK_HIP(hipMemcpyAsync(dst, src, nbytes, kind, stream)); + } + } + } + + CHECK_HIP(hipStreamSynchronize(stream)); + } + if(ncvt > 0) { // call_lasr( 'l', 'v', 'b', m-ll+1, ncvt, work( nm12+1 ), work( @@ -2457,8 +2517,16 @@ static void bdsqr_single_template(char uplo, auto mm = m - ll + 1; if(use_gpu) { - call_lasr_gpu(side, pivot, direct, mm, ncvt, work(nm12 + 1), work(nm13 + 1), - vt(ll, 1), ldvt, dwork_, stream); + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, mm, ncvt, dwork(nm12 + 1), + dwork(nm13 + 1), vt(ll, 1), ldvt, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, mm, ncvt, work(nm12 + 1), work(nm13 + 1), + vt(ll, 1), ldvt, dwork_, stream); + } } else { @@ -2477,8 +2545,16 @@ static void bdsqr_single_template(char uplo, auto mm = m - ll + 1; if(use_gpu) { - call_lasr_gpu(side, pivot, direct, nru, mm, work(1), work(n), u(1, ll), ldu, - dwork_, stream); + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, nru, mm, dwork(1), dwork(n), + u(1, ll), ldu, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, nru, mm, work(1), work(n), u(1, ll), ldu, + dwork_, stream); + } } else { @@ -2496,8 +2572,16 @@ static void bdsqr_single_template(char uplo, auto mm = m - ll + 1; if(use_gpu) { - call_lasr_gpu(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), ldc, - dwork_, stream); + if(use_lasr_gpu_nocopy) + { + call_lasr_gpu_nocopy(side, pivot, direct, mm, ncc, dwork(1), dwork(n), + c(ll, 1), ldc, stream); + } + else + { + call_lasr_gpu(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), ldc, + dwork_, stream); + } } else { @@ -2521,9 +2605,9 @@ static void bdsqr_single_template(char uplo, 
if(d(i) < zero) { d(i) = -d(i); - /* - * change sign of singular vectors, if desired - */ + // + // change sign of singular vectors, if desired + // if(ncvt > 0) { if(use_gpu) @@ -2538,16 +2622,16 @@ static void bdsqr_single_template(char uplo, } } L170: - /* - * sort the singular values into decreasing order (insertion sort on - * singular values, but only one transposition per singular vector) - */ + // + // sort the singular values into decreasing order (insertion sort on + // singular values, but only one transposition per singular vector) + // // do 190 i = 1, n - 1 for(i = 1; i <= (n - 1); i++) { - /* - * scan for smallest d(i) - */ + // + // scan for smallest d(i) + // isub = 1; smin = d(1); // do 180 j = 2, n + 1 - i @@ -2562,9 +2646,9 @@ static void bdsqr_single_template(char uplo, L180: if(isub != n + 1 - i) { - /* - * swap singular values and vectors - */ + // + // swap singular values and vectors + // d(isub) = d(n + 1 - i); d(n + 1 - i) = smin; if(ncvt > 0) @@ -2604,9 +2688,9 @@ static void bdsqr_single_template(char uplo, } L190: goto L220; -/* - * maximum number of iterations exceeded, failure to converge - */ +// +// maximum number of iterations exceeded, failure to converge +// L200: info = 0; // do 210 i = 1, n - 1 @@ -2618,9 +2702,9 @@ static void bdsqr_single_template(char uplo, L210: L220: return; - /* - * end of dbdsqr - */ + // + // end of dbdsqr + // } template From f02b735e5c77370004ddcd7fcc4f9c23f2f3eb41 Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Fri, 21 Jun 2024 11:57:09 -0400 Subject: [PATCH 14/35] handle batch case with array of pointers on device --- .../src/auxiliary/rocauxiliary_bdsqr_host.hpp | 177 +++++++++++++----- 1 file changed, 132 insertions(+), 45 deletions(-) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp index 316834dbd..34a21a212 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp @@ -1290,6 +1290,7 @@ static void bdsqr_single_template(char uplo, hipStream_t stream = 0) { bool const use_gpu = (dwork_ != nullptr); + bool constexpr need_sort = false; S const zero = 0; S const one = 1; @@ -1449,7 +1450,7 @@ static void bdsqr_single_template(char uplo, return; bool const need_update_singular_vectors = (nru > 0) || (ncc > 0); - bool constexpr use_lasr_gpu_nocopy = true; + bool constexpr use_lasr_gpu_nocopy = false; if(n == 1) goto L160; @@ -2590,6 +2591,7 @@ static void bdsqr_single_template(char uplo, } } } + CHECK_HIP(hipStreamSynchronize(stream)); /* * qr iteration finished, go back and check convergence */ @@ -2627,61 +2629,64 @@ static void bdsqr_single_template(char uplo, // singular values, but only one transposition per singular vector) // // do 190 i = 1, n - 1 - for(i = 1; i <= (n - 1); i++) + if(need_sort) { - // - // scan for smallest d(i) - // - isub = 1; - smin = d(1); - // do 180 j = 2, n + 1 - i - for(j = 2; j <= (n + 1 - i); j++) - { - if(d(j) <= smin) - { - isub = j; - smin = d(j); - } - } - L180: - if(isub != n + 1 - i) + for(i = 1; i <= (n - 1); i++) { // - // swap singular values and vectors + // scan for smallest d(i) // - d(isub) = d(n + 1 - i); - d(n + 1 - i) = smin; - if(ncvt > 0) + isub = 1; + smin = d(1); + // do 180 j = 2, n + 1 - i + for(j = 2; j <= (n + 1 - i); j++) { - if(use_gpu) - { - call_swap_gpu(ncvt, vt(isub, 1), ldvt, vt(n + 1 - i, 1), ldvt); - } - else + if(d(j) <= smin) { - call_swap(ncvt, vt(isub, 1), ldvt, vt(n + 1 - i, 1), ldvt); + isub = j; + smin = d(j); } } - 
if(nru > 0) + L180: + if(isub != n + 1 - i) { - if(use_gpu) - { - call_swap_gpu(nru, u(1, isub), ione, u(1, n + 1 - i), ione); - } - else + // + // swap singular values and vectors + // + d(isub) = d(n + 1 - i); + d(n + 1 - i) = smin; + if(ncvt > 0) { - call_swap(nru, u(1, isub), ione, u(1, n + 1 - i), ione); + if(use_gpu) + { + call_swap_gpu(ncvt, vt(isub, 1), ldvt, vt(n + 1 - i, 1), ldvt); + } + else + { + call_swap(ncvt, vt(isub, 1), ldvt, vt(n + 1 - i, 1), ldvt); + } } - } - if(ncc > 0) - { - if(use_gpu) + if(nru > 0) { - call_swap_gpu(ncc, c(isub, 1), ldc, c(n + 1 - i, 1), ldc); + if(use_gpu) + { + call_swap_gpu(nru, u(1, isub), ione, u(1, n + 1 - i), ione); + } + else + { + call_swap(nru, u(1, isub), ione, u(1, n + 1 - i), ione); + } } - else + if(ncc > 0) { - call_swap(ncc, c(isub, 1), ldc, c(n + 1 - i, 1), ldc); + if(use_gpu) + { + call_swap_gpu(ncc, c(isub, 1), ldc, c(n + 1 - i, 1), ldc); + } + else + { + call_swap(ncc, c(isub, 1), ldc, c(n + 1 - i, 1), ldc); + } } } } @@ -2718,15 +2723,15 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, const rocblas_stride strideD, S* E, const rocblas_stride strideE, - W1 V, + W1 V_arg, const I shiftV, const I ldv, const rocblas_stride strideV, - W2 U, + W2 U_arg, const I shiftU, const I ldu, const rocblas_stride strideU, - W3 C, + W3 C_arg, const I shiftC, const I ldc, const rocblas_stride strideC, @@ -2742,6 +2747,88 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, hipStream_t stream; rocblas_get_stream(handle, &stream); + W1 V = V_arg; + W2 U = U_arg; + W3 C = C_arg; + + auto is_device_pointer = [](void* ptr) -> bool { + hipPointerAttribute_t dev_attributes; + if(ptr == nullptr) + { + return (false); + } + + auto istat = hipPointerGetAttributes(&dev_attributes, ptr); + if(istat != hipSuccess) + { + std::cout << "is_device_pointer: istat = " << istat << " " << hipGetErrorName(istat) + << std::endl; + } + assert(istat == hipSuccess); + return (dev_attributes.type == hipMemoryTypeDevice); + }; + + // --------------------------------------------------- + // handle batch case with array of pointers on device + // --------------------------------------------------- + std::vector Vp_array(batch_count); + std::vector Up_array(batch_count); + std::vector Cp_array(batch_count); + + if(nv > 0) + { + bool const is_device_V_arg = is_device_pointer((void*)V_arg); + if(is_device_V_arg) + { + bool constexpr need_copy_W1 = !std::is_same::value; + if constexpr(need_copy_W1) + { + size_t const nbytes = sizeof(T*) * batch_count; + void* const dst = (void*)&(Vp_array[0]); + void* const src = (void*)V_arg; + HIP_CHECK(hipMemcpyAsync(dst, src, nbytes, hipMemcpyDefault, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + V = &(Vp_array[0]); + } + } + } + + if(nu > 0) + { + bool const is_device_U_arg = is_device_pointer((void*)U_arg); + if(is_device_U_arg) + { + bool constexpr need_copy_W2 = !std::is_same::value; + if constexpr(need_copy_W2) + { + size_t const nbytes = sizeof(T*) * batch_count; + void* const dst = (void*)&(Up_array[0]); + void* const src = (void*)U_arg; + HIP_CHECK(hipMemcpyAsync(dst, src, nbytes, hipMemcpyDefault, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + U = &(Up_array[0]); + } + } + } + + if(nc > 0) + { + bool const is_device_C_arg = is_device_pointer((void*)C_arg); + if(is_device_C_arg) + { + bool constexpr need_copy_W3 = !std::is_same::value; + if constexpr(need_copy_W3) + { + size_t const nbytes = sizeof(T*) * batch_count; + void* const dst = (void*)&(Cp_array[0]); + void* 
const src = (void*)C_arg; + HIP_CHECK(hipMemcpyAsync(dst, src, nbytes, hipMemcpyDefault, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); + C = &(Cp_array[0]); + } + } + } + S* hD = nullptr; S* hE = nullptr; From 7e2dc7e7e3f8fe0b43c96cd72e494e8f04ead428 Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Sat, 22 Jun 2024 10:21:11 -0400 Subject: [PATCH 15/35] minor update --- .../src/auxiliary/rocauxiliary_bdsqr_host.hpp | 36 +++++++++++++++---- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp index 34a21a212..7438bfe9d 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp @@ -1290,8 +1290,21 @@ static void bdsqr_single_template(char uplo, hipStream_t stream = 0) { bool const use_gpu = (dwork_ != nullptr); + + // ----------------------------------- + // Lapack code used O(n^2) algorithm for sorting + // Consider turning off this and rely on + // bdsqr_sort() to perform sorting + // ----------------------------------- bool constexpr need_sort = false; + // --------------------------------------------------- + // NOTE: lasq1 may return non-zero info value that + // has a different meaning + // Consider turning off lasq1 to have consistent info value + // --------------------------------------------------- + bool constexpr use_lasq1 = false; + S const zero = 0; S const one = 1; S negone = -1; @@ -1372,7 +1385,7 @@ static void bdsqr_single_template(char uplo, CHECK_HIP(hipStreamSynchronize(stream)); }; - auto abs = [](auto x) { return ((x >= 0) ? x : (-x)); }; + auto abs = [](auto x) { return (std::abs(x)); }; auto indx2f = [](auto i, auto j, auto ld) -> int64_t { assert((1 <= i) && (i <= ld)); @@ -1457,7 +1470,7 @@ static void bdsqr_single_template(char uplo, /* * if no singular vectors desired, use qd algorithm */ - if(!rotate) + if((!rotate) && (use_lasq1)) { call_lasq1(n, d(1), e(1), work(1), info); /* @@ -1640,7 +1653,7 @@ static void bdsqr_single_template(char uplo, L50: sminoa = sminoa / sqrt(dble(n)); - thresh = max(tol * sminoa, maxitr * (n * (n * unfl))); + thresh = max(tol * sminoa, ((unfl * n) * n) * maxitr); } else { @@ -1648,7 +1661,7 @@ static void bdsqr_single_template(char uplo, * absolute accuracy desired */ - thresh = max(abs(tol) * smax, maxitr * (n * (n * unfl))); + thresh = max(abs(tol) * smax, ((unfl * n) * n) * maxitr); } /* * prepare for main iteration loop for the singular values @@ -2780,7 +2793,12 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, bool const is_device_V_arg = is_device_pointer((void*)V_arg); if(is_device_V_arg) { - bool constexpr need_copy_W1 = !std::is_same::value; + // ------------------------------------------------------------ + // note "T *" and "T * const" may be considered different types + // ------------------------------------------------------------ + bool constexpr is_array_of_device_pointers + = !(std::is_same::value || std::is_same::value); + bool constexpr need_copy_W1 = is_array_of_device_pointers; if constexpr(need_copy_W1) { size_t const nbytes = sizeof(T*) * batch_count; @@ -2798,7 +2816,9 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, bool const is_device_U_arg = is_device_pointer((void*)U_arg); if(is_device_U_arg) { - bool constexpr need_copy_W2 = !std::is_same::value; + bool constexpr is_array_of_device_pointers + = !(std::is_same::value || std::is_same::value); + bool constexpr 
need_copy_W2 = is_array_of_device_pointers; if constexpr(need_copy_W2) { size_t const nbytes = sizeof(T*) * batch_count; @@ -2816,7 +2836,9 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, bool const is_device_C_arg = is_device_pointer((void*)C_arg); if(is_device_C_arg) { - bool constexpr need_copy_W3 = !std::is_same::value; + bool constexpr is_array_of_device_pointers + = !(std::is_same::value || std::is_same::value); + bool constexpr need_copy_W3 = is_array_of_device_pointers; if constexpr(need_copy_W3) { size_t const nbytes = sizeof(T*) * batch_count; From 67dabb17b1ee29a33f5ed2feab8ab526bab7d1a3 Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Tue, 25 Jun 2024 21:00:20 -0400 Subject: [PATCH 16/35] option to turn off lapack bdsqr --- library/src/auxiliary/rocauxiliary_bdsqr_host.hpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp index 7438bfe9d..81fa0fd47 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp @@ -2953,7 +2953,9 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, I nrv = n; I ncvt = nv; bool const values_only = (ncvt == 0) && (nru == 0) && (ncc == 0); - if(values_only) + bool const use_lapack_bdsqr = false; + + if((use_lapack_bdsqr) && (values_only)) { // -------------------------------- // call the lapack version of bdsqr @@ -2990,6 +2992,7 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, { // ---------------------------- // explicitly zero out "E" array + // to be compatible with rocsolver bdsqr // ---------------------------- S const zero = S(0); for(I i = 0; i < (n - 1); i++) From 764eeb031bcfa33d17bc29ecb6e9347c1b6bc897 Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Thu, 27 Jun 2024 11:35:54 -0400 Subject: [PATCH 17/35] use c++ version of lamch --- .../src/auxiliary/rocauxiliary_bdsqr_host.hpp | 44 ++++++++++++++----- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp index 81fa0fd47..3b2d19469 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp @@ -52,16 +52,20 @@ ROCSOLVER_BEGIN_NAMESPACE } #endif +#ifndef LASR_MAX_NTHREADS +#define LASR_MAX_NTHREADS 64 +#endif + template -__global__ static void lasr_kernel(char const side, - char const pivot, - char const direct, - I const m, - I const n, - S const* const c_, - S const* const s_, - T* const A_, - I const lda) +__global__ static void __launch_bounds__(LASR_MAX_NTHREADS) lasr_kernel(char const side, + char const pivot, + char const direct, + I const m, + I const n, + S const* const c_, + S const* const s_, + T* const A_, + I const lda) { const auto nblocks = hipGridDim_x; const auto nthreads_per_block = hipBlockDim_x; @@ -530,7 +534,7 @@ static void lasr_template_gpu(char const side, I const lda, hipStream_t stream = 0) { - auto const nthreads = warpSize; + auto const nthreads = LASR_MAX_NTHREADS; bool const is_left_side = (side == 'L') || (side == 'l'); auto const mn = (is_left_side) ? 
n : m; @@ -967,6 +971,7 @@ static void call_bdsqr(char& uplo, dbdsqr_(&uplo, &n, &ncvt, &nru, &ncc, &d, &e, &vt, &ldvt, &u, &ldu, &c, &ldc, &rwork, &info); } +#ifdef USE_LAPACK static void call_lamch(char& cmach_arg, double& eps) { char cmach = cmach_arg; @@ -978,6 +983,23 @@ static void call_lamch(char& cmach_arg, float& eps) char cmach = cmach_arg; eps = slamch_(&cmach); } +#else + +static void call_lamch(char& cmach, double& eps) +{ + eps = ((cmach == 'E') || (cmach == 'e')) ? std::numeric_limits::epsilon() + : ((cmach == 'S') || (cmach == 's')) ? std::numeric_limits::min() + : std::numeric_limits::min(); +} + +static void call_lamch(char& cmach, float& eps) +{ + eps = ((cmach == 'E') || (cmach == 'e')) ? std::numeric_limits::epsilon() + : ((cmach == 'S') || (cmach == 's')) ? std::numeric_limits::min() + : std::numeric_limits::min(); +} + +#endif static void call_swap(int& n, rocblas_complex_num& zx, @@ -3081,3 +3103,5 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, } ROCSOLVER_END_NAMESPACE +#undef LASR_MAX_NTHREADS +#undef CHECK_HIP From 369c15d2f05d3e2285cbf8b8390fe2da1f64e0c1 Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Thu, 27 Jun 2024 15:30:09 -0400 Subject: [PATCH 18/35] c++ version of las2 --- .../src/auxiliary/rocauxiliary_bdsqr_host.hpp | 340 ++++++++++++++++++ 1 file changed, 340 insertions(+) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp index 3b2d19469..db62622db 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp @@ -1039,6 +1039,7 @@ static void call_swap(int& n, double& zx, int& incx, double& zy, int& incy) dswap_(&n, &zx, &incx, &zy, &incy); } +#ifdef USE_LAPACK static void call_las2(double& f, double& g, double& h, double& ssmin, double& ssmax) { dlas2_(&f, &g, &h, &ssmin, &ssmax); @@ -1048,6 +1049,80 @@ static void call_las2(float& f, float& g, float& h, float& ssmin, float& ssmax) { slas2_(&f, &g, &h, &ssmin, &ssmax); } +#else + +template +static void call_las2(T& f, T& g, T& h, T& ssmin, T& ssmax) +{ + T const zero = 0; + T const one = 1; + T const two = 2; + + T as, at, au, c, fa, fhmn, fhmx, ga, ha; + + auto abs = [](auto x) { return (std::abs(x)); }; + auto min = [](auto x, auto y) { return ((x < y) ? x : y); }; + auto max = [](auto x, auto y) { return ((x > y) ? 
x : y); }; + auto sqrt = [](auto x) { return (std::sqrt(x)); }; + auto square = [](auto x) { return (x * x); }; + + fa = abs(f); + ga = abs(g); + ha = abs(h); + fhmn = min(fa, ha); + fhmx = max(fa, ha); + if(fhmn == zero) + { + ssmin = zero; + if(fhmx == zero) + { + ssmax = ga; + } + else + { + // ssmax = max( fhmx, ga )*sqrt( one+ ( min( fhmx, ga ) / max( fhmx, ga ) )**2 ); + ssmax = max(fhmx, ga) * sqrt(one + square(min(fhmx, ga) / max(fhmx, ga))); + } + } + else + { + if(ga < fhmx) + { + as = one + fhmn / fhmx; + at = (fhmx - fhmn) / fhmx; + au = square(ga / fhmx); + c = two / (sqrt(as * as + au) + sqrt(at * at + au)); + ssmin = fhmn * c; + ssmax = fhmx / c; + } + else + { + au = fhmx / ga; + if(au == zero) + { + // + // avoid possible harmful underflow if exponent range + // asymmetric (true ssmin may not underflow even if + // au underflows) + // + ssmin = (fhmn * fhmx) / ga; + ssmax = ga; + } + else + { + as = one + fhmn / fhmx; + at = (fhmx - fhmn) / fhmx; + // c = one / ( sqrt( one+( as*au )**2 )+ sqrt( one+( at*au )**2 ) ); + c = one / (sqrt(one + square(as * au)) + sqrt(one + square(at * au))); + ssmin = (fhmn * c) * au; + ssmin = ssmin + ssmin; + ssmax = ga / (c + c); + } + } + } +} + +#endif static void call_lartg(double& f, double& g, double& c, double& s, double& r) { @@ -1077,6 +1152,7 @@ static void call_lartg(std::complex& f, zlartg_(&f, &g, &c, &s, &r); } +#ifdef USE_LAPACK static void call_scal(int& n, rocblas_complex_num& da, rocblas_complex_num& zx, int& incx) { cscal_(&n, (std::complex*)&da, (std::complex*)&zx, &incx); @@ -1127,7 +1203,29 @@ static void call_scal(int& n, float& da, float& zx, int& incx) { sscal_(&n, &da, &zx, &incx); } +#else +template +static void call_scal(I& n, S& a, T& x_in, I& incx) +{ + bool const is_zero = (a == 0); + T* const x = &x_in; + for(I i = 0; i < n; i++) + { + auto const ip = i * incx; + if(is_zero) + { + x[ip] = 0; + } + else + { + x[ip] *= a; + } + }; +} +#endif + +#ifdef USE_LAPACK static void call_rot(int& n, std::complex& zx, int& incx, @@ -1181,7 +1279,28 @@ static void call_rot(int& n, float& dx, int& incx, float& dy, int& incy, float& { srot_(&n, &dx, &incx, &dy, &incy, &c, &s); } +#else + +template +static void call_rot(I& n, T& x_in, I& incx, T& y_in, I& incy, S& c, S& s) +{ + T* const x = &(x_in); + T* const y = &(y_in); + + for(I i = 0; i < n; i++) + { + auto const ix = i * incx; + auto const iy = i * incy; + + auto const temp = c * x[ix] + s * y[iy]; + y[iy] = c * y[iy] - s * x[ix]; + x[ix] = temp; + } +} + +#endif +#ifdef USE_LAPACK static void call_lasv2(double& f, double& g, double& h, @@ -1207,6 +1326,227 @@ static void call_lasv2(float& f, { slasv2_(&f, &g, &h, &ssmin, &ssmax, &snr, &csr, &snl, &csl); } +#else +// -------------------------------------------------------- +// lasv2 computes the singular value decomposition of a 2 x 2 +// triangular matrix +// [ F G ] +// [ 0 H ] +// +// on return, +// abs(ssmax) is the larger singular value, +// abs(ssmin) is the smaller singular value, +// (csl,snl) and (csr,snr) are the left and right +// singular vectors for abs(ssmax) +// +// [ csl snl] [ F G ] [ csr -snr] = [ ssmax 0 ] +// [-snl csl] [ 0 H ] [ snr csr] [ 0 ssmin ] +// -------------------------------------------------------- +template +static void call_lasv2(T& f, T& g, T& h, T& ssmin, T& ssmax, T& snr, T& csr, T& snl, T& csl) +{ + T const zero = 0; + T const one = 1; + T const two = 2; + T const four = 4; + T const half = one / two; + + bool gasmal; + bool swap; + int pmax; + char cmach; + + T a, clt, crt, d, fa, 
ft, ga, gt, ha, ht, l, m; + T mm, r, s, slt, srt, t, temp, tsign, tt; + T macheps; + + auto abs = [](auto x) { return (std::abs(x)); }; + auto sqrt = [](auto x) { return (std::sqrt(x)); }; + auto sign = [](auto a, auto b) { + auto const abs_a = std::abs(a); + return ((b >= 0) ? abs_a : -abs_a); + }; + + ft = f; + fa = abs(ft); + ht = h; + ha = abs(h); + // + // pmax points to the maximum absolute element of matrix + // pmax = 1 if f largest in absolute values + // pmax = 2 if g largest in absolute values + // pmax = 3 if h largest in absolute values + // + pmax = 1; + swap = (ha > fa); + if(swap) + { + pmax = 3; + temp = ft; + ft = ht; + ht = temp; + temp = fa; + fa = ha; + ha = temp; + // + // now fa >= ha + // + } + gt = g; + ga = abs(gt); + if(ga == zero) + { + // + // diagonal matrix + // + ssmin = ha; + ssmax = fa; + clt = one; + crt = one; + slt = zero; + srt = zero; + } + else + { + gasmal = true; + if(ga > fa) + { + pmax = 2; + + cmach = 'E'; + call_lamch(cmach, macheps); + + if((fa / ga) < macheps) + { + // + // case of very large ga + // + gasmal = false; + ssmax = ga; + if(ha > one) + { + ssmin = fa / (ga / ha); + } + else + { + ssmin = (fa / ga) * ha; + } + clt = one; + slt = ht / gt; + srt = one; + crt = ft / gt; + } + } + if(gasmal) + { + // + // normal case + // + d = fa - ha; + if(d == fa) + { + // + // copes with infinite f or h + // + l = one; + } + else + { + l = d / fa; + } + // + // note that 0 <= l <= 1 + // + m = gt / ft; + // + // note that abs(m) <= 1/macheps + // + t = two - l; + // + // note that t >= 1 + // + mm = m * m; + tt = t * t; + s = sqrt(tt + mm); + // + // note that 1 <= s <= 1 + 1/macheps + // + if(l == zero) + { + r = abs(m); + } + else + { + r = sqrt(l * l + mm); + } + // + // note that 0 <= r .le. 1 + 1/macheps + // + a = half * (s + r); + // + // note that 1 <= a .le. 
1 + abs(m) + // + ssmin = ha / a; + ssmax = fa * a; + if(mm == zero) + { + // + // note that m is very tiny + // + if(l == zero) + { + t = sign(two, ft) * sign(one, gt); + } + else + { + t = gt / sign(d, ft) + m / t; + } + } + else + { + t = (m / (s + t) + m / (r + l)) * (one + a); + } + l = sqrt(t * t + four); + crt = two / l; + srt = t / l; + clt = (crt + srt * m) / a; + slt = (ht / ft) * srt / a; + } + } + if(swap) + { + csl = srt; + snl = crt; + csr = slt; + snr = clt; + } + else + { + csl = clt; + snl = slt; + csr = crt; + snr = srt; + } + // + // correct signs of ssmax and ssmin + // + if(pmax == 1) + { + tsign = sign(one, csr) * sign(one, csl) * sign(one, f); + } + if(pmax == 2) + { + tsign = sign(one, snr) * sign(one, csl) * sign(one, g); + } + if(pmax == 3) + { + tsign = sign(one, snr) * sign(one, snl) * sign(one, h); + } + ssmax = sign(ssmax, tsign); + ssmin = sign(ssmin, tsign * sign(one, f) * sign(one, h)); +} + +#endif static void call_lasq1(int& n, double& D_, double& E_, double& rwork_, int& info_arg) { From a220550a8608b2fb0bdb244c61e2f437a57a25b9 Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Thu, 27 Jun 2024 15:37:16 -0400 Subject: [PATCH 19/35] c++ version of swap --- .../src/auxiliary/rocauxiliary_bdsqr_host.hpp | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp index db62622db..5f18fbc1e 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp @@ -1001,6 +1001,7 @@ static void call_lamch(char& cmach, float& eps) #endif +#ifdef USE_LAPACK static void call_swap(int& n, rocblas_complex_num& zx, int& incx, @@ -1038,6 +1039,25 @@ static void call_swap(int& n, double& zx, int& incx, double& zy, int& incy) { dswap_(&n, &zx, &incx, &zy, &incy); } +#else + +template +static void call_swap(I& n, T& x_in, I& incx, T& y_in, I& incy) +{ + T* const x = &(x_in); + T* const y = &(y_in); + for(I i = 0; i < n; i++) + { + I const ix = i * incx; + I const iy = i * incy; + + T const temp = x[ix]; + x[ix] = y[iy]; + y[iy] = temp; + } +} + +#endif #ifdef USE_LAPACK static void call_las2(double& f, double& g, double& h, double& ssmin, double& ssmax) From cc701a7fcf767415ff004720aa9799d5a2aa4713 Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Thu, 27 Jun 2024 21:52:53 -0400 Subject: [PATCH 20/35] c++ version of lartg --- .../src/auxiliary/rocauxiliary_bdsqr_host.hpp | 284 ++++++++++++++++++ 1 file changed, 284 insertions(+) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp index 5f18fbc1e..ab60e3789 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp @@ -989,6 +989,7 @@ static void call_lamch(char& cmach, double& eps) { eps = ((cmach == 'E') || (cmach == 'e')) ? std::numeric_limits::epsilon() : ((cmach == 'S') || (cmach == 's')) ? std::numeric_limits::min() + : ((cmach == 'B') || (cmach == 's')) ? FLT_RADIX : std::numeric_limits::min(); } @@ -996,6 +997,7 @@ static void call_lamch(char& cmach, float& eps) { eps = ((cmach == 'E') || (cmach == 'e')) ? std::numeric_limits::epsilon() : ((cmach == 'S') || (cmach == 's')) ? std::numeric_limits::min() + : ((cmach == 'B') || (cmach == 's')) ? 
FLT_RADIX : std::numeric_limits::min(); } @@ -1144,6 +1146,8 @@ static void call_las2(T& f, T& g, T& h, T& ssmin, T& ssmax) #endif +#ifdef USE_LAPACK + static void call_lartg(double& f, double& g, double& c, double& s, double& r) { dlartg_(&f, &g, &c, &s, &r); @@ -1171,6 +1175,286 @@ static void call_lartg(std::complex& f, { zlartg_(&f, &g, &c, &s, &r); } +#else + +static float real_part(float z) +{ + return (z); +}; +static float real_part(std::complex z) +{ + return (z.real()); +}; +static float real_part(rocblas_complex_num z) +{ + return (z.real()); +}; + +static double real_part(double z) +{ + return (z); +}; +static double real_part(std::complex z) +{ + return (z.real()); +}; +static double real_part(rocblas_complex_num z) +{ + return (z.real()); +}; + +static float imag_part(float z) +{ + return (0); +}; +static float imag_part(std::complex z) +{ + return (z.imag()); +}; +static float imag_part(rocblas_complex_num z) +{ + return (z.imag()); +}; + +static double imag_part(double z) +{ + return (0); +}; +static double imag_part(std::complex z) +{ + return (z.imag()); +}; +static double imag_part(rocblas_complex_num z) +{ + return (z.imag()); +}; + +template +static void call_lartg(T& f, T& g, S& cs, T& sn, T& r) +{ + // ------------------------------------------------------ + // lartg generates a plane rotation so that + // [ cs sn ] * [ f ] = [ r ] + // [ -sn cs ] [ g ] [ 0 ] + // + // where cs * cs + abs(sn)*abs(sn) == 1 + // if g == 0, then cs == 1, sn == 0 + // if f == 0, then cs = 0, sn is chosen so that r is real + // ------------------------------------------------------ + + auto Not = [](bool x) { return (!x); }; + auto abs = [](auto x) { return (std::abs(x)); }; + auto dble = [](auto z) { return (static_cast(real_part(z))); }; + auto dimag = [](auto z) { return (static_cast(imag_part(z))); }; + auto log = [](auto x) { return (std::log(x)); }; + auto sqrt = [](auto x) { return (std::sqrt(x)); }; + auto max = [](auto x, auto y) { return ((x > y) ? 
x : y); }; + auto disnan = [](auto x) -> bool { return (isnan(x)); }; + auto dcmplx = [](auto x, auto y) -> T { + bool constexpr is_complex_type + = !(std::is_same::value || std::is_same::value); + + if constexpr(is_complex_type) + { + return (T(x, y)); + } + else + { + return (T(x)); + }; + }; + auto dconjg = [&](auto z) { return (dcmplx(dble(z), -dimag(z))); }; + + auto square = [](auto x) { return (x * x); }; + + auto abs1 = [&](auto ff) { return (max(abs(dble(ff)), abs(dimag(ff)))); }; + auto abssq = [&](auto ff) { return (square(dble(ff)) + square(dimag(ff))); }; + + // ----------------------------------------- + // compute sqrt( x * x + y * y ) + // without unnecessary overflow or underflow + // ----------------------------------------- + auto dlapy2 = [&](auto x, auto y) { + auto const one = 1; + auto const zero = 0; + + auto ddlapy2 = x; + bool const x_is_nan = disnan(x); + bool const y_is_nan = disnan(y); + if(x_is_nan) + ddlapy2 = x; + if(y_is_nan) + ddlapy2 = y; + + if(Not(x_is_nan || y_is_nan)) + { + auto const xabs = abs(x); + auto const yabs = abs(y); + auto const w = max(xabs, yabs); + auto const z = min(xabs, yabs); + if(z == zero) + { + ddlapy2 = w; + } + else + { + ddlapy2 = w * sqrt(one + square(z / w)); + } + } + return (ddlapy2); + }; + + char cmach = 'E'; + S const zero = 0; + S const one = 1; + S const two = 2; + T const czero = 0; + + bool has_work; + bool first; + int count, i; + S d, di, dr, eps, f2, f2s, g2, g2s, safmin; + S safmn2, safmx2, scale; + T ff, fs, gs; + + // safmin = dlamch( 's' ) + cmach = 'S'; + call_lamch(cmach, safmin); + + // eps = dlamch( 'e' ) + cmach = 'E'; + call_lamch(cmach, eps); + + // safmn2 = dlamch( 'b' )**int( log( safmin / eps ) / log( dlamch( 'b' ) ) / two ) + cmach = 'B'; + S radix = 2; + call_lamch(cmach, radix); + + int const npow = (log(safmin / eps) / log(radix) / two); + safmn2 = std::pow(radix, npow); + safmx2 = one / safmn2; + scale = max(abs1(f), abs1(g)); + fs = f; + gs = g; + count = 0; + + if(scale >= safmx2) + { + L10: + do + { + count = count + 1; + fs = fs * safmn2; + gs = gs * safmn2; + scale = scale * safmn2; + // if( (scale >= safmx2) && (count < 20) ) go to L10 + has_work = ((scale >= safmx2) && (count < 20)); + } while(has_work); + } + else + { + if(scale <= safmn2) + { + if((g == czero) || disnan(abs(g))) + { + cs = one; + sn = czero; + r = f; + return; + } + L20: + do + { + count = count - 1; + fs = fs * safmx2; + gs = gs * safmx2; + scale = scale * safmx2; + // if( scale <= safmn2 ) goto L20; + has_work = (scale <= safmn2); + } while(has_work); + } + f2 = abssq(fs); + g2 = abssq(gs); + if(f2 <= max(g2, one) * safmin) + { + // + // this is a rare case: f is very small. + // + if(f == czero) + { + cs = zero; + r = dlapy2(dble(g), dimag(g)); + // do complex/real division explicitly with two real divisions + d = dlapy2(dble(gs), dimag(gs)); + sn = dcmplx(dble(gs) / d, -dimag(gs) / d); + return; + } + f2s = dlapy2(dble(fs), dimag(fs)); + // g2 and g2s are accurate + // g2 is at least safmin, and g2s is at least safmn2 + g2s = sqrt(g2); + // error in cs from underflow in f2s is at most + // unfl / safmn2 < sqrt(unfl*eps) .lt. 
eps + // if max(g2,one)=g2, then f2 < g2*safmin, + // and so cs < sqrt(safmin) + // if max(g2,one)=one, then f2 < safmin + // and so cs < sqrt(safmin)/safmn2 = sqrt(eps) + // therefore, cs = f2s/g2s / sqrt( 1 + (f2s/g2s)**2 ) = f2s/g2s + cs = f2s / g2s; + // make sure abs(ff) = 1 + // do complex/real division explicitly with 2 real divisions + if(abs1(f) > one) + { + d = dlapy2(dble(f), dimag(f)); + ff = dcmplx(dble(f) / d, dimag(f) / d); + } + else + { + dr = safmx2 * dble(f); + di = safmx2 * dimag(f); + d = dlapy2(dr, di); + ff = dcmplx(dr / d, di / d); + } + sn = ff * dcmplx(dble(gs) / g2s, -dimag(gs) / g2s); + r = cs * f + sn * g; + } + else + { + // + // this is the most common case. + // neither f2 nor f2/g2 are less than safmin + // f2s cannot overflow, and it is accurate + // + f2s = sqrt(one + g2 / f2); + // do the f2s(real)*fs(complex) multiply with two real multiplies + r = dcmplx(f2s * dble(fs), f2s * dimag(fs)); + cs = one / f2s; + d = f2 + g2; + // do complex/real division explicitly with two real divisions + sn = dcmplx(dble(r) / d, dimag(r) / d); + sn = sn * dconjg(gs); + if(count != 0) + { + if(count > 0) + { + for(i = 1; i <= count; i++) + { + r = r * safmx2; + }; + } + else + { + for(i = 1; i <= -count; i++) + { + r = r * safmn2; + } + } + } + } + } +} + +#endif #ifdef USE_LAPACK static void call_scal(int& n, rocblas_complex_num& da, rocblas_complex_num& zx, int& incx) From 9fb6ceb3fba36931318d3a48d7e1d8dbe0ef42c2 Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Thu, 27 Jun 2024 23:46:28 -0400 Subject: [PATCH 21/35] c++ version of lasr --- .../src/auxiliary/rocauxiliary_bdsqr_host.hpp | 60 +++++++++++++------ 1 file changed, 43 insertions(+), 17 deletions(-) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp index ab60e3789..5244534f4 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp @@ -57,24 +57,18 @@ ROCSOLVER_BEGIN_NAMESPACE #endif template -__global__ static void __launch_bounds__(LASR_MAX_NTHREADS) lasr_kernel(char const side, - char const pivot, - char const direct, - I const m, - I const n, - S const* const c_, - S const* const s_, - T* const A_, - I const lda) +__host__ __device__ static void lasr_body(char const side, + char const pivot, + char const direct, + I const m, + I const n, + S const* const c_, + S const* const s_, + T* const A_, + I const lda, + I const tid, + I const i_inc) { - const auto nblocks = hipGridDim_x; - const auto nthreads_per_block = hipBlockDim_x; - const auto nthreads = nblocks * nthreads_per_block; - const auto tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; - const auto i_inc = nthreads; - const auto ij_nb = nthreads; - const auto ij_start = tid; - auto max = [](auto x, auto y) { return ((x > y) ? x : y); }; auto min = [](auto x, auto y) { return ((x < y) ? 
x : y); }; @@ -522,6 +516,26 @@ __global__ static void __launch_bounds__(LASR_MAX_NTHREADS) lasr_kernel(char con return; } +template +__global__ static void __launch_bounds__(LASR_MAX_NTHREADS) lasr_kernel(char const side, + char const pivot, + char const direct, + I const m, + I const n, + S const* const c_, + S const* const s_, + T* const A_, + I const lda) +{ + const auto nblocks = hipGridDim_x; + const auto nthreads_per_block = hipBlockDim_x; + const auto nthreads = nblocks * nthreads_per_block; + I const tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + I const i_inc = nthreads; + + lasr_body(side, pivot, direct, m, n, c_, s_, A_, lda, tid, i_inc); +} + template static void lasr_template_gpu(char const side, char const pivot, @@ -1862,6 +1876,7 @@ static void call_lasq1(int& n, float& D_, float& E_, float& rwork_, int& info_ar slasq1_(&n, &D_, &E_, &rwork_, &info_arg); }; +#ifdef USE_LAPACK static void call_lasr(char& side, char& pivot, char& direct, @@ -1932,6 +1947,17 @@ static void call_lasr(char& side, { dlasr_(&side, &pivot, &direct, &m, &n, &c, &s, &A, &lda); }; +#else +template +static void call_lasr(char& side, char& pivot, char& direct, I& m, I& n, S& c, S& s, T& A, I& lda) +{ + I const tid = 0; + I const i_inc = 1; + + lasr_body(side, pivot, direct, m, n, &c, &s, &A, lda, tid, i_inc); +}; + +#endif template static void bdsqr_single_template(char uplo, From c9a802de1258d9165673926b20f593db23c951f4 Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Fri, 28 Jun 2024 16:20:28 -0400 Subject: [PATCH 22/35] add restrict hint to lasr --- library/src/auxiliary/rocauxiliary_bdsqr_host.hpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp index 5244534f4..f94deb344 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp @@ -62,9 +62,9 @@ __host__ __device__ static void lasr_body(char const side, char const direct, I const m, I const n, - S const* const c_, - S const* const s_, - T* const A_, + S const* const __restrict__ c_, + S const* const __restrict__ s_, + T* const __restrict__ A_, I const lda, I const tid, I const i_inc) @@ -77,17 +77,16 @@ __host__ __device__ static void lasr_body(char const side, assert((1 <= lda)); assert((1 <= j)); - // return ((i - 1) + (j - 1) * int64_t(lda)); return (i + j * lda - (1 + lda)); }; auto indx1f = [](auto i) -> int64_t { assert((1 <= i)); - return (i - int64_t(1)); + return (i - (1)); }; - auto c = [&](auto i) -> const S& { return (c_[indx1f(i)]); }; - auto s = [&](auto i) -> const S& { return (s_[indx1f(i)]); }; + auto c = [&](auto i) -> const S { return (c_[(i)-1]); }; + auto s = [&](auto i) -> const S { return (s_[(i)-1]); }; auto A = [&](auto i, auto j) -> T& { return (A_[indx2f(i, j, lda)]); }; const S one = 1; From 7ce4cc51dce8278aa85e35a9f861335a19d2ec4c Mon Sep 17 00:00:00 2001 From: Eduardo D'Azevedo Date: Fri, 28 Jun 2024 23:15:08 -0400 Subject: [PATCH 23/35] minor change --- .../src/auxiliary/rocauxiliary_bdsqr_host.hpp | 68 ++++++++++++++----- 1 file changed, 52 insertions(+), 16 deletions(-) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp index f94deb344..3bf801519 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp @@ -92,6 +92,8 @@ __host__ __device__ static void lasr_body(char const side, const S 
one = 1; const S zero = 0; + constexpr bool use_reorder = true; + // ---------------- // check arguments // ---------------- @@ -141,21 +143,38 @@ __host__ __device__ static void lasr_body(char const side, // P = P(z-1) * ... * P(2) * P(1) // ----------------------------- { - for(I j = 1; j <= (m - 1); j++) + if constexpr(use_reorder) { - const auto ctemp = c(j); - const auto stemp = s(j); - if((ctemp != one) || (stemp != zero)) + for(I i = 1 + tid; i <= n; i += i_inc) { - for(I i = 1 + tid; i <= n; i += i_inc) + for(I j = 1; j <= (m - 1); j++) { + const auto ctemp = c(j); + const auto stemp = s(j); const auto temp = A(j + 1, i); A(j + 1, i) = ctemp * temp - stemp * A(j, i); A(j, i) = stemp * temp + ctemp * A(j, i); } - }; - }; - }; + } + } + else + { + for(I j = 1; j <= (m - 1); j++) + { + const auto ctemp = c(j); + const auto stemp = s(j); + if((ctemp != one) || (stemp != zero)) + { + for(I i = 1 + tid; i <= n; i += i_inc) + { + const auto temp = A(j + 1, i); + A(j + 1, i) = ctemp * temp - stemp * A(j, i); + A(j, i) = stemp * temp + ctemp * A(j, i); + } + } + } + } + } return; }; @@ -173,20 +192,37 @@ __host__ __device__ static void lasr_body(char const side, auto const istart = 1; auto const iend = n; - for(I j = jend; j >= jstart; j--) + if constexpr(use_reorder) { - const auto ctemp = c(j); - const auto stemp = s(j); - if((ctemp != one) || (stemp != zero)) + for(I i = istart + tid; i <= iend; i += i_inc) { - for(I i = istart + tid; i <= iend; i += i_inc) + for(I j = jend; j >= jstart; j--) { + const auto ctemp = c(j); + const auto stemp = s(j); const auto temp = A(j + 1, i); A(j + 1, i) = ctemp * temp - stemp * A(j, i); A(j, i) = stemp * temp + ctemp * A(j, i); - }; - }; - }; + } + } + } + else + { + for(I j = jend; j >= jstart; j--) + { + const auto ctemp = c(j); + const auto stemp = s(j); + if((ctemp != one) || (stemp != zero)) + { + for(I i = istart + tid; i <= iend; i += i_inc) + { + const auto temp = A(j + 1, i); + A(j + 1, i) = ctemp * temp - stemp * A(j, i); + A(j, i) = stemp * temp + ctemp * A(j, i); + } + } + } + } return; }; From d66f64110ebf7dfb6b2aa949f3a9b158175868b8 Mon Sep 17 00:00:00 2001 From: Troy Alderson <58866654+tfalders@users.noreply.github.com> Date: Tue, 5 Nov 2024 09:11:46 -0700 Subject: [PATCH 24/35] Added algorithm selection infrastructure (#12) * Remove LAPACK calls * Added basic hybrid algorithm infrastructure * Allow selection of algorithm mode per function * Fix test failure * Added alg_mode argument to benchmark client * Added rocsolver_function_gesvd * Added basic checksum to rocsolver_handle_data * Updated documentation * Updated changelog * Renamed bdsqr_host to bdsqr_hybrid * Bump minimum rocBLAS version --- CHANGELOG.md | 9 + CMakeLists.txt | 2 +- clients/benchmarks/client.cpp | 7 + clients/common/auxiliary/testing_bdsqr.hpp | 5 + clients/common/lapack/testing_gesvd.hpp | 5 + clients/common/misc/rocsolver_arguments.hpp | 2 + clients/gtest/auxiliary/bdsqr_gtest.cpp | 40 +- clients/gtest/lapack/gesvd_gtest.cpp | 83 ++- docs/index.rst | 4 +- docs/reference/{logging.rst => helpers.rst} | 61 +- docs/reference/index.rst | 8 +- docs/reference/types.rst | 8 + library/include/rocsolver/rocsolver-aliases.h | 4 +- .../include/rocsolver/rocsolver-extra-types.h | 19 +- .../include/rocsolver/rocsolver-functions.h | 67 +- library/src/CMakeLists.txt | 1 + library/src/auxiliary/rocauxiliary_bdsqr.hpp | 127 ++-- ...host.hpp => rocauxiliary_bdsqr_hybrid.hpp} | 672 +----------------- library/src/common/rocsolver_handle.cpp | 132 ++++ 
library/src/include/rocsolver_handle.hpp | 44 ++ 20 files changed, 531 insertions(+), 769 deletions(-) rename docs/reference/{logging.rst => helpers.rst} (79%) rename library/src/auxiliary/{rocauxiliary_bdsqr_host.hpp => rocauxiliary_bdsqr_hybrid.hpp} (83%) create mode 100644 library/src/common/rocsolver_handle.cpp create mode 100644 library/src/include/rocsolver_handle.hpp diff --git a/CHANGELOG.md b/CHANGELOG.md index 013478027..117cc3e8f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,16 @@ Full documentation for rocSOLVER is available at the [rocSOLVER documentation](https://rocm.docs.amd.com/projects/rocSOLVER/en/latest/index.html). ## (Unreleased) rocSOLVER + ### Added + +* Algorithm selection APIs for hybrid computation +* SVD of bidiagonal matrices routine: + - BDSQR now supports hybrid computation +* SVD of general matrices routine: + - GESVD now supports hybrid computation + +### Optimized ### Changed ### Removed ### Optimized diff --git a/CMakeLists.txt b/CMakeLists.txt index 0c4ff627e..9c436d304 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -197,7 +197,7 @@ find_package(hip REQUIRED CONFIG PATHS ${ROCM_PATH} /opt/rocm) find_package(rocblas REQUIRED CONFIG PATHS ${ROCM_PATH}) get_imported_target_location(location roc::rocblas) message(STATUS "Found rocBLAS: ${location}") -set(rocblas_minimum 4.1) +set(rocblas_minimum 4.4) rocm_package_add_dependencies(SHARED_DEPENDS "rocblas >= ${rocblas_minimum}") rocm_package_add_rpm_dependencies(STATIC_DEPENDS "rocblas-static-devel >= ${rocblas_minimum}") rocm_package_add_deb_dependencies(STATIC_DEPENDS "rocblas-static-dev >= ${rocblas_minimum}") diff --git a/clients/benchmarks/client.cpp b/clients/benchmarks/client.cpp index d9e28a9c2..981b86017 100644 --- a/clients/benchmarks/client.cpp +++ b/clients/benchmarks/client.cpp @@ -125,6 +125,13 @@ try " Reported time will be the average.\n" " ") + ("alg_mode", + value(&argus.alg_mode)->default_value(0), + "Select different algorithm mode? 0 = GPU-only, 1 = Hybrid\n" + " This will change how the algorithm operates.\n" + " Only applicable to functions with hybrid support." + " ") + ("mem_query", value(&argus.mem_query)->default_value(0), "Calculate the required amount of device workspace memory? 
0 = No, 1 = Yes.\n" diff --git a/clients/common/auxiliary/testing_bdsqr.hpp b/clients/common/auxiliary/testing_bdsqr.hpp index 2f2a984f4..5946bb93a 100644 --- a/clients/common/auxiliary/testing_bdsqr.hpp +++ b/clients/common/auxiliary/testing_bdsqr.hpp @@ -478,6 +478,11 @@ void testing_bdsqr(Arguments& argus) rocblas_fill uplo = char2rocblas_fill(uploC); rocblas_int hot_calls = argus.iters; + if(argus.alg_mode) + EXPECT_ROCBLAS_STATUS( + rocsolver_set_alg_mode(handle, rocsolver_function_bdsqr, rocsolver_alg_mode_hybrid), + rocblas_status_success); + // check non-supported values if(uplo != rocblas_fill_upper && uplo != rocblas_fill_lower) { diff --git a/clients/common/lapack/testing_gesvd.hpp b/clients/common/lapack/testing_gesvd.hpp index 2fbf49dcd..8f5745826 100644 --- a/clients/common/lapack/testing_gesvd.hpp +++ b/clients/common/lapack/testing_gesvd.hpp @@ -548,6 +548,11 @@ void testing_gesvd(Arguments& argus) rocblas_int bc = argus.batch_count; rocblas_int hot_calls = argus.iters; + if(argus.alg_mode) + EXPECT_ROCBLAS_STATUS( + rocsolver_set_alg_mode(handle, rocsolver_function_gesvd, rocsolver_alg_mode_hybrid), + rocblas_status_success); + // check non-supported values if(rightv == rocblas_svect_overwrite && leftv == rocblas_svect_overwrite) { diff --git a/clients/common/misc/rocsolver_arguments.hpp b/clients/common/misc/rocsolver_arguments.hpp index 34006f51a..bca8535b6 100644 --- a/clients/common/misc/rocsolver_arguments.hpp +++ b/clients/common/misc/rocsolver_arguments.hpp @@ -33,6 +33,7 @@ #include #include #include +#include #include "program_options.hpp" @@ -54,6 +55,7 @@ class Arguments : private std::map rocblas_int perf = 0; rocblas_int singular = 0; rocblas_int iters = 5; + rocblas_int alg_mode = 0; rocblas_int mem_query = 0; rocblas_int profile = 0; rocblas_int profile_kernels = 0; diff --git a/clients/gtest/auxiliary/bdsqr_gtest.cpp b/clients/gtest/auxiliary/bdsqr_gtest.cpp index b963f6029..4e78b0bea 100644 --- a/clients/gtest/auxiliary/bdsqr_gtest.cpp +++ b/clients/gtest/auxiliary/bdsqr_gtest.cpp @@ -112,7 +112,8 @@ Arguments bdsqr_setup_arguments(bdsqr_tuple tup) return arg; } -class BDSQR : public ::TestWithParam +template +class BDSQR_BASE : public ::TestWithParam { protected: void TearDown() override @@ -124,6 +125,7 @@ class BDSQR : public ::TestWithParam void run_tests() { Arguments arg = bdsqr_setup_arguments(GetParam()); + arg.alg_mode = MODE; if(arg.peek("n") == 0 && arg.peek("uplo") == 'L') testing_bdsqr_bad_arg(); @@ -132,6 +134,14 @@ class BDSQR : public ::TestWithParam } }; +class BDSQR : public BDSQR_BASE<0> +{ +}; + +class BDSQR_HYBRID : public BDSQR_BASE<1> +{ +}; + // non-batch tests TEST_P(BDSQR, __float) @@ -154,8 +164,36 @@ TEST_P(BDSQR, __double_complex) run_tests(); } +TEST_P(BDSQR_HYBRID, __float) +{ + run_tests(); +} + +TEST_P(BDSQR_HYBRID, __double) +{ + run_tests(); +} + +TEST_P(BDSQR_HYBRID, __float_complex) +{ + run_tests(); +} + +TEST_P(BDSQR_HYBRID, __double_complex) +{ + run_tests(); +} + INSTANTIATE_TEST_SUITE_P(daily_lapack, BDSQR, Combine(ValuesIn(large_size_range), ValuesIn(large_opt_range))); INSTANTIATE_TEST_SUITE_P(checkin_lapack, BDSQR, Combine(ValuesIn(size_range), ValuesIn(opt_range))); + +INSTANTIATE_TEST_SUITE_P(daily_lapack, + BDSQR_HYBRID, + Combine(ValuesIn(large_size_range), ValuesIn(large_opt_range))); + +INSTANTIATE_TEST_SUITE_P(checkin_lapack, + BDSQR_HYBRID, + Combine(ValuesIn(size_range), ValuesIn(opt_range))); diff --git a/clients/gtest/lapack/gesvd_gtest.cpp b/clients/gtest/lapack/gesvd_gtest.cpp index 
51acceda3..3ee770c1d 100644 --- a/clients/gtest/lapack/gesvd_gtest.cpp +++ b/clients/gtest/lapack/gesvd_gtest.cpp @@ -158,7 +158,8 @@ Arguments gesvd_setup_arguments(gesvd_tuple tup) return arg; } -class GESVD : public ::TestWithParam +template +class GESVD_BASE : public ::TestWithParam { protected: void TearDown() override @@ -170,6 +171,7 @@ class GESVD : public ::TestWithParam void run_tests() { Arguments arg = gesvd_setup_arguments(GetParam()); + arg.alg_mode = MODE; if(arg.peek("m") == 0 && arg.peek("n") == 0 && arg.peek("left_svect") == 'N' && arg.peek("right_svect") == 'N') @@ -180,6 +182,14 @@ class GESVD : public ::TestWithParam } }; +class GESVD : public GESVD_BASE<0> +{ +}; + +class GESVD_HYBRID : public GESVD_BASE<1> +{ +}; + // non-batch tests TEST_P(GESVD, __float) @@ -202,6 +212,26 @@ TEST_P(GESVD, __double_complex) run_tests(); } +TEST_P(GESVD_HYBRID, __float) +{ + run_tests(); +} + +TEST_P(GESVD_HYBRID, __double) +{ + run_tests(); +} + +TEST_P(GESVD_HYBRID, __float_complex) +{ + run_tests(); +} + +TEST_P(GESVD_HYBRID, __double_complex) +{ + run_tests(); +} + // batched tests TEST_P(GESVD, batched__float) @@ -224,6 +254,26 @@ TEST_P(GESVD, batched__double_complex) run_tests(); } +TEST_P(GESVD_HYBRID, batched__float) +{ + run_tests(); +} + +TEST_P(GESVD_HYBRID, batched__double) +{ + run_tests(); +} + +TEST_P(GESVD_HYBRID, batched__float_complex) +{ + run_tests(); +} + +TEST_P(GESVD_HYBRID, batched__double_complex) +{ + run_tests(); +} + // strided_batched tests TEST_P(GESVD, strided_batched__float) @@ -246,11 +296,36 @@ TEST_P(GESVD, strided_batched__double_complex) run_tests(); } -// daily_lapack tests normal execution with medium to large sizes +TEST_P(GESVD_HYBRID, strided_batched__float) +{ + run_tests(); +} + +TEST_P(GESVD_HYBRID, strided_batched__double) +{ + run_tests(); +} + +TEST_P(GESVD_HYBRID, strided_batched__float_complex) +{ + run_tests(); +} + +TEST_P(GESVD_HYBRID, strided_batched__double_complex) +{ + run_tests(); +} + INSTANTIATE_TEST_SUITE_P(daily_lapack, GESVD, Combine(ValuesIn(large_size_range), ValuesIn(large_opt_range))); -// checkin_lapack tests normal execution with small sizes, invalid sizes, -// quick returns, and corner cases INSTANTIATE_TEST_SUITE_P(checkin_lapack, GESVD, Combine(ValuesIn(size_range), ValuesIn(opt_range))); + +INSTANTIATE_TEST_SUITE_P(daily_lapack, + GESVD_HYBRID, + Combine(ValuesIn(large_size_range), ValuesIn(large_opt_range))); + +INSTANTIATE_TEST_SUITE_P(checkin_lapack, + GESVD_HYBRID, + Combine(ValuesIn(size_range), ValuesIn(opt_range))); diff --git a/docs/index.rst b/docs/index.rst index d7725421c..e1c39a266 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -8,7 +8,7 @@ rocSOLVER documentation ******************************************************************** -rocSOLVER is an implementation of `LAPACK routines `_ +rocSOLVER is an implementation of `LAPACK routines `_ on top of the :doc:`AMD ROCm platform `. rocSOLVER is implemented in the :doc:`HIP programming language ` and optimized for AMD GPUs. 
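The alg_mode option added to the benchmark client and to the BDSQR/GESVD test fixtures above maps onto the algorithm-selection API introduced later in this patch. A minimal sketch of how a client could translate the numeric argument into the library enum, using only names declared in this series; the helper function itself is illustrative and not part of the patch:

    // Illustrative helper (not part of the patch): map the clients' numeric
    // alg_mode argument (0 = GPU-only, 1 = hybrid) onto rocsolver_alg_mode
    // for a given function before running the solver.
    static rocblas_status set_mode_from_arg(rocblas_handle handle,
                                            rocsolver_function func,
                                            rocblas_int alg_mode)
    {
        rocsolver_alg_mode mode
            = (alg_mode != 0) ? rocsolver_alg_mode_hybrid : rocsolver_alg_mode_gpu;
        return rocsolver_set_alg_mode(handle, func, mode);
    }
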
@@ -39,7 +39,7 @@ The rocSOLVER documentation is structured as follows: * :ref:`lapackfunc` * :ref:`lapack-like` * :ref:`refactor` - * :ref:`api_logging` + * :ref:`helpers` * :ref:`tuning_label` * :ref:`deprecated` diff --git a/docs/reference/logging.rst b/docs/reference/helpers.rst similarity index 79% rename from docs/reference/logging.rst rename to docs/reference/helpers.rst index 26c02e42f..b643b6807 100644 --- a/docs/reference/logging.rst +++ b/docs/reference/helpers.rst @@ -2,12 +2,50 @@ :description: rocSOLVER documentation and API reference library :keywords: rocSOLVER, ROCm, API, documentation -.. _api_logging: +.. _helpers: ***************************************************** -rocSOLVER Logging Functions and Library Information +rocSOLVER Library and Logging Functions ***************************************************** + + +Library information +=============================== + +.. contents:: List of library information functions + :local: + :backlinks: top + +rocsolver_get_version_string() +------------------------------------ +.. doxygenfunction:: rocsolver_get_version_string + +rocsolver_get_version_string_size() +------------------------------------ +.. doxygenfunction:: rocsolver_get_version_string_size + + + +Algorithm selection +=============================== + +.. contents:: List of algorithm selection functions + :local: + :backlinks: top + +rocsolver_set_alg_mode() +------------------------------------ +.. doxygenfunction:: rocsolver_set_alg_mode + +rocsolver_get_alg_mode() +------------------------------------ +.. doxygenfunction:: rocsolver_get_alg_mode + + + +.. _api_logging: + Logging functions =============================== @@ -45,22 +83,3 @@ rocsolver_log_flush_profile() --------------------------------- .. doxygenfunction:: rocsolver_log_flush_profile - - -.. _libraryinfo: - -Library information -=============================== - -.. contents:: List of library information functions - :local: - :backlinks: top - -rocsolver_get_version_string() ------------------------------------- -.. doxygenfunction:: rocsolver_get_version_string - -rocsolver_get_version_string_size() ------------------------------------- -.. doxygenfunction:: rocsolver_get_version_string_size - diff --git a/docs/reference/index.rst b/docs/reference/index.rst index b82b12dc2..18a2fb338 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -8,15 +8,15 @@ Reference ######################################## -This section provides technical descriptions and important information about -the different rocSOLVER APIs and library components. +This section provides technical descriptions and important information about +the different rocSOLVER APIs and library components. * :ref:`intro` * :ref:`rocsolver-types` -* :ref:`rocsolver_auxiliary_functions` +* :ref:`rocsolver_auxiliary_functions` * :ref:`lapackfunc` * :ref:`lapack-like` * :ref:`refactor` -* :ref:`api_logging` +* :ref:`helpers` * :ref:`tuning_label` * :ref:`deprecated` diff --git a/docs/reference/types.rst b/docs/reference/types.rst index 8191098ba..9de6bcc35 100644 --- a/docs/reference/types.rst +++ b/docs/reference/types.rst @@ -66,3 +66,11 @@ rocsolver_rfinfo rocsolver_rfinfo_mode ------------------------ .. doxygenenum:: rocsolver_rfinfo_mode + +rocsolver_alg_mode +------------------------ +.. doxygentypedef:: rocsolver_alg_mode + +rocsolver_function +------------------------ +.. 
doxygentypedef:: rocsolver_function diff --git a/library/include/rocsolver/rocsolver-aliases.h b/library/include/rocsolver/rocsolver-aliases.h index fec1ea401..de938bf22 100644 --- a/library/include/rocsolver/rocsolver-aliases.h +++ b/library/include/rocsolver/rocsolver-aliases.h @@ -1,5 +1,5 @@ /* ************************************************************************** - * Copyright (C) 2019-2023 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (C) 2019-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -91,7 +91,7 @@ ROCSOLVER_DEPRECATED_X("use rocblas_fill") typedef rocblas_fill rocsolver_fill; */ ROCSOLVER_DEPRECATED_X("use rocblas_diagonal") typedef rocblas_diagonal rocsolver_diagonal; -/*! \deprecated Use \c rocblas_stide. +/*! \deprecated Use \c rocblas_side. */ ROCSOLVER_DEPRECATED_X("use rocblas_side") typedef rocblas_side rocsolver_side; diff --git a/library/include/rocsolver/rocsolver-extra-types.h b/library/include/rocsolver/rocsolver-extra-types.h index 89ea80692..d423a69e6 100644 --- a/library/include/rocsolver/rocsolver-extra-types.h +++ b/library/include/rocsolver/rocsolver-extra-types.h @@ -1,5 +1,5 @@ /* ************************************************************************** - * Copyright (C) 2019-2023 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (C) 2019-2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -164,4 +164,21 @@ typedef enum rocsolver_rfinfo_mode_ = 272, /**< To work with Cholesky factorization (for symmetric positive definite sparse matrices). */ } rocsolver_rfinfo_mode; +/*! \brief Used by specific functions to specify the algorithm mode. + ********************************************************************************/ +typedef enum rocsolver_alg_mode_ +{ + rocsolver_alg_mode_gpu + = 281, /**< Computations are all performed on the GPU. This is the default mode. */ + rocsolver_alg_mode_hybrid = 282, /**< Computations are performed on the CPU and GPU. */ +} rocsolver_alg_mode; + +/*! \brief Used to specify a function with multiple supported algorithm modes. + ********************************************************************************/ +typedef enum rocsolver_function_ +{ + rocsolver_function_bdsqr = 401, + rocsolver_function_gesvd = 402, +} rocsolver_function; + #endif /* ROCSOLVER_EXTRA_TYPES_H */ diff --git a/library/include/rocsolver/rocsolver-functions.h b/library/include/rocsolver/rocsolver-functions.h index 250d07b2a..eec72d3c8 100644 --- a/library/include/rocsolver/rocsolver-functions.h +++ b/library/include/rocsolver/rocsolver-functions.h @@ -141,6 +141,45 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_log_write_profile(void); ROCSOLVER_EXPORT rocblas_status rocsolver_log_flush_profile(void); +/* + * =========================================================================== + * Hybrid algorithm enablement + * =========================================================================== + */ + +/*! \brief SET_ALG_MODE sets the algorithm mode to be used by a specific function. + + @param[in] + handle rocblas_handle. + @param[in] + func #rocsolver_function. + The function that will use the selected algorithm mode. + @param[in] + mode #rocsolver_alg_mode. + The algorithm mode that will be used by the specified function. 
+ *************************************************************************/ + +ROCSOLVER_EXPORT rocblas_status rocsolver_set_alg_mode(rocblas_handle handle, + const rocsolver_function func, + const rocsolver_alg_mode mode); + +/*! \brief GET_ALG_MODE gets the algorithm mode selected for use by a specific function. + + @param[in] + handle rocblas_handle. + @param[in] + func #rocsolver_function. + A function. + @param[out] + mode pointer to #rocsolver_alg_mode. + On exit, the value is overwritten by the algorithm mode that will + be used by the specified function. + *************************************************************************/ + +ROCSOLVER_EXPORT rocblas_status rocsolver_get_alg_mode(rocblas_handle handle, + const rocsolver_function func, + rocsolver_alg_mode* mode); + /* * =========================================================================== * Auxiliary functions @@ -3708,6 +3747,10 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_zunmtr(rocblas_handle handle, In order to carry out calculations, this method may synchronize the stream contained within the rocblas_handle. + \note + A hybrid (CPU+GPU) approach is available for BDSQR. Use \ref rocsolver_set_alg_mode "SET_ALG_MODE" + to enable it. + @param[in] handle rocblas_handle. @param[in] @@ -12443,8 +12486,12 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_zpotri_strided_batched(rocblas_handle the "Tuning rocSOLVER performance" and "Memory model" sections of the documentation. \note - In order to carry out calculations, this method may synchronize the stream contained within the - rocblas_handle. + In order to carry out calculations, this method may synchronize the stream contained + within the rocblas_handle. + + \note + A hybrid (CPU+GPU) approach is available for GESVD. Use \ref rocsolver_set_alg_mode "SET_ALG_MODE" + to enable it. @param[in] handle rocblas_handle. @@ -12620,8 +12667,12 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_zgesvd(rocblas_handle handle, and "Memory model" sections of the documentation. \note - In order to carry out calculations, this method may synchronize the stream contained within the - rocblas_handle. + In order to carry out calculations, this method may synchronize the stream contained + within the rocblas_handle. + + \note + A hybrid (CPU+GPU) approach is available for GESVD_BATCHED. Use \ref rocsolver_set_alg_mode "SET_ALG_MODE" + to enable it. @param[in] handle rocblas_handle. @@ -12838,8 +12889,12 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_zgesvd_batched(rocblas_handle handle, and "Memory model" sections of the documentation. \note - In order to carry out calculations, this method may synchronize the stream contained within the - rocblas_handle. + In order to carry out calculations, this method may synchronize the stream contained + within the rocblas_handle. + + \note + A hybrid (CPU+GPU) approach is available for GESVD_STRIDED_BATCHED. Use \ref rocsolver_set_alg_mode "SET_ALG_MODE" + to enable it. @param[in] handle rocblas_handle. 
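Taken together, the two new entry points are meant to be called on the handle before invoking a solver with hybrid support (BDSQR, GESVD and its batched variants, per the notes above). A minimal usage sketch, assuming the usual umbrella header rocsolver/rocsolver.h and a hypothetical wrapper name; rocblas_create_handle and rocblas_destroy_handle come from rocBLAS, everything else is declared in this patch:

    #include <rocsolver/rocsolver.h>

    // Opt a handle into the hybrid (CPU+GPU) BDSQR path and verify the setting
    // with the matching query; returns the first non-success status encountered.
    rocblas_status enable_hybrid_bdsqr(rocblas_handle handle)
    {
        rocblas_status st
            = rocsolver_set_alg_mode(handle, rocsolver_function_bdsqr, rocsolver_alg_mode_hybrid);
        if(st != rocblas_status_success)
            return st;

        rocsolver_alg_mode mode;
        st = rocsolver_get_alg_mode(handle, rocsolver_function_bdsqr, &mode);
        // mode is expected to be rocsolver_alg_mode_hybrid here; functions that
        // were never configured keep the default rocsolver_alg_mode_gpu.
        return st;
    }
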
diff --git a/library/src/CMakeLists.txt b/library/src/CMakeLists.txt index e31b6ece0..5b2e75e34 100755 --- a/library/src/CMakeLists.txt +++ b/library/src/CMakeLists.txt @@ -343,6 +343,7 @@ endif() set(auxiliaries common/buildinfo.cpp + common/rocsolver_handle.cpp common/rocsolver_logger.cpp common/rocsparse.cpp ) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr.hpp b/library/src/auxiliary/rocauxiliary_bdsqr.hpp index 51c4b11b3..89397dbc8 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr.hpp @@ -38,7 +38,7 @@ #include -#include "rocauxiliary_bdsqr_host.hpp" +#include "rocauxiliary_bdsqr_hybrid.hpp" ROCSOLVER_BEGIN_NAMESPACE @@ -1209,6 +1209,9 @@ rocblas_status rocsolver_bdsqr_template(rocblas_handle handle, hipStream_t stream; rocblas_get_stream(handle, &stream); + rocsolver_alg_mode alg_mode; + ROCBLAS_CHECK(rocsolver_get_alg_mode(handle, rocsolver_function_bdsqr, &alg_mode)); + // set tolerance and max number of iterations: // machine precision (considering rounding strategy) S eps = get_epsilon() / 2; @@ -1251,79 +1254,83 @@ rocblas_status rocsolver_bdsqr_template(rocblas_handle handle, ROCSOLVER_LAUNCH_KERNEL((bdsqr_init), gridBasic, threadsBasic, 0, stream, n, D, strideD, E, strideE, info, maxiter, sfm, tol, splits_map, work, strideW, completed); - bool const use_bdsqr_host = true; - if(use_bdsqr_host) + if(alg_mode == rocsolver_alg_mode_hybrid) { - ROCBLAS_CHECK(rocsolver_bdsqr_host_batch_template( - handle, uplo, n, nv, nu, nc, D, strideD, E, strideE, V, shiftV, ldv, strideV, U, - shiftU, ldu, strideU, C, shiftC, ldc, strideC, info, batch_count, splits_map, work)); + ROCBLAS_CHECK(rocsolver_bdsqr_host_batch_template( + handle, uplo, n, nv, nu, nc, D, strideD, E, strideE, V, shiftV, ldv, strideV, U, shiftU, + ldu, strideU, C, shiftC, ldc, strideC, info, batch_count, splits_map, work)); } else { - // rotate to upper bidiagonal if necessary - if(uplo == rocblas_fill_lower) + if(n > 1) { - ROCSOLVER_LAUNCH_KERNEL((bdsqr_lower2upper), gridBasic, threadsUC, 0, stream, n, nu, - nc, D, strideD, E, strideE, U, shiftU, ldu, strideU, C, shiftC, - ldc, strideC, info, work, strideW, completed); - } + // rotate to upper bidiagonal if necessary + if(uplo == rocblas_fill_lower) + { + ROCSOLVER_LAUNCH_KERNEL((bdsqr_lower2upper), gridBasic, threadsUC, 0, stream, n, + nu, nc, D, strideD, E, strideE, U, shiftU, ldu, strideU, C, + shiftC, ldc, strideC, info, work, strideW, completed); + } - rocblas_int h_iter = 0; - struct - { - rocblas_int completed; - rocblas_int num_splits; - } h_params; + rocblas_int h_iter = 0; + struct + { + rocblas_int completed; + rocblas_int num_splits; + } h_params; - while(h_iter < maxiter) - { - // if all instances in the batch have finished, exit the loop - HIP_CHECK(hipMemcpyAsync(&h_params, completed, sizeof(h_params), hipMemcpyDeviceToHost, - stream)); - HIP_CHECK(hipStreamSynchronize(stream)); + while(h_iter < maxiter) + { + // if all instances in the batch have finished, exit the loop + HIP_CHECK(hipMemcpyAsync(&h_params, completed, sizeof(h_params), + hipMemcpyDeviceToHost, stream)); + HIP_CHECK(hipStreamSynchronize(stream)); - if(h_params.completed == batch_count) - break; + if(h_params.completed == batch_count) + break; - dim3 gridSplits(1, h_params.num_splits, batch_count); - dim3 gridVUC((nvuc_max - 1) / BS1 + 1, h_params.num_splits, batch_count); + dim3 gridSplits(1, h_params.num_splits, batch_count); + dim3 gridVUC((nvuc_max - 1) / BS1 + 1, h_params.num_splits, batch_count); - for(rocblas_int inner_iters 
= 0; inner_iters < BDSQR_ITERS_PER_SYNC; inner_iters++) - { - if(nvuc_max <= BDSQR_SWITCH_SIZE) + for(rocblas_int inner_iters = 0; inner_iters < BDSQR_ITERS_PER_SYNC; inner_iters++) { - // main computation of SVD - ROCSOLVER_LAUNCH_KERNEL((bdsqr_compute), gridSplits, threadsBS1, 0, - stream, n, nv, nu, nc, D, strideD, E, strideE, V, - shiftV, ldv, strideV, U, shiftU, ldu, strideU, C, - shiftC, ldc, strideC, maxiter, eps, sfm, tol, minshift, - splits_map, work, incW, strideW, completed); - } - else - { - // main computation of SVD - ROCSOLVER_LAUNCH_KERNEL( - (bdsqr_compute), gridSplits, threadsBS1, 0, stream, n, nv, nu, nc, - D, strideD, E, strideE, (W1) nullptr, shiftV, ldv, strideV, (W2) nullptr, - shiftU, ldu, strideU, (W3) nullptr, shiftC, ldc, strideC, maxiter, eps, sfm, - tol, minshift, splits_map, work, incW, strideW, completed); - - // update singular vectors - ROCSOLVER_LAUNCH_KERNEL((bdsqr_rotate), gridVUC, threadsVUC, 0, stream, n, - nv, nu, nc, V, shiftV, ldv, strideV, U, shiftU, ldu, - strideU, C, shiftC, ldc, strideC, maxiter, splits_map, - work, incW, strideW, completed); + if(nvuc_max <= BDSQR_SWITCH_SIZE) + { + // main computation of SVD + ROCSOLVER_LAUNCH_KERNEL((bdsqr_compute), gridSplits, threadsBS1, 0, + stream, n, nv, nu, nc, D, strideD, E, strideE, V, + shiftV, ldv, strideV, U, shiftU, ldu, strideU, C, + shiftC, ldc, strideC, maxiter, eps, sfm, tol, + minshift, splits_map, work, incW, strideW, completed); + } + else + { + // main computation of SVD + ROCSOLVER_LAUNCH_KERNEL((bdsqr_compute), gridSplits, threadsBS1, 0, + stream, n, nv, nu, nc, D, strideD, E, strideE, + (W1) nullptr, shiftV, ldv, strideV, (W2) nullptr, + shiftU, ldu, strideU, (W3) nullptr, shiftC, ldc, + strideC, maxiter, eps, sfm, tol, minshift, + splits_map, work, incW, strideW, completed); + + // update singular vectors + ROCSOLVER_LAUNCH_KERNEL((bdsqr_rotate), gridVUC, threadsVUC, 0, stream, + n, nv, nu, nc, V, shiftV, ldv, strideV, U, shiftU, + ldu, strideU, C, shiftC, ldc, strideC, maxiter, + splits_map, work, incW, strideW, completed); + } + + // update split block endpoints + ROCSOLVER_LAUNCH_KERNEL((bdsqr_update_endpoints), gridSplits, threadsBasic, + 0, stream, n, E, strideE, splits_map, work, strideW, + completed); } - // update split block endpoints - ROCSOLVER_LAUNCH_KERNEL((bdsqr_update_endpoints), gridSplits, threadsBasic, 0, - stream, n, E, strideE, splits_map, work, strideW, completed); + // check for completion + h_iter += BDSQR_ITERS_PER_SYNC; + ROCSOLVER_LAUNCH_KERNEL((bdsqr_chk_completed), gridBasic, threadsBasic, 0, + stream, n, maxiter, splits_map, work, strideW, completed); } - - // check for completion - h_iter += BDSQR_ITERS_PER_SYNC; - ROCSOLVER_LAUNCH_KERNEL((bdsqr_chk_completed), gridBasic, threadsBasic, 0, stream, n, - maxiter, splits_map, work, strideW, completed); } } diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp similarity index 83% rename from library/src/auxiliary/rocauxiliary_bdsqr_host.hpp rename to library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp index 3bf801519..d304c8f61 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_host.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp @@ -734,306 +734,6 @@ static void swap_template(I const n, T* x, I const incx, T* y, I const incy, hip n, x, incx, y, incy); } -extern "C" { - -double dlamch_(char* cmach); -float slamch_(char* cmach); - -void zswap_(int* n, std::complex* zx, int* incx, std::complex* zy, int* incy); - -void cswap_(int* n, 
std::complex* zx, int* incx, std::complex* zy, int* incy); - -void dswap_(int* n, double* zx, int* incx, double* zy, int* incy); - -void sswap_(int* n, float* zx, int* incx, float* zy, int* incy); - -void dlasq1_(int* n, double* D_, double* E_, double* rwork_, int* info_arg); -void slasq1_(int* n, float* D_, float* E_, float* rwork_, int* info_arg); - -void zlasr_(char* side, - char* pivot, - char* direct, - int* m, - int* n, - double* c, - double* s, - std::complex* A, - int* lda); -void clasr_(char* side, - char* pivot, - char* direct, - int* m, - int* n, - float* c, - float* s, - std::complex* A, - int* lda); -void slasr_(char* side, char* pivot, char* direct, int* m, int* n, float* c, float* s, float* A, int* lda); -void dlasr_(char* side, char* pivot, char* direct, int* m, int* n, double* c, double* s, double* A, int* lda); - -void dlasv2_(double* f, - double* g, - double* h, - double* ssmin, - double* ssmax, - double* snr, - double* csr, - double* snl, - double* csl); -void slasv2_(float* f, - float* g, - float* h, - float* ssmin, - float* ssmax, - float* snr, - float* csr, - float* snl, - float* csl); - -void zdrot_(int* n, - std::complex* zx, - int* incx, - std::complex* zy, - int* incy, - double* c, - double* s); - -void csrot_(int* n, - std::complex* zx, - int* incx, - std::complex* zy, - int* incy, - float* c, - float* s); - -void drot_(int* n, double* dx, int* incx, double* dy, int* incy, double* c, double* s); - -void srot_(int* n, float* dx, int* incx, float* dy, int* incy, float* c, float* s); - -void zdscal_(int* n, double* da, std::complex* zx, int* incx); -void csscal_(int* n, float* da, std::complex* zx, int* incx); -void zscal_(int* n, std::complex* za, std::complex* zx, int* incx); -void cscal_(int* n, std::complex* za, std::complex* zx, int* incx); -void dscal_(int* n, double* da, double* zx, int* incx); -void sscal_(int* n, float* da, float* zx, int* incx); - -void dlartg_(double* f, double* g, double* c, double* s, double* r); -void slartg_(float* f, float* g, float* c, float* s, float* r); - -void zlartg_(std::complex* f, - std::complex* g, - double* c, - std::complex* s, - std::complex* r); -void clartg_(std::complex* f, - std::complex* g, - float* c, - std::complex* s, - std::complex* r); - -void dlas2_(double* f, double* g, double* h, double* ssmin, double* ssmax); -void slas2_(float* f, float* g, float* h, float* ssmin, float* ssmax); -}; - -extern "C" { - -void cbdsqr_(char* uplo, - int* n, - int* ncvt, - int* nru, - int* ncc, - float* d, - float* e, - std::complex* vt, - int* ldvt, - std::complex* u, - int* ldu, - std::complex* c, - int* ldc, - float* rwork, - int* info); - -void zbdsqr_(char* uplo, - int* n, - int* ncvt, - int* nru, - int* ncc, - double* d, - double* e, - std::complex* vt, - int* ldvt, - std::complex* u, - int* ldu, - std::complex* c, - int* ldc, - double* rwork, - int* info); - -void sbdsqr_(char* uplo, - int* n, - int* ncvt, - int* nru, - int* ncc, - float* d, - float* e, - float* vt, - int* ldvt, - float* u, - int* ldu, - float* c, - int* ldc, - float* rwork, - int* info); - -void dbdsqr_(char* uplo, - int* n, - int* ncvt, - int* nru, - int* ncc, - double* d, - double* e, - double* vt, - int* ldvt, - double* u, - int* ldu, - double* c, - int* ldc, - double* rwork, - int* info); -}; - -static void call_bdsqr(char& uplo, - int& n, - int& ncvt, - int& nru, - int& ncc, - double& d, - double& e, - std::complex& vt, - int& ldvt, - std::complex& u, - int& ldu, - std::complex& c, - int& ldc, - double& rwork, - int& info) -{ - zbdsqr_(&uplo, &n, 
&ncvt, &nru, &ncc, &d, &e, (std::complex*)&vt, &ldvt, - (std::complex*)&u, &ldu, (std::complex*)&c, &ldc, &rwork, &info); -} - -static void call_bdsqr(char& uplo, - int& n, - int& ncvt, - int& nru, - int& ncc, - double& d, - double& e, - rocblas_complex_num& vt, - int& ldvt, - rocblas_complex_num& u, - int& ldu, - rocblas_complex_num& c, - int& ldc, - double& rwork, - int& info) -{ - zbdsqr_(&uplo, &n, &ncvt, &nru, &ncc, &d, &e, (std::complex*)&vt, &ldvt, - (std::complex*)&u, &ldu, (std::complex*)&c, &ldc, &rwork, &info); -} - -static void call_bdsqr(char& uplo, - int& n, - int& ncvt, - int& nru, - int& ncc, - float& d, - float& e, - std::complex& vt, - int& ldvt, - std::complex& u, - int& ldu, - std::complex& c, - int& ldc, - float& rwork, - int& info) -{ - cbdsqr_(&uplo, &n, &ncvt, &nru, &ncc, &d, &e, (std::complex*)&vt, &ldvt, - (std::complex*)&u, &ldu, (std::complex*)&c, &ldc, &rwork, &info); -} - -static void call_bdsqr(char& uplo, - int& n, - int& ncvt, - int& nru, - int& ncc, - float& d, - float& e, - float& vt, - int& ldvt, - float& u, - int& ldu, - float& c, - int& ldc, - float& rwork, - int& info) -{ - sbdsqr_(&uplo, &n, &ncvt, &nru, &ncc, &d, &e, &vt, &ldvt, &u, &ldu, &c, &ldc, &rwork, &info); -} - -static void call_bdsqr(char& uplo, - int& n, - int& ncvt, - int& nru, - int& ncc, - float& d, - float& e, - rocblas_complex_num& vt, - int& ldvt, - rocblas_complex_num& u, - int& ldu, - rocblas_complex_num& c, - int& ldc, - float& rwork, - int& info) -{ - cbdsqr_(&uplo, &n, &ncvt, &nru, &ncc, &d, &e, (std::complex*)&vt, &ldvt, - (std::complex*)&u, &ldu, (std::complex*)&c, &ldc, &rwork, &info); -} - -static void call_bdsqr(char& uplo, - int& n, - int& ncvt, - int& nru, - int& ncc, - double& d, - double& e, - double& vt, - int& ldvt, - double& u, - int& ldu, - double& c, - int& ldc, - double& rwork, - int& info) -{ - dbdsqr_(&uplo, &n, &ncvt, &nru, &ncc, &d, &e, &vt, &ldvt, &u, &ldu, &c, &ldc, &rwork, &info); -} - -#ifdef USE_LAPACK -static void call_lamch(char& cmach_arg, double& eps) -{ - char cmach = cmach_arg; - eps = dlamch_(&cmach); -} - -static void call_lamch(char& cmach_arg, float& eps) -{ - char cmach = cmach_arg; - eps = slamch_(&cmach); -} -#else - static void call_lamch(char& cmach, double& eps) { eps = ((cmach == 'E') || (cmach == 'e')) ? 
std::numeric_limits::epsilon() @@ -1050,48 +750,6 @@ static void call_lamch(char& cmach, float& eps) : std::numeric_limits::min(); } -#endif - -#ifdef USE_LAPACK -static void call_swap(int& n, - rocblas_complex_num& zx, - int& incx, - rocblas_complex_num& zy, - int& incy) -{ - cswap_(&n, (std::complex*)&zx, &incx, (std::complex*)&zy, &incy); -} - -static void call_swap(int& n, std::complex& zx, int& incx, std::complex& zy, int& incy) -{ - cswap_(&n, &zx, &incx, &zy, &incy); -} - -static void call_swap(int& n, - rocblas_complex_num& zx, - int& incx, - rocblas_complex_num& zy, - int& incy) -{ - zswap_(&n, (std::complex*)&zx, &incx, (std::complex*)&zy, &incy); -} - -static void call_swap(int& n, std::complex& zx, int& incx, std::complex& zy, int& incy) -{ - zswap_(&n, &zx, &incx, &zy, &incy); -} - -static void call_swap(int& n, float& zx, int& incx, float& zy, int& incy) -{ - sswap_(&n, &zx, &incx, &zy, &incy); -} - -static void call_swap(int& n, double& zx, int& incx, double& zy, int& incy) -{ - dswap_(&n, &zx, &incx, &zy, &incy); -} -#else - template static void call_swap(I& n, T& x_in, I& incx, T& y_in, I& incy) { @@ -1108,20 +766,6 @@ static void call_swap(I& n, T& x_in, I& incx, T& y_in, I& incy) } } -#endif - -#ifdef USE_LAPACK -static void call_las2(double& f, double& g, double& h, double& ssmin, double& ssmax) -{ - dlas2_(&f, &g, &h, &ssmin, &ssmax); -} - -static void call_las2(float& f, float& g, float& h, float& ssmin, float& ssmax) -{ - slas2_(&f, &g, &h, &ssmin, &ssmax); -} -#else - template static void call_las2(T& f, T& g, T& h, T& ssmin, T& ssmax) { @@ -1193,39 +837,6 @@ static void call_las2(T& f, T& g, T& h, T& ssmin, T& ssmax) } } -#endif - -#ifdef USE_LAPACK - -static void call_lartg(double& f, double& g, double& c, double& s, double& r) -{ - dlartg_(&f, &g, &c, &s, &r); -} - -static void call_lartg(float& f, float& g, float& c, float& s, float& r) -{ - slartg_(&f, &g, &c, &s, &r); -} - -static void call_lartg(std::complex& f, - std::complex& g, - float& c, - std::complex& s, - std::complex& r) -{ - clartg_(&f, &g, &c, &s, &r); -} - -static void call_lartg(std::complex& f, - std::complex& g, - double& c, - std::complex& s, - std::complex& r) -{ - zlartg_(&f, &g, &c, &s, &r); -} -#else - static float real_part(float z) { return (z); @@ -1503,60 +1114,6 @@ static void call_lartg(T& f, T& g, S& cs, T& sn, T& r) } } -#endif - -#ifdef USE_LAPACK -static void call_scal(int& n, rocblas_complex_num& da, rocblas_complex_num& zx, int& incx) -{ - cscal_(&n, (std::complex*)&da, (std::complex*)&zx, &incx); -} - -static void call_scal(int& n, std::complex& da, std::complex& zx, int& incx) -{ - cscal_(&n, &da, &zx, &incx); -} - -static void - call_scal(int& n, rocblas_complex_num& da, rocblas_complex_num& zx, int& incx) -{ - zscal_(&n, (std::complex*)&da, (std::complex*)&zx, &incx); -} - -static void call_scal(int& n, std::complex& da, std::complex& zx, int& incx) -{ - zscal_(&n, &da, &zx, &incx); -} - -static void call_scal(int& n, double& da, rocblas_complex_num& zx, int& incx) -{ - zdscal_(&n, &da, (std::complex*)&zx, &incx); -} - -static void call_scal(int& n, double& da, std::complex& zx, int& incx) -{ - zdscal_(&n, &da, &zx, &incx); -} - -static void call_scal(int& n, float& da, rocblas_complex_num& zx, int& incx) -{ - csscal_(&n, &da, (std::complex*)&zx, &incx); -} - -static void call_scal(int& n, float& da, std::complex& zx, int& incx) -{ - csscal_(&n, &da, &zx, &incx); -} - -static void call_scal(int& n, double& da, double& zx, int& incx) -{ - dscal_(&n, &da, &zx, &incx); -} - 
-static void call_scal(int& n, float& da, float& zx, int& incx) -{ - sscal_(&n, &da, &zx, &incx); -} -#else template static void call_scal(I& n, S& a, T& x_in, I& incx) { @@ -1576,64 +1133,6 @@ static void call_scal(I& n, S& a, T& x_in, I& incx) }; } -#endif - -#ifdef USE_LAPACK -static void call_rot(int& n, - std::complex& zx, - int& incx, - std::complex& zy, - int& incy, - float& c, - float& s) -{ - csrot_(&n, &zx, &incx, &zy, &incy, &c, &s); -} - -static void call_rot(int& n, - rocblas_complex_num& zx, - int& incx, - rocblas_complex_num& zy, - int& incy, - float& c, - float& s) -{ - csrot_(&n, (std::complex*)&zx, &incx, (std::complex*)&zy, &incy, &c, &s); -} - -static void call_rot(int& n, - std::complex& zx, - int& incx, - std::complex& zy, - int& incy, - double& c, - double& s) -{ - zdrot_(&n, &zx, &incx, &zy, &incy, &c, &s); -} - -static void call_rot(int& n, - rocblas_complex_num& zx, - int& incx, - rocblas_complex_num& zy, - int& incy, - double& c, - double& s) -{ - zdrot_(&n, (std::complex*)&zx, &incx, (std::complex*)&zy, &incy, &c, &s); -} - -static void call_rot(int& n, double& dx, int& incx, double& dy, int& incy, double& c, double& s) -{ - drot_(&n, &dx, &incx, &dy, &incy, &c, &s); -} - -static void call_rot(int& n, float& dx, int& incx, float& dy, int& incy, float& c, float& s) -{ - srot_(&n, &dx, &incx, &dy, &incy, &c, &s); -} -#else - template static void call_rot(I& n, T& x_in, I& incx, T& y_in, I& incy, S& c, S& s) { @@ -1651,35 +1150,6 @@ static void call_rot(I& n, T& x_in, I& incx, T& y_in, I& incy, S& c, S& s) } } -#endif - -#ifdef USE_LAPACK -static void call_lasv2(double& f, - double& g, - double& h, - double& ssmin, - double& ssmax, - double& snr, - double& csr, - double& snl, - double& csl) -{ - dlasv2_(&f, &g, &h, &ssmin, &ssmax, &snr, &csr, &snl, &csl); -} - -static void call_lasv2(float& f, - float& g, - float& h, - float& ssmin, - float& ssmax, - float& snr, - float& csr, - float& snl, - float& csl) -{ - slasv2_(&f, &g, &h, &ssmin, &ssmax, &snr, &csr, &snl, &csl); -} -#else // -------------------------------------------------------- // lasv2 computes the singular value decomposition of a 2 x 2 // triangular matrix @@ -1899,90 +1369,6 @@ static void call_lasv2(T& f, T& g, T& h, T& ssmin, T& ssmax, T& snr, T& csr, T& ssmin = sign(ssmin, tsign * sign(one, f) * sign(one, h)); } -#endif - -static void call_lasq1(int& n, double& D_, double& E_, double& rwork_, int& info_arg) -{ - dlasq1_(&n, &D_, &E_, &rwork_, &info_arg); -}; - -static void call_lasq1(int& n, float& D_, float& E_, float& rwork_, int& info_arg) -{ - slasq1_(&n, &D_, &E_, &rwork_, &info_arg); -}; - -#ifdef USE_LAPACK -static void call_lasr(char& side, - char& pivot, - char& direct, - int& m, - int& n, - float& c, - float& s, - rocblas_complex_num& A, - int& lda) -{ - clasr_(&side, &pivot, &direct, &m, &n, &c, &s, (std::complex*)&A, &lda); -}; - -static void call_lasr(char& side, - char& pivot, - char& direct, - int& m, - int& n, - double& c, - double& s, - rocblas_complex_num& A, - int& lda) -{ - zlasr_(&side, &pivot, &direct, &m, &n, &c, &s, (std::complex*)&A, &lda); -}; - -static void call_lasr(char& side, - char& pivot, - char& direct, - int& m, - int& n, - double& c, - double& s, - std::complex& A, - int& lda) -{ - zlasr_(&side, &pivot, &direct, &m, &n, &c, &s, &A, &lda); -}; - -static void call_lasr(char& side, - char& pivot, - char& direct, - int& m, - int& n, - float& c, - float& s, - std::complex& A, - int& lda) -{ - clasr_(&side, &pivot, &direct, &m, &n, &c, &s, &A, &lda); -}; - -static void - 
call_lasr(char& side, char& pivot, char& direct, int& m, int& n, float& c, float& s, float& A, int& lda) -{ - slasr_(&side, &pivot, &direct, &m, &n, &c, &s, &A, &lda); -}; - -static void call_lasr(char& side, - char& pivot, - char& direct, - int& m, - int& n, - double& c, - double& s, - double& A, - int& lda) -{ - dlasr_(&side, &pivot, &direct, &m, &n, &c, &s, &A, &lda); -}; -#else template static void call_lasr(char& side, char& pivot, char& direct, I& m, I& n, S& c, S& s, T& A, I& lda) { @@ -1992,8 +1378,6 @@ static void call_lasr(char& side, char& pivot, char& direct, I& m, I& n, S& c, S lasr_body(side, pivot, direct, m, n, &c, &s, &A, lda, tid, i_inc); }; -#endif - template static void bdsqr_single_template(char uplo, I n, @@ -2025,13 +1409,6 @@ static void bdsqr_single_template(char uplo, // ----------------------------------- bool constexpr need_sort = false; - // --------------------------------------------------- - // NOTE: lasq1 may return non-zero info value that - // has a different meaning - // Consider turning off lasq1 to have consistent info value - // --------------------------------------------------- - bool constexpr use_lasq1 = false; - S const zero = 0; S const one = 1; S negone = -1; @@ -2194,19 +1571,6 @@ static void bdsqr_single_template(char uplo, if(n == 1) goto L160; - /* - * if no singular vectors desired, use qd algorithm - */ - if((!rotate) && (use_lasq1)) - { - call_lasq1(n, d(1), e(1), work(1), info); - /* - * if info equals 2, dqds didn't finish, try to finish - */ - if(info != 2) - return; - info = 0; - } nm1 = n - 1; nm12 = nm1 + nm1; @@ -3680,40 +3044,14 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, I nrv = n; I ncvt = nv; bool const values_only = (ncvt == 0) && (nru == 0) && (ncc == 0); - bool const use_lapack_bdsqr = false; - - if((use_lapack_bdsqr) && (values_only)) - { - // -------------------------------- - // call the lapack version of bdsqr - // -------------------------------- - auto ln = n; - auto lncvt = ncvt; - auto lnru = nru; - auto lncc = ncc; - S& d_arg = d_[0]; - S& e_arg = e_[0]; - T& vt_arg = vt_[0]; - T& u_arg = u_[0]; - T& c_arg = c_[0]; - S& work_arg = work_[0]; - auto ldvt_arg = ldvt; - auto ldu_arg = ldu; - auto ldc_arg = ldc; - - call_bdsqr(uplo, ln, lncvt, lnru, lncc, d_arg, e_arg, vt_arg, ldvt_arg, u_arg, ldu_arg, - c_arg, ldu_arg, work_arg, info); - } - else - { - bdsqr_single_template(uplo, n, ncvt, nru, ncc, - d_, e_, + bdsqr_single_template(uplo, n, ncvt, nru, ncc, - vt_, ldvt, u_, ldu, c_, ldc, + d_, e_, - work_, info, dwork_, stream); - } + vt_, ldvt, u_, ldu, c_, ldc, + + work_, info, dwork_, stream); if(info == 0) { diff --git a/library/src/common/rocsolver_handle.cpp b/library/src/common/rocsolver_handle.cpp new file mode 100644 index 000000000..949d60fd2 --- /dev/null +++ b/library/src/common/rocsolver_handle.cpp @@ -0,0 +1,132 @@ +/* ************************************************************************** + * Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * *************************************************************************/ + +#include "rocsolver_handle.hpp" +#include "rocblas.hpp" + +#include + +ROCSOLVER_BEGIN_NAMESPACE + +rocblas_status rocsolver_set_alg_mode_impl(rocblas_handle handle, + const rocsolver_function func, + const rocsolver_alg_mode mode) +{ + if(!handle) + return rocblas_status_invalid_handle; + + std::shared_ptr handle_ptr; + ROCBLAS_CHECK(rocblas_internal_get_data_ptr(handle, handle_ptr)); + rocsolver_handle_data handle_data = (rocsolver_handle_data)handle_ptr.get(); + + if(handle_data == nullptr) + { + handle_ptr = std::make_shared(); + handle_data = (rocsolver_handle_data)handle_ptr.get(); + handle_data->checksum = sizeof(rocsolver_handle_data_); + + ROCBLAS_CHECK(rocblas_internal_set_data_ptr(handle, handle_ptr)); + } + else + { + if(handle_data->checksum != sizeof(rocsolver_handle_data_)) + return rocblas_status_internal_error; + } + + switch(func) + { + case rocsolver_function_gesvd: + case rocsolver_function_bdsqr: + if(mode == rocsolver_alg_mode_gpu || mode == rocsolver_alg_mode_hybrid) + { + handle_data->bdsqr_mode = mode; + return rocblas_status_success; + } + } + + return rocblas_status_invalid_value; +} + +rocblas_status rocsolver_get_alg_mode_impl(rocblas_handle handle, + const rocsolver_function func, + rocsolver_alg_mode* mode) +{ + if(!handle) + return rocblas_status_invalid_handle; + + std::shared_ptr handle_ptr; + ROCBLAS_CHECK(rocblas_internal_get_data_ptr(handle, handle_ptr)); + rocsolver_handle_data handle_data = (rocsolver_handle_data)handle_ptr.get(); + + if(handle_data == nullptr) + { + *mode = rocsolver_alg_mode_gpu; + } + else + { + if(handle_data->checksum != sizeof(rocsolver_handle_data_)) + return rocblas_status_internal_error; + + switch(func) + { + case rocsolver_function_gesvd: + case rocsolver_function_bdsqr: *mode = handle_data->bdsqr_mode; break; + default: return rocblas_status_invalid_value; + } + } + + return rocblas_status_success; +} + +ROCSOLVER_END_NAMESPACE + +extern "C" { + +rocblas_status rocsolver_set_alg_mode(rocblas_handle handle, + const rocsolver_function func, + const rocsolver_alg_mode mode) +try +{ + return rocsolver::rocsolver_set_alg_mode_impl(handle, func, mode); +} +catch(...) +{ + return rocsolver::exception_to_rocblas_status(); +} + +rocblas_status rocsolver_get_alg_mode(rocblas_handle handle, + const rocsolver_function func, + rocsolver_alg_mode* mode) +try +{ + return rocsolver::rocsolver_get_alg_mode_impl(handle, func, mode); +} +catch(...) 
+{ + return rocsolver::exception_to_rocblas_status(); +} +} diff --git a/library/src/include/rocsolver_handle.hpp b/library/src/include/rocsolver_handle.hpp new file mode 100644 index 000000000..527bac90c --- /dev/null +++ b/library/src/include/rocsolver_handle.hpp @@ -0,0 +1,44 @@ +/* ************************************************************************** + * Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * *************************************************************************/ + +#pragma once + +#include "common_host_helpers.hpp" +#include "rocsolver/rocsolver.h" + +ROCSOLVER_BEGIN_NAMESPACE + +struct rocsolver_handle_data_ +{ + rocblas_int checksum; + + rocsolver_alg_mode bdsqr_mode; +}; + +typedef struct rocsolver_handle_data_* rocsolver_handle_data; + +ROCSOLVER_END_NAMESPACE From ccec4f920875e9d24ec45a66e44c92e1fdb6a26e Mon Sep 17 00:00:00 2001 From: Troy Alderson <58866654+tfalders@users.noreply.github.com> Date: Thu, 7 Nov 2024 13:07:30 -0700 Subject: [PATCH 25/35] Addressed review comment --- library/src/include/rocsolver_handle.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/src/include/rocsolver_handle.hpp b/library/src/include/rocsolver_handle.hpp index 527bac90c..2e4177556 100644 --- a/library/src/include/rocsolver_handle.hpp +++ b/library/src/include/rocsolver_handle.hpp @@ -36,7 +36,7 @@ struct rocsolver_handle_data_ { rocblas_int checksum; - rocsolver_alg_mode bdsqr_mode; + rocsolver_alg_mode bdsqr_mode = rocsolver_alg_mode_gpu; }; typedef struct rocsolver_handle_data_* rocsolver_handle_data; From 39a9d83dd5c0041e0988df0f7cb1b83f7d4790dd Mon Sep 17 00:00:00 2001 From: Troy Alderson <58866654+tfalders@users.noreply.github.com> Date: Thu, 7 Nov 2024 13:27:16 -0700 Subject: [PATCH 26/35] Minor test improvement --- clients/common/auxiliary/testing_bdsqr.hpp | 8 ++++++++ clients/common/lapack/testing_gesvd.hpp | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/clients/common/auxiliary/testing_bdsqr.hpp b/clients/common/auxiliary/testing_bdsqr.hpp index 5946bb93a..058a923b4 100644 --- a/clients/common/auxiliary/testing_bdsqr.hpp +++ b/clients/common/auxiliary/testing_bdsqr.hpp @@ -479,10 +479,18 @@ void testing_bdsqr(Arguments& 
argus) rocblas_int hot_calls = argus.iters; if(argus.alg_mode) + { EXPECT_ROCBLAS_STATUS( rocsolver_set_alg_mode(handle, rocsolver_function_bdsqr, rocsolver_alg_mode_hybrid), rocblas_status_success); + rocsolver_alg_mode alg_mode; + EXPECT_ROCBLAS_STATUS(rocsolver_get_alg_mode(handle, rocsolver_function_bdsqr, &alg_mode), + rocblas_status_success); + + EXPECT_EQ(alg_mode, rocsolver_alg_mode_hybrid); + } + // check non-supported values if(uplo != rocblas_fill_upper && uplo != rocblas_fill_lower) { diff --git a/clients/common/lapack/testing_gesvd.hpp b/clients/common/lapack/testing_gesvd.hpp index 8f5745826..de7fa10d3 100644 --- a/clients/common/lapack/testing_gesvd.hpp +++ b/clients/common/lapack/testing_gesvd.hpp @@ -549,10 +549,18 @@ void testing_gesvd(Arguments& argus) rocblas_int hot_calls = argus.iters; if(argus.alg_mode) + { EXPECT_ROCBLAS_STATUS( rocsolver_set_alg_mode(handle, rocsolver_function_gesvd, rocsolver_alg_mode_hybrid), rocblas_status_success); + rocsolver_alg_mode alg_mode; + EXPECT_ROCBLAS_STATUS(rocsolver_get_alg_mode(handle, rocsolver_function_gesvd, &alg_mode), + rocblas_status_success); + + EXPECT_EQ(alg_mode, rocsolver_alg_mode_hybrid); + } + // check non-supported values if(rightv == rocblas_svect_overwrite && leftv == rocblas_svect_overwrite) { From c05e1e42e5d0d4674c855ec7e25080962c599d80 Mon Sep 17 00:00:00 2001 From: Troy Alderson <58866654+tfalders@users.noreply.github.com> Date: Thu, 7 Nov 2024 14:12:07 -0700 Subject: [PATCH 27/35] Use new lasr template --- .../auxiliary/rocauxiliary_bdsqr_hybrid.hpp | 696 +++--------------- 1 file changed, 82 insertions(+), 614 deletions(-) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp index d304c8f61..797d40140 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp @@ -39,6 +39,8 @@ #include #include +#include "rocauxiliary_lasr.hpp" + #include "hip/hip_runtime.h" #include "hip/hip_runtime_api.h" @@ -52,547 +54,6 @@ ROCSOLVER_BEGIN_NAMESPACE } #endif -#ifndef LASR_MAX_NTHREADS -#define LASR_MAX_NTHREADS 64 -#endif - -template -__host__ __device__ static void lasr_body(char const side, - char const pivot, - char const direct, - I const m, - I const n, - S const* const __restrict__ c_, - S const* const __restrict__ s_, - T* const __restrict__ A_, - I const lda, - I const tid, - I const i_inc) -{ - auto max = [](auto x, auto y) { return ((x > y) ? x : y); }; - auto min = [](auto x, auto y) { return ((x < y) ? 
x : y); }; - - auto indx2f = [](auto i, auto j, auto lda) { - assert((1 <= i)); - assert((1 <= lda)); - assert((1 <= j)); - - return (i + j * lda - (1 + lda)); - }; - - auto indx1f = [](auto i) -> int64_t { - assert((1 <= i)); - return (i - (1)); - }; - - auto c = [&](auto i) -> const S { return (c_[(i)-1]); }; - auto s = [&](auto i) -> const S { return (s_[(i)-1]); }; - auto A = [&](auto i, auto j) -> T& { return (A_[indx2f(i, j, lda)]); }; - - const S one = 1; - const S zero = 0; - - constexpr bool use_reorder = true; - - // ---------------- - // check arguments - // ---------------- - - const bool is_side_Left = (side == 'L') || (side == 'l'); - const bool is_side_Right = (side == 'R') || (side == 'r'); - - const bool is_pivot_Variable = (pivot == 'V') || (pivot == 'v'); - const bool is_pivot_Bottom = (pivot == 'B') || (pivot == 'b'); - const bool is_pivot_Top = (pivot == 'T') || (pivot == 't'); - - const bool is_direct_Forward = (direct == 'F') || (direct == 'f'); - const bool is_direct_Backward = (direct == 'B') || (direct == 'b'); - - { - const bool isok_side = is_side_Left || is_side_Right; - const bool isok_pivot = is_pivot_Variable || is_pivot_Bottom || is_pivot_Top; - const bool isok_direct = is_direct_Forward || is_direct_Backward; - - const I info = (!isok_side) ? 1 - : (!isok_pivot) ? 2 - : (!isok_direct) ? 3 - : (m < 0) ? 4 - : (n < 0) ? 5 - : (c_ == nullptr) ? 6 - : (s_ == nullptr) ? 7 - : (A_ == nullptr) ? 8 - : (lda < max(1, m)) ? 9 - : 0; - if(info != 0) - return; - }; - - { - const bool has_work = (m >= 1) && (n >= 1); - if(!has_work) - { - return; - }; - }; - - if(is_side_Left && is_pivot_Variable && is_direct_Forward) - { - // ----------------------------- - // A := P*A - // Variable pivot, the plane (k,k+1) - // P = P(z-1) * ... 
* P(2) * P(1) - // ----------------------------- - { - if constexpr(use_reorder) - { - for(I i = 1 + tid; i <= n; i += i_inc) - { - for(I j = 1; j <= (m - 1); j++) - { - const auto ctemp = c(j); - const auto stemp = s(j); - const auto temp = A(j + 1, i); - A(j + 1, i) = ctemp * temp - stemp * A(j, i); - A(j, i) = stemp * temp + ctemp * A(j, i); - } - } - } - else - { - for(I j = 1; j <= (m - 1); j++) - { - const auto ctemp = c(j); - const auto stemp = s(j); - if((ctemp != one) || (stemp != zero)) - { - for(I i = 1 + tid; i <= n; i += i_inc) - { - const auto temp = A(j + 1, i); - A(j + 1, i) = ctemp * temp - stemp * A(j, i); - A(j, i) = stemp * temp + ctemp * A(j, i); - } - } - } - } - } - - return; - }; - - if(is_side_Left && is_pivot_Variable && is_direct_Backward) - { - // ----------------------------- - // A := P*A - // Variable pivot, the plane (k,k+1) - // P = P(1)*P(2)*...*P(z-1) - // ----------------------------- - - auto const jend = (m - 1); - auto const jstart = 1; - auto const istart = 1; - auto const iend = n; - - if constexpr(use_reorder) - { - for(I i = istart + tid; i <= iend; i += i_inc) - { - for(I j = jend; j >= jstart; j--) - { - const auto ctemp = c(j); - const auto stemp = s(j); - const auto temp = A(j + 1, i); - A(j + 1, i) = ctemp * temp - stemp * A(j, i); - A(j, i) = stemp * temp + ctemp * A(j, i); - } - } - } - else - { - for(I j = jend; j >= jstart; j--) - { - const auto ctemp = c(j); - const auto stemp = s(j); - if((ctemp != one) || (stemp != zero)) - { - for(I i = istart + tid; i <= iend; i += i_inc) - { - const auto temp = A(j + 1, i); - A(j + 1, i) = ctemp * temp - stemp * A(j, i); - A(j, i) = stemp * temp + ctemp * A(j, i); - } - } - } - } - - return; - }; - - if(is_side_Left && is_pivot_Top && is_direct_Forward) - { - // ----------------------------- - // A := P*A - // Top pivot, the plane (1,k+1) - // P = P(z-1) * ... * P(2) * P(1) - // ----------------------------- - { - for(I j = 2; j <= m; j++) - { - const auto ctemp = c(j - 1); - const auto stemp = s(j - 1); - for(I i = 1 + tid; i <= n; i += i_inc) - { - const auto temp = A(j, i); - A(j, i) = ctemp * temp - stemp * A(1, i); - A(1, i) = stemp * temp + ctemp * A(1, i); - }; - }; - }; - - return; - }; - - if(is_side_Left && is_pivot_Top && is_direct_Backward) - { - // ----------------------------- - // A := P*A - // Top pivot, the plane (1,k+1) - // P = P(1)*P(2)*...*P(z-1) - // ----------------------------- - { - auto const jend = m; - auto const jstart = 2; - auto const istart = 1; - auto const iend = n; - - for(I j = jend; j >= jstart; j--) - { - const auto ctemp = c(j - 1); - const auto stemp = s(j - 1); - if((ctemp != one) || (stemp != zero)) - { - for(I i = istart + tid; i <= iend; i += i_inc) - { - const auto temp = A(j, i); - - A(j, i) = ctemp * temp - stemp * A(1, i); - A(1, i) = stemp * temp + ctemp * A(1, i); - }; - }; - }; - } - - return; - }; - - if(is_side_Left && is_pivot_Bottom && is_direct_Forward) - { - // ----------------------------- - // A := P*A - // Bottom pivot, the plane (k,z) - // P = P(z-1) * ... 
* P(2) * P(1) - // ----------------------------- - { - auto const jstart = 1; - auto const jend = (m - 1); - auto const istart = 1; - auto const iend = n; - - for(I j = jstart; j <= jend; j++) - { - const auto ctemp = c(j); - const auto stemp = s(j); - if((ctemp != one) || (stemp != zero)) - { - for(I i = istart + tid; i <= iend; i += i_inc) - { - const auto temp = A(j, i); - A(j, i) = stemp * A(m, i) + ctemp * temp; - A(m, i) = ctemp * A(m, i) - stemp * temp; - }; - }; - }; - } - - return; - }; - - if(is_side_Left && is_pivot_Bottom && is_direct_Backward) - { - // ----------------------------- - // A := P*A - // Bottom pivot, the plane (k,z) - // P = P(1)*P(2)*...*P(z-1) - // ----------------------------- - { - auto const jend = (m - 1); - auto const jstart = 1; - auto const istart = 1; - auto const iend = n; - - for(I j = jend; j >= jstart; j--) - { - const auto ctemp = c(j); - const auto stemp = s(j); - if((ctemp != one) || (stemp != zero)) - { - for(I i = istart + tid; i <= iend; i += i_inc) - { - const auto temp = A(j, i); - A(j, i) = stemp * A(m, i) + ctemp * temp; - A(m, i) = ctemp * A(m, i) - stemp * temp; - }; - }; - }; - } - - return; - }; - - if(is_side_Right && is_pivot_Variable && is_direct_Forward) - { - // ----------------------------- - // A := A*P**T - // Variable pivot, the plane (k,k+1) - // P = P(z-1) * ... * P(2) * P(1) - // ----------------------------- - - { - auto const jstart = 1; - auto const jend = (n - 1); - auto const istart = 1; - auto const iend = m; - - for(I j = jstart; j <= jend; j++) - { - const auto ctemp = c(j); - const auto stemp = s(j); - if((ctemp != one) || (stemp != zero)) - { - for(I i = istart + tid; i <= iend; i += i_inc) - { - const auto temp = A(i, j + 1); - A(i, j + 1) = ctemp * temp - stemp * A(i, j); - A(i, j) = stemp * temp + ctemp * A(i, j); - }; - }; - }; - } - - return; - }; - - if(is_side_Right && is_pivot_Variable && is_direct_Backward) - { - // ----------------------------- - // A := A*P**T - // Variable pivot, the plane (k,k+1) - // P = P(1)*P(2)*...*P(z-1) - // ----------------------------- - - { - auto const jend = (n - 1); - auto const jstart = 1; - auto const istart = 1; - auto const iend = m; - - for(I j = jend; j >= jstart; j--) - { - const auto ctemp = c(j); - const auto stemp = s(j); - if((ctemp != one) || (stemp != zero)) - { - for(I i = istart + tid; i <= iend; i += i_inc) - { - const auto temp = A(i, j + 1); - A(i, j + 1) = ctemp * temp - stemp * A(i, j); - A(i, j) = stemp * temp + ctemp * A(i, j); - }; - }; - }; - } - return; - }; - - if(is_side_Right && is_pivot_Top && is_direct_Forward) - { - // ----------------------------- - // A := A*P**T - // Top pivot, the plane (1,k+1) - // P = P(z-1) * ... 
* P(2) * P(1) - // ----------------------------- - - { - auto const jstart = 2; - auto const jend = n; - auto const istart = 1; - auto const iend = m; - - for(I j = jstart; j <= jend; j++) - { - const auto ctemp = c(j - 1); - const auto stemp = s(j - 1); - if((ctemp != one) || (stemp != zero)) - { - for(I i = istart + tid; i <= iend; i += i_inc) - { - const auto temp = A(i, j); - - A(i, j) = ctemp * temp - stemp * A(i, 1); - A(i, 1) = stemp * temp + ctemp * A(i, 1); - }; - }; - }; - } - - return; - }; - - if(is_side_Right && is_pivot_Top && is_direct_Backward) - { - // ----------------------------- - // A := A*P**T - // Top pivot, the plane (1,k+1) - // P = P(1)*P(2)*...*P(z-1) - // ----------------------------- - - { - auto const jend = n; - auto const jstart = 2; - auto const istart = 1; - auto const iend = m; - - for(I j = jend; j >= jstart; j--) - { - const auto ctemp = c(j - 1); - const auto stemp = s(j - 1); - if((ctemp != one) || (stemp != zero)) - { - for(I i = istart + tid; i <= iend; i += i_inc) - { - const auto temp = A(i, j); - - A(i, j) = ctemp * temp - stemp * A(i, 1); - A(i, 1) = stemp * temp + ctemp * A(i, 1); - }; - }; - }; - } - - return; - }; - - if(is_side_Right && is_pivot_Bottom && is_direct_Forward) - { - // ----------------------------- - // A := A*P**T - // Bottom pivot, the plane (k,z) - // P = P(z-1) * ... * P(2) * P(1) - // ----------------------------- - - { - auto const jstart = 1; - auto const jend = (n - 1); - auto const istart = 1; - auto const iend = m; - - for(I j = jstart; j <= jend; j++) - { - const auto ctemp = c(j); - const auto stemp = s(j); - if((ctemp != one) || (stemp != zero)) - { - for(I i = istart + tid; i <= iend; i += i_inc) - { - const auto temp = A(i, j); - - A(i, j) = stemp * A(i, n) + ctemp * temp; - A(i, n) = ctemp * A(i, n) - stemp * temp; - }; - }; - }; - } - - return; - }; - - if(is_side_Right && is_pivot_Bottom && is_direct_Backward) - { - // ----------------------------- - // A := A*P**T - // Bottom pivot, the plane (k,z) - // P = P(1)*P(2)*...*P(z-1) - // ----------------------------- - - { - auto const jend = (n - 1); - auto const jstart = 1; - auto const istart = 1; - auto const iend = m; - - for(I j = jend; j >= jstart; j--) - { - const auto ctemp = c(j); - const auto stemp = s(j); - if((ctemp != one) || (stemp != zero)) - { - for(I i = istart + tid; i <= iend; i += i_inc) - { - const auto temp = A(i, j); - A(i, j) = stemp * A(i, n) + ctemp * temp; - A(i, n) = ctemp * A(i, n) - stemp * temp; - }; - }; - }; - } - - return; - }; - - return; -} - -template -__global__ static void __launch_bounds__(LASR_MAX_NTHREADS) lasr_kernel(char const side, - char const pivot, - char const direct, - I const m, - I const n, - S const* const c_, - S const* const s_, - T* const A_, - I const lda) -{ - const auto nblocks = hipGridDim_x; - const auto nthreads_per_block = hipBlockDim_x; - const auto nthreads = nblocks * nthreads_per_block; - I const tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; - I const i_inc = nthreads; - - lasr_body(side, pivot, direct, m, n, c_, s_, A_, lda, tid, i_inc); -} - -template -static void lasr_template_gpu(char const side, - char const pivot, - char const direct, - I const m, - I const n, - S const* const c_, - S const* const s_, - T* const A_, - I const lda, - hipStream_t stream = 0) -{ - auto const nthreads = LASR_MAX_NTHREADS; - - bool const is_left_side = (side == 'L') || (side == 'l'); - auto const mn = (is_left_side) ? 
n : m; - - auto const nblocks = (mn - 1) / nthreads + 1; - hipLaunchKernelGGL((lasr_kernel), dim3(nblocks, 1, 1), dim3(nthreads, 1, 1), 0, stream, - side, pivot, direct, m, n, c_, s_, A_, lda); -} - template __global__ static void rot_kernel(I const n, T* const x, I const incx, T* const y, I const incy, S const c, S const s) @@ -1370,16 +831,25 @@ static void call_lasv2(T& f, T& g, T& h, T& ssmin, T& ssmax, T& snr, T& csr, T& } template -static void call_lasr(char& side, char& pivot, char& direct, I& m, I& n, S& c, S& s, T& A, I& lda) +static void call_lasr(rocblas_side& side, + rocblas_pivot& pivot, + rocblas_direct& direct, + I& m, + I& n, + S& c, + S& s, + T& A, + I& lda) { I const tid = 0; I const i_inc = 1; - lasr_body(side, pivot, direct, m, n, &c, &s, &A, lda, tid, i_inc); + lasr_body(side, pivot, direct, m, n, &c, &s, &A, lda, tid, i_inc); }; template -static void bdsqr_single_template(char uplo, +static void bdsqr_single_template(rocblas_handle handle, + char uplo, I n, I ncvt, I nru, @@ -1463,32 +933,35 @@ static void bdsqr_single_template(char uplo, auto call_scal_gpu = [=](I n, auto da, T& x, I incx) { scal_template(n, da, &x, incx, stream); }; - auto call_lasr_gpu_nocopy = [=](char const side, char const pivot, char const direct, I const m, - I const n, S& dc, S& ds, T& A, I const lda, hipStream_t stream) { - bool const is_left_side = (side == 'L') || (side == 'l'); - auto const mn = (is_left_side) ? m : n; - auto const mn_m1 = (mn - 1); - - lasr_template_gpu(side, pivot, direct, m, n, &dc, &ds, &A, lda, stream); - }; - - auto call_lasr_gpu - = [=](char const side, char const pivot, char const direct, I const m, I const n, S& c, - S& s, T& A, I const lda, S* const dwork_, hipStream_t stream) { - bool const is_left_side = (side == 'L') || (side == 'l'); + auto call_lasr_gpu_nocopy + = [=](rocblas_side const side, rocblas_pivot const pivot, rocblas_direct const direct, + I const m, I const n, S& dc, S& ds, T& A, I const lda, hipStream_t stream) { + bool const is_left_side = (side == rocblas_side_left); auto const mn = (is_left_side) ? m : n; auto const mn_m1 = (mn - 1); - S* const dc = dwork_; - S* const ds = dwork_ + mn_m1; - CHECK_HIP(hipStreamSynchronize(stream)); - CHECK_HIP(hipMemcpyAsync(dc, &c, sizeof(S) * mn_m1, hipMemcpyHostToDevice, stream)); - CHECK_HIP(hipMemcpyAsync(ds, &s, sizeof(S) * mn_m1, hipMemcpyHostToDevice, stream)); - - lasr_template_gpu(side, pivot, direct, m, n, dc, ds, &A, lda, stream); - CHECK_HIP(hipStreamSynchronize(stream)); + rocsolver_lasr_template(handle, side, pivot, direct, m, n, &dc, 0, &ds, 0, &A, + 0, lda, 0, I(1)); }; + auto call_lasr_gpu = [=](rocblas_side const side, rocblas_pivot const pivot, + rocblas_direct const direct, I const m, I const n, S& c, S& s, T& A, + I const lda, S* const dwork_, hipStream_t stream) { + bool const is_left_side = (side == rocblas_side_left); + auto const mn = (is_left_side) ? 
m : n; + auto const mn_m1 = (mn - 1); + S* const dc = dwork_; + S* const ds = dwork_ + mn_m1; + CHECK_HIP(hipStreamSynchronize(stream)); + + CHECK_HIP(hipMemcpyAsync(dc, &c, sizeof(S) * mn_m1, hipMemcpyHostToDevice, stream)); + CHECK_HIP(hipMemcpyAsync(ds, &s, sizeof(S) * mn_m1, hipMemcpyHostToDevice, stream)); + + rocsolver_lasr_template(handle, side, pivot, direct, m, n, dc, 0, ds, 0, &A, 0, lda, + 0, I(1)); + CHECK_HIP(hipStreamSynchronize(stream)); + }; + auto abs = [](auto x) { return (std::abs(x)); }; auto indx2f = [](auto i, auto j, auto ld) -> int64_t { @@ -1639,9 +1112,9 @@ static void bdsqr_single_template(char uplo, if(nru > 0) { // call_lasr( 'r', 'v', 'f', nru, n, work( 1 ), work( n ), u, ldu ); - char side = 'R'; - char pivot = 'V'; - char direct = 'F'; + rocblas_side side = rocblas_side_right; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_forward_direction; if(use_gpu) { if(use_lasr_gpu_nocopy) @@ -1663,9 +1136,9 @@ static void bdsqr_single_template(char uplo, if(ncc > 0) { // call_lasr( 'l', 'v', 'f', n, ncc, work( 1 ), work( n ), c, ldc ); - char side = 'L'; - char pivot = 'V'; - char direct = 'F'; + rocblas_side side = rocblas_side_left; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_forward_direction; if(use_gpu) { if(use_lasr_gpu_nocopy) @@ -2095,9 +1568,9 @@ static void bdsqr_single_template(char uplo, { // call_lasr( 'l', 'v', 'f', m-ll+1, ncvt, work( 1 ), work( n ), vt( // ll, 1 ), ldvt ) - char side = 'L'; - char pivot = 'V'; - char direct = 'F'; + rocblas_side side = rocblas_side_left; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_forward_direction; auto mm = m - ll + 1; if(use_gpu) { @@ -2121,9 +1594,9 @@ static void bdsqr_single_template(char uplo, { // call_lasr( 'r', 'v', 'f', nru, m-ll+1, work( nm12+1 ), work( nm13+1 // ), u( 1, ll ), ldu ) - char side = 'R'; - char pivot = 'V'; - char direct = 'F'; + rocblas_side side = rocblas_side_right; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_forward_direction; auto mm = m - ll + 1; if(use_gpu) { @@ -2148,9 +1621,9 @@ static void bdsqr_single_template(char uplo, { // call_lasr( 'l', 'v', 'f', m-ll+1, ncc, work( nm12+1 ), work( nm13+1 // ), c( ll, 1 ), ldc ) - char side = 'L'; - char pivot = 'V'; - char direct = 'F'; + rocblas_side side = rocblas_side_left; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_forward_direction; auto mm = m - ll + 1; if(use_gpu) { @@ -2262,9 +1735,9 @@ static void bdsqr_single_template(char uplo, // call_lasr( 'l', 'v', 'b', m-ll+1, ncvt, work( nm12+1 ), work( // nm13+1 // ), vt( ll, 1 ), ldvt ); - char side = 'L'; - char pivot = 'V'; - char direct = 'B'; + rocblas_side side = rocblas_side_left; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_backward_direction; auto mm = m - ll + 1; if(use_gpu) { @@ -2290,9 +1763,9 @@ static void bdsqr_single_template(char uplo, // call_lasr( 'r', 'v', 'b', nru, m-ll+1, work( 1 ), work( n ), u( 1, // ll // ), ldu ) - char side = 'R'; - char pivot = 'V'; - char direct = 'B'; + rocblas_side side = rocblas_side_right; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_backward_direction; auto mm = m - ll + 1; if(use_gpu) { @@ -2317,9 +1790,9 @@ static void bdsqr_single_template(char uplo, // call_lasr( 'l', 'v', 'b', m-ll+1, ncc, work( 1 ), work( n ), c( ll, // 1 // ), ldc ) - char side = 'L'; - char pivot = 'V'; - char 
direct = 'B'; + rocblas_side side = rocblas_side_left; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_backward_direction; auto mm = m - ll + 1; if(use_gpu) { @@ -2439,9 +1912,9 @@ static void bdsqr_single_template(char uplo, { // call_lasr( 'l', 'v', 'f', m-ll+1, ncvt, work( 1 ), work( n ), vt( // ll, 1 ), ldvt ) - char side = 'L'; - char pivot = 'V'; - char direct = 'F'; + rocblas_side side = rocblas_side_left; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_forward_direction; auto mm = m - ll + 1; if(use_gpu) { @@ -2466,9 +1939,9 @@ static void bdsqr_single_template(char uplo, { // call_lasr( 'r', 'v', 'f', nru, m-ll+1, work( nm12+1 ), work( nm13+1 // ), u( 1, ll ), ldu ) - char side = 'R'; - char pivot = 'V'; - char direct = 'F'; + rocblas_side side = rocblas_side_right; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_forward_direction; auto mm = m - ll + 1; if(use_gpu) { @@ -2493,9 +1966,9 @@ static void bdsqr_single_template(char uplo, { // call_lasr( 'l', 'v', 'f', m-ll+1, ncc, work( nm12+1 ), work( nm13+1 // ), c( ll, 1 ), ldc ) - char side = 'L'; - char pivot = 'V'; - char direct = 'F'; + rocblas_side side = rocblas_side_left; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_forward_direction; auto mm = m - ll + 1; if(use_gpu) { @@ -2616,9 +2089,9 @@ static void bdsqr_single_template(char uplo, // call_lasr( 'l', 'v', 'b', m-ll+1, ncvt, work( nm12+1 ), work( // nm13+1 // ), vt( ll, 1 ), ldvt ) - char side = 'L'; - char pivot = 'V'; - char direct = 'B'; + rocblas_side side = rocblas_side_left; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_backward_direction; auto mm = m - ll + 1; if(use_gpu) { @@ -2644,9 +2117,9 @@ static void bdsqr_single_template(char uplo, // call_lasr( 'r', 'v', 'b', nru, m-ll+1, work( 1 ), work( n ), u( 1, // ll // ), ldu ) - char side = 'R'; - char pivot = 'V'; - char direct = 'B'; + rocblas_side side = rocblas_side_right; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_backward_direction; auto mm = m - ll + 1; if(use_gpu) { @@ -2671,9 +2144,9 @@ static void bdsqr_single_template(char uplo, // call_lasr( 'l', 'v', 'b', m-ll+1, ncc, work( 1 ), work( n ), c( ll, // 1 // ), ldc ) - char side = 'L'; - char pivot = 'V'; - char direct = 'B'; + rocblas_side side = rocblas_side_left; + rocblas_pivot pivot = rocblas_pivot_variable; + rocblas_direct direct = rocblas_backward_direction; auto mm = m - ll + 1; if(use_gpu) { @@ -3045,13 +2518,8 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, I ncvt = nv; bool const values_only = (ncvt == 0) && (nru == 0) && (ncc == 0); - bdsqr_single_template(uplo, n, ncvt, nru, ncc, - - d_, e_, - - vt_, ldvt, u_, ldu, c_, ldc, - - work_, info, dwork_, stream); + bdsqr_single_template(handle, uplo, n, ncvt, nru, ncc, d_, e_, vt_, ldvt, u_, ldu, + c_, ldc, work_, info, dwork_, stream); if(info == 0) { From c3af154ba26102f70b83435a98e2601e4b99d7cb Mon Sep 17 00:00:00 2001 From: Troy Alderson <58866654+tfalders@users.noreply.github.com> Date: Tue, 12 Nov 2024 11:43:06 -0700 Subject: [PATCH 28/35] Minor docs fix --- library/include/rocsolver/rocsolver-extra-types.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/library/include/rocsolver/rocsolver-extra-types.h b/library/include/rocsolver/rocsolver-extra-types.h index ee2ceb72c..3335f8f3d 100644 --- 
a/library/include/rocsolver/rocsolver-extra-types.h +++ b/library/include/rocsolver/rocsolver-extra-types.h @@ -164,6 +164,8 @@ typedef enum rocsolver_rfinfo_mode_ = 272, /**< To work with Cholesky factorization (for symmetric positive definite sparse matrices). */ } rocsolver_rfinfo_mode; +/*! \brief Used to specify the planes on which a sequence of Givens rotations is applied. + ********************************************************************************/ typedef enum rocblas_pivot_ { rocblas_pivot_variable = 281, /**< The i-th rotation is applied on plane (i,i+1). */ @@ -187,7 +189,5 @@ typedef enum rocsolver_function_ rocsolver_function_bdsqr = 401, rocsolver_function_gesvd = 402, } rocsolver_function; -/*! \brief Used to specify the planes on which a sequence of Givens rotations is applied. - ********************************************************************************/ #endif /* ROCSOLVER_EXTRA_TYPES_H */ From f4bbed3ba448be33cffa9dc3e120771e1cfcbd5d Mon Sep 17 00:00:00 2001 From: Troy Alderson <58866654+tfalders@users.noreply.github.com> Date: Tue, 12 Nov 2024 15:02:22 -0700 Subject: [PATCH 29/35] Fix benchmark argument consumption error --- clients/common/misc/rocsolver_arguments.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/clients/common/misc/rocsolver_arguments.hpp b/clients/common/misc/rocsolver_arguments.hpp index ce6bab452..5e8f14f9b 100644 --- a/clients/common/misc/rocsolver_arguments.hpp +++ b/clients/common/misc/rocsolver_arguments.hpp @@ -114,6 +114,7 @@ class Arguments : private std::map to_consume.erase("batch_count"); to_consume.erase("verify"); to_consume.erase("iters"); + to_consume.erase("alg_mode"); to_consume.erase("mem_query"); to_consume.erase("profile"); to_consume.erase("profile_kernels"); From dbb759ae78bb17392f8493ea806851713482e80c Mon Sep 17 00:00:00 2001 From: Juan Zuniga-Anaya <50754207+jzuniga-amd@users.noreply.github.com> Date: Thu, 14 Nov 2024 10:34:21 -0700 Subject: [PATCH 30/35] review corrections/suggestions 1 (#15) --- clients/benchmarks/client.cpp | 4 ++-- clients/common/misc/rocsolver_arguments.hpp | 1 - docs/reference/auxiliary.rst | 2 +- docs/reference/helpers.rst | 13 +++++++++++-- docs/reference/lapacklike.rst | 2 +- docs/reference/tuning.rst | 14 +++++++++----- docs/reference/types.rst | 4 ++-- docs/sphinx/_toc.yml.in | 2 +- .../include/rocsolver/rocsolver-functions.h | 18 +++++++++--------- .../auxiliary/rocauxiliary_bdsqr_hybrid.hpp | 2 +- 10 files changed, 37 insertions(+), 25 deletions(-) diff --git a/clients/benchmarks/client.cpp b/clients/benchmarks/client.cpp index 378bd99d1..16a244199 100644 --- a/clients/benchmarks/client.cpp +++ b/clients/benchmarks/client.cpp @@ -127,9 +127,9 @@ try ("alg_mode", value(&argus.alg_mode)->default_value(0), - "Select different algorithm mode? 0 = GPU-only, 1 = Hybrid\n" + "0 = GPU-only, 1 = Hybrid\n" " This will change how the algorithm operates.\n" - " Only applicable to functions with hybrid support." 
+ " Only applicable to functions with hybrid support.\n" " ") ("mem_query", diff --git a/clients/common/misc/rocsolver_arguments.hpp b/clients/common/misc/rocsolver_arguments.hpp index 5e8f14f9b..a8ffd9e73 100644 --- a/clients/common/misc/rocsolver_arguments.hpp +++ b/clients/common/misc/rocsolver_arguments.hpp @@ -33,7 +33,6 @@ #include #include #include -#include #include "program_options.hpp" diff --git a/docs/reference/auxiliary.rst b/docs/reference/auxiliary.rst index 83503e7e2..8bebbf116 100644 --- a/docs/reference/auxiliary.rst +++ b/docs/reference/auxiliary.rst @@ -58,7 +58,7 @@ rocsolver_lacgv() .. doxygenfunction:: rocsolver_zlacgv_64 :outline: .. doxygenfunction:: rocsolver_clacgv_64 - :outline + :outline: .. doxygenfunction:: rocsolver_zlacgv :outline: .. doxygenfunction:: rocsolver_clacgv diff --git a/docs/reference/helpers.rst b/docs/reference/helpers.rst index b643b6807..f8051f03f 100644 --- a/docs/reference/helpers.rst +++ b/docs/reference/helpers.rst @@ -8,7 +8,16 @@ rocSOLVER Library and Logging Functions ***************************************************** +These are helper functions that retrieve information and control some functions of the library. +The helper functions are divided into the following categories: +* :ref:`lib_info`. These functions return information about the library version. +* :ref:`algo_select`. Functions to select different algorithm modes of certain APIs. +* :ref:`api_logging`. These functions control the :ref:`logging-label` capabilities. + + + +.. _lib_info: Library information =============================== @@ -27,6 +36,8 @@ rocsolver_get_version_string_size() +.. _algo_select: + Algorithm selection =============================== @@ -49,8 +60,6 @@ rocsolver_get_alg_mode() Logging functions =============================== -These functions control rocSOLVER's :ref:`logging-label` capabilities. - .. contents:: List of logging functions :local: :backlinks: top diff --git a/docs/reference/lapacklike.rst b/docs/reference/lapacklike.rst index b6b2823c5..68b6881f3 100644 --- a/docs/reference/lapacklike.rst +++ b/docs/reference/lapacklike.rst @@ -547,7 +547,7 @@ rocsolver_syevdx_strided_batched() rocsolver_heevdx() --------------------------------------------------- .. doxygenfunction:: rocsolver_zheevdx - :outline + :outline: .. doxygenfunction:: rocsolver_cheevdx rocsolver_heevdx_batched() diff --git a/docs/reference/tuning.rst b/docs/reference/tuning.rst index 82cba73d2..8b59e356a 100644 --- a/docs/reference/tuning.rst +++ b/docs/reference/tuning.rst @@ -180,14 +180,18 @@ GEBRD_GEBD2_SWITCHSIZE bdsqr function ================== -The Singular Value Decomposition of a bidiagonal matrix could be sped up by splitting the matrix into diagonal blocks -and processing those blocks in parallel. +The Singular Value Decomposition of a bidiagonal matrix could be executed with one or multiple thread blocks, and it is +a blocking API that requires synchronization with the host. -BDSQR_SPLIT_GROUPS +BDSQR_SWITCH_SIZE ------------------- -.. doxygendefine:: BDSQR_SPLIT_GROUPS +.. doxygendefine:: BDSQR_SWITCH_SIZE -(As of the current rocSOLVER release, this constant has not been tuned for any specific cases.) +BDSQR_ITERS_PER_SYNC +---------------------- +.. doxygendefine:: BDSQR_ITERS_PER_SYNC + +(As of the current rocSOLVER release, this constants have not been tuned for any specific cases.) 
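For reference, a minimal host-side sketch of how a caller is expected to drive the algorithm-mode API documented in the helpers.rst and rocsolver-functions.h hunks of this series. It assumes only the rocsolver_set_alg_mode / rocsolver_get_alg_mode prototypes and the rocsolver_function_* / rocsolver_alg_mode_* enums added here, plus the standard rocBLAS handle calls; the error handling is illustrative rather than prescriptive.

#include <rocsolver/rocsolver.h> // pulls in the rocBLAS handle and status types

int main()
{
    rocblas_handle handle;
    if(rocblas_create_handle(&handle) != rocblas_status_success)
        return 1;

    // Opt this handle into the hybrid (CPU+GPU) path for bdsqr; in this series
    // gesvd and bdsqr share the same mode, so rocsolver_function_gesvd works too.
    rocblas_status st
        = rocsolver_set_alg_mode(handle, rocsolver_function_bdsqr, rocsolver_alg_mode_hybrid);

    // Read the mode back; on a handle where no mode was ever set, the getter
    // reports the default rocsolver_alg_mode_gpu.
    rocsolver_alg_mode mode;
    if(st == rocblas_status_success)
        st = rocsolver_get_alg_mode(handle, rocsolver_function_bdsqr, &mode);

    // ... call rocsolver_<T>bdsqr / rocsolver_<T>gesvd with this handle as usual ...

    rocblas_destroy_handle(handle);
    return (st == rocblas_status_success) ? 0 : 1;
}

Because the selection lives in the handle's data pointer rather than in the public bdsqr/gesvd signatures, existing callers keep compiling unchanged and simply stay on the default GPU-only path unless they opt in.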
diff --git a/docs/reference/types.rst b/docs/reference/types.rst index 4d881cf18..0fb8d716c 100644 --- a/docs/reference/types.rst +++ b/docs/reference/types.rst @@ -73,8 +73,8 @@ rocsolver_rfinfo_mode rocsolver_alg_mode ------------------------ -.. doxygentypedef:: rocsolver_alg_mode +.. doxygenenum:: rocsolver_alg_mode rocsolver_function ------------------------ -.. doxygentypedef:: rocsolver_function +.. doxygenenum:: rocsolver_function diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index e5d3606ce..6ad32dfe3 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -23,7 +23,7 @@ subtrees: - file: reference/lapack.rst - file: reference/lapacklike.rst - file: reference/refact.rst - - file: reference/logging.rst + - file: reference/helpers.rst - file: reference/tuning.rst - file: reference/deprecated.rst - file: license.rst diff --git a/library/include/rocsolver/rocsolver-functions.h b/library/include/rocsolver/rocsolver-functions.h index 0c7bfd417..3f340ea70 100644 --- a/library/include/rocsolver/rocsolver-functions.h +++ b/library/include/rocsolver/rocsolver-functions.h @@ -147,7 +147,7 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_log_flush_profile(void); * =========================================================================== */ -/*! \brief SET_ALG_MODE sets the algorithm mode to be used by a specific function. +/*! \brief SET_ALG_MODE sets the algorithm mode to be used by the specified function. @param[in] handle rocblas_handle. @@ -163,17 +163,17 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_set_alg_mode(rocblas_handle handle, const rocsolver_function func, const rocsolver_alg_mode mode); -/*! \brief GET_ALG_MODE gets the algorithm mode selected for use by a specific function. +/*! \brief GET_ALG_MODE gets the algorithm mode being used by the specified function. @param[in] handle rocblas_handle. @param[in] func #rocsolver_function. - A function. + The specified function. @param[out] mode pointer to #rocsolver_alg_mode. - On exit, the value is overwritten by the algorithm mode that will - be used by the specified function. + On exit, the value is overwritten by the algorithm mode used + by the specified function. *************************************************************************/ ROCSOLVER_EXPORT rocblas_status rocsolver_get_alg_mode(rocblas_handle handle, @@ -3871,7 +3871,7 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_zunmtr(rocblas_handle handle, rocblas_handle. \note - A hybrid (CPU+GPU) approach is available for BDSQR. Use \ref rocsolver_set_alg_mode "SET_ALG_MODE" + A hybrid (CPU+GPU) approach is available for BDSQR. Use \ref rocsolver_set_alg_mode to enable it. @param[in] @@ -12613,7 +12613,7 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_zpotri_strided_batched(rocblas_handle within the rocblas_handle. \note - A hybrid (CPU+GPU) approach is available for GESVD. Use \ref rocsolver_set_alg_mode "SET_ALG_MODE" + A hybrid (CPU+GPU) approach is available for GESVD. Use \ref rocsolver_set_alg_mode to enable it. @param[in] @@ -12794,7 +12794,7 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_zgesvd(rocblas_handle handle, within the rocblas_handle. \note - A hybrid (CPU+GPU) approach is available for GESVD_BATCHED. Use \ref rocsolver_set_alg_mode "SET_ALG_MODE" + A hybrid (CPU+GPU) approach is available for GESVD_BATCHED. Use \ref rocsolver_set_alg_mode to enable it. @param[in] @@ -13016,7 +13016,7 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_zgesvd_batched(rocblas_handle handle, within the rocblas_handle. 
\note - A hybrid (CPU+GPU) approach is available for GESVD_STRIDED_BATCHED. Use \ref rocsolver_set_alg_mode "SET_ALG_MODE" + A hybrid (CPU+GPU) approach is available for GESVD_STRIDED_BATCHED. Use \ref rocsolver_set_alg_mode to enable it. @param[in] diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp index 797d40140..180a4fced 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp @@ -5,7 +5,7 @@ * Univ. of Tennessee, Univ. of California Berkeley, * Univ. of Colorado Denver and NAG Ltd.. * June 2017 - * Copyright (C) 2020-2024 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions From 4c6e9e013c3aaabc488f3627cb0fc8936eace6e2 Mon Sep 17 00:00:00 2001 From: Juan Zuniga-Anaya <50754207+jzuniga-amd@users.noreply.github.com> Date: Thu, 14 Nov 2024 22:48:58 -0700 Subject: [PATCH 31/35] code cleaning and rearrangement (#16) * review corrections/suggestions 1 * code cleaning and rearrangement * address review comments --- library/src/auxiliary/rocauxiliary_bdsqr.hpp | 16 +- .../auxiliary/rocauxiliary_bdsqr_hybrid.hpp | 1409 +++-------------- .../src/include/lapack_device_functions.hpp | 92 ++ library/src/include/lapack_host_functions.hpp | 596 +++++++ library/src/include/lib_device_helpers.hpp | 49 + library/src/include/lib_host_helpers.hpp | 76 + 6 files changed, 1032 insertions(+), 1206 deletions(-) create mode 100644 library/src/include/lapack_host_functions.hpp diff --git a/library/src/auxiliary/rocauxiliary_bdsqr.hpp b/library/src/auxiliary/rocauxiliary_bdsqr.hpp index 89397dbc8..c19ae4f9b 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr.hpp @@ -1254,15 +1254,15 @@ rocblas_status rocsolver_bdsqr_template(rocblas_handle handle, ROCSOLVER_LAUNCH_KERNEL((bdsqr_init), gridBasic, threadsBasic, 0, stream, n, D, strideD, E, strideE, info, maxiter, sfm, tol, splits_map, work, strideW, completed); - if(alg_mode == rocsolver_alg_mode_hybrid) + if(n > 1) { - ROCBLAS_CHECK(rocsolver_bdsqr_host_batch_template( - handle, uplo, n, nv, nu, nc, D, strideD, E, strideE, V, shiftV, ldv, strideV, U, shiftU, - ldu, strideU, C, shiftC, ldc, strideC, info, batch_count, splits_map, work)); - } - else - { - if(n > 1) + if(alg_mode == rocsolver_alg_mode_hybrid) + { + ROCBLAS_CHECK(rocsolver_bdsqr_host_batch_template( + handle, uplo, n, nv, nu, nc, D, strideD, E, strideE, V, shiftV, ldv, strideV, U, + shiftU, ldu, strideU, C, shiftC, ldc, strideC, info, batch_count, splits_map, work)); + } + else { // rotate to upper bidiagonal if necessary if(uplo == rocblas_fill_lower) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp index 180a4fced..56870cb44 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp @@ -33,803 +33,11 @@ #pragma once -#include -#include -#include -#include -#include - +#include "lapack_host_functions.hpp" #include "rocauxiliary_lasr.hpp" -#include "hip/hip_runtime.h" -#include "hip/hip_runtime_api.h" - ROCSOLVER_BEGIN_NAMESPACE -#ifndef CHECK_HIP -#define CHECK_HIP(fcn) \ - { \ - hipError_t const istat = (fcn); \ - assert(istat == hipSuccess); \ - } -#endif - -template 
-__global__ static void - rot_kernel(I const n, T* const x, I const incx, T* const y, I const incy, S const c, S const s) -{ - if(n <= 0) - return; - - I const i_start = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; - I const i_inc = hipBlockDim_x * hipGridDim_x; - - if((incx == 1) && (incy == 1)) - { - // ------------ - // special case - // ------------ - for(I i = i_start; i < n; i += i_inc) - { - auto const temp = c * x[i] + s * y[i]; - y[i] = c * y[i] - s * x[i]; - x[i] = temp; - } - } - else - { - // --------------------------- - // code for unequal increments - // --------------------------- - - for(auto i = i_start; i < n; i += i_inc) - { - auto const ix = 0 + i * static_cast(incx); - auto const iy = 0 + i * static_cast(incy); - auto const temp = c * x[ix] + s * y[iy]; - y[iy] = c * y[iy] - s * x[ix]; - x[ix] = temp; - } - } -} - -template -static void - rot_template(I const n, T* x, I const incx, T* y, I const incy, S const c, S const s, hipStream_t stream) -{ - auto nthreads = warpSize * 2; - auto nblocks = (n - 1) / nthreads + 1; - - hipLaunchKernelGGL((rot_kernel), dim3(nblocks, 1, 1), dim3(nthreads, 1, 1), 0, stream, - n, x, incx, y, incy, c, s); -} - -template -__global__ static void scal_kernel(I const n, S const da, T* const x, I const incx) -{ - if(n <= 0) - return; - - I const i_start = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; - I const i_inc = hipBlockDim_x * hipGridDim_x; - - S const zero = 0; - bool const is_da_zero = (da == zero); - if(incx == 1) - { - for(I i = i_start; i < n; i += i_inc) - { - x[i] = (is_da_zero) ? zero : da * x[i]; - } - } - else - { - // --------------------------- - // code for non-unit increments - // --------------------------- - - for(I i = i_start; i < n; i += i_inc) - { - auto const ix = 0 + i * static_cast(incx); - x[ix] = (is_da_zero) ? zero : da * x[ix]; - } - } -} - -template -static void scal_template(I const n, S const da, T* const x, I const incx, hipStream_t stream) -{ - auto nthreads = warpSize * 2; - auto nblocks = (n - 1) / nthreads + 1; - - hipLaunchKernelGGL((scal_kernel), dim3(nblocks, 1, 1), dim3(nthreads, 1, 1), 0, stream, - n, da, x, incx); -} - -template -__global__ static void swap_kernel(I const n, T* const x, I const incx, T* const y, I const incy) -{ - if(n <= 0) - return; - - I const i_start = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; - I const i_inc = hipBlockDim_x * hipGridDim_x; - - if((incx == 1) && (incy == 1)) - { - // ------------ - // special case - // ------------ - for(I i = i_start; i < n; i += i_inc) - { - auto const temp = y[i]; - y[i] = x[i]; - x[i] = temp; - } - } - else - { - // --------------------------- - // code for unequal increments - // --------------------------- - - for(I i = i_start; i < n; i += i_inc) - { - auto const ix = 0 + i * static_cast(incx); - auto const iy = 0 + i * static_cast(incy); - - auto const temp = y[iy]; - y[iy] = x[ix]; - x[ix] = temp; - } - } -} - -template -static void swap_template(I const n, T* x, I const incx, T* y, I const incy, hipStream_t stream) -{ - auto nthreads = warpSize * 2; - auto nblocks = (n - 1) / nthreads + 1; - - hipLaunchKernelGGL((swap_kernel), dim3(nblocks, 1, 1), dim3(nthreads, 1, 1), 0, stream, - n, x, incx, y, incy); -} - -static void call_lamch(char& cmach, double& eps) -{ - eps = ((cmach == 'E') || (cmach == 'e')) ? std::numeric_limits::epsilon() - : ((cmach == 'S') || (cmach == 's')) ? std::numeric_limits::min() - : ((cmach == 'B') || (cmach == 's')) ? 
FLT_RADIX - : std::numeric_limits::min(); -} - -static void call_lamch(char& cmach, float& eps) -{ - eps = ((cmach == 'E') || (cmach == 'e')) ? std::numeric_limits::epsilon() - : ((cmach == 'S') || (cmach == 's')) ? std::numeric_limits::min() - : ((cmach == 'B') || (cmach == 's')) ? FLT_RADIX - : std::numeric_limits::min(); -} - -template -static void call_swap(I& n, T& x_in, I& incx, T& y_in, I& incy) -{ - T* const x = &(x_in); - T* const y = &(y_in); - for(I i = 0; i < n; i++) - { - I const ix = i * incx; - I const iy = i * incy; - - T const temp = x[ix]; - x[ix] = y[iy]; - y[iy] = temp; - } -} - -template -static void call_las2(T& f, T& g, T& h, T& ssmin, T& ssmax) -{ - T const zero = 0; - T const one = 1; - T const two = 2; - - T as, at, au, c, fa, fhmn, fhmx, ga, ha; - - auto abs = [](auto x) { return (std::abs(x)); }; - auto min = [](auto x, auto y) { return ((x < y) ? x : y); }; - auto max = [](auto x, auto y) { return ((x > y) ? x : y); }; - auto sqrt = [](auto x) { return (std::sqrt(x)); }; - auto square = [](auto x) { return (x * x); }; - - fa = abs(f); - ga = abs(g); - ha = abs(h); - fhmn = min(fa, ha); - fhmx = max(fa, ha); - if(fhmn == zero) - { - ssmin = zero; - if(fhmx == zero) - { - ssmax = ga; - } - else - { - // ssmax = max( fhmx, ga )*sqrt( one+ ( min( fhmx, ga ) / max( fhmx, ga ) )**2 ); - ssmax = max(fhmx, ga) * sqrt(one + square(min(fhmx, ga) / max(fhmx, ga))); - } - } - else - { - if(ga < fhmx) - { - as = one + fhmn / fhmx; - at = (fhmx - fhmn) / fhmx; - au = square(ga / fhmx); - c = two / (sqrt(as * as + au) + sqrt(at * at + au)); - ssmin = fhmn * c; - ssmax = fhmx / c; - } - else - { - au = fhmx / ga; - if(au == zero) - { - // - // avoid possible harmful underflow if exponent range - // asymmetric (true ssmin may not underflow even if - // au underflows) - // - ssmin = (fhmn * fhmx) / ga; - ssmax = ga; - } - else - { - as = one + fhmn / fhmx; - at = (fhmx - fhmn) / fhmx; - // c = one / ( sqrt( one+( as*au )**2 )+ sqrt( one+( at*au )**2 ) ); - c = one / (sqrt(one + square(as * au)) + sqrt(one + square(at * au))); - ssmin = (fhmn * c) * au; - ssmin = ssmin + ssmin; - ssmax = ga / (c + c); - } - } - } -} - -static float real_part(float z) -{ - return (z); -}; -static float real_part(std::complex z) -{ - return (z.real()); -}; -static float real_part(rocblas_complex_num z) -{ - return (z.real()); -}; - -static double real_part(double z) -{ - return (z); -}; -static double real_part(std::complex z) -{ - return (z.real()); -}; -static double real_part(rocblas_complex_num z) -{ - return (z.real()); -}; - -static float imag_part(float z) -{ - return (0); -}; -static float imag_part(std::complex z) -{ - return (z.imag()); -}; -static float imag_part(rocblas_complex_num z) -{ - return (z.imag()); -}; - -static double imag_part(double z) -{ - return (0); -}; -static double imag_part(std::complex z) -{ - return (z.imag()); -}; -static double imag_part(rocblas_complex_num z) -{ - return (z.imag()); -}; - -template -static void call_lartg(T& f, T& g, S& cs, T& sn, T& r) -{ - // ------------------------------------------------------ - // lartg generates a plane rotation so that - // [ cs sn ] * [ f ] = [ r ] - // [ -sn cs ] [ g ] [ 0 ] - // - // where cs * cs + abs(sn)*abs(sn) == 1 - // if g == 0, then cs == 1, sn == 0 - // if f == 0, then cs = 0, sn is chosen so that r is real - // ------------------------------------------------------ - - auto Not = [](bool x) { return (!x); }; - auto abs = [](auto x) { return (std::abs(x)); }; - auto dble = [](auto z) { return 
(static_cast(real_part(z))); }; - auto dimag = [](auto z) { return (static_cast(imag_part(z))); }; - auto log = [](auto x) { return (std::log(x)); }; - auto sqrt = [](auto x) { return (std::sqrt(x)); }; - auto max = [](auto x, auto y) { return ((x > y) ? x : y); }; - auto disnan = [](auto x) -> bool { return (isnan(x)); }; - auto dcmplx = [](auto x, auto y) -> T { - bool constexpr is_complex_type - = !(std::is_same::value || std::is_same::value); - - if constexpr(is_complex_type) - { - return (T(x, y)); - } - else - { - return (T(x)); - }; - }; - auto dconjg = [&](auto z) { return (dcmplx(dble(z), -dimag(z))); }; - - auto square = [](auto x) { return (x * x); }; - - auto abs1 = [&](auto ff) { return (max(abs(dble(ff)), abs(dimag(ff)))); }; - auto abssq = [&](auto ff) { return (square(dble(ff)) + square(dimag(ff))); }; - - // ----------------------------------------- - // compute sqrt( x * x + y * y ) - // without unnecessary overflow or underflow - // ----------------------------------------- - auto dlapy2 = [&](auto x, auto y) { - auto const one = 1; - auto const zero = 0; - - auto ddlapy2 = x; - bool const x_is_nan = disnan(x); - bool const y_is_nan = disnan(y); - if(x_is_nan) - ddlapy2 = x; - if(y_is_nan) - ddlapy2 = y; - - if(Not(x_is_nan || y_is_nan)) - { - auto const xabs = abs(x); - auto const yabs = abs(y); - auto const w = max(xabs, yabs); - auto const z = min(xabs, yabs); - if(z == zero) - { - ddlapy2 = w; - } - else - { - ddlapy2 = w * sqrt(one + square(z / w)); - } - } - return (ddlapy2); - }; - - char cmach = 'E'; - S const zero = 0; - S const one = 1; - S const two = 2; - T const czero = 0; - - bool has_work; - bool first; - int count, i; - S d, di, dr, eps, f2, f2s, g2, g2s, safmin; - S safmn2, safmx2, scale; - T ff, fs, gs; - - // safmin = dlamch( 's' ) - cmach = 'S'; - call_lamch(cmach, safmin); - - // eps = dlamch( 'e' ) - cmach = 'E'; - call_lamch(cmach, eps); - - // safmn2 = dlamch( 'b' )**int( log( safmin / eps ) / log( dlamch( 'b' ) ) / two ) - cmach = 'B'; - S radix = 2; - call_lamch(cmach, radix); - - int const npow = (log(safmin / eps) / log(radix) / two); - safmn2 = std::pow(radix, npow); - safmx2 = one / safmn2; - scale = max(abs1(f), abs1(g)); - fs = f; - gs = g; - count = 0; - - if(scale >= safmx2) - { - L10: - do - { - count = count + 1; - fs = fs * safmn2; - gs = gs * safmn2; - scale = scale * safmn2; - // if( (scale >= safmx2) && (count < 20) ) go to L10 - has_work = ((scale >= safmx2) && (count < 20)); - } while(has_work); - } - else - { - if(scale <= safmn2) - { - if((g == czero) || disnan(abs(g))) - { - cs = one; - sn = czero; - r = f; - return; - } - L20: - do - { - count = count - 1; - fs = fs * safmx2; - gs = gs * safmx2; - scale = scale * safmx2; - // if( scale <= safmn2 ) goto L20; - has_work = (scale <= safmn2); - } while(has_work); - } - f2 = abssq(fs); - g2 = abssq(gs); - if(f2 <= max(g2, one) * safmin) - { - // - // this is a rare case: f is very small. - // - if(f == czero) - { - cs = zero; - r = dlapy2(dble(g), dimag(g)); - // do complex/real division explicitly with two real divisions - d = dlapy2(dble(gs), dimag(gs)); - sn = dcmplx(dble(gs) / d, -dimag(gs) / d); - return; - } - f2s = dlapy2(dble(fs), dimag(fs)); - // g2 and g2s are accurate - // g2 is at least safmin, and g2s is at least safmn2 - g2s = sqrt(g2); - // error in cs from underflow in f2s is at most - // unfl / safmn2 < sqrt(unfl*eps) .lt. 
eps - // if max(g2,one)=g2, then f2 < g2*safmin, - // and so cs < sqrt(safmin) - // if max(g2,one)=one, then f2 < safmin - // and so cs < sqrt(safmin)/safmn2 = sqrt(eps) - // therefore, cs = f2s/g2s / sqrt( 1 + (f2s/g2s)**2 ) = f2s/g2s - cs = f2s / g2s; - // make sure abs(ff) = 1 - // do complex/real division explicitly with 2 real divisions - if(abs1(f) > one) - { - d = dlapy2(dble(f), dimag(f)); - ff = dcmplx(dble(f) / d, dimag(f) / d); - } - else - { - dr = safmx2 * dble(f); - di = safmx2 * dimag(f); - d = dlapy2(dr, di); - ff = dcmplx(dr / d, di / d); - } - sn = ff * dcmplx(dble(gs) / g2s, -dimag(gs) / g2s); - r = cs * f + sn * g; - } - else - { - // - // this is the most common case. - // neither f2 nor f2/g2 are less than safmin - // f2s cannot overflow, and it is accurate - // - f2s = sqrt(one + g2 / f2); - // do the f2s(real)*fs(complex) multiply with two real multiplies - r = dcmplx(f2s * dble(fs), f2s * dimag(fs)); - cs = one / f2s; - d = f2 + g2; - // do complex/real division explicitly with two real divisions - sn = dcmplx(dble(r) / d, dimag(r) / d); - sn = sn * dconjg(gs); - if(count != 0) - { - if(count > 0) - { - for(i = 1; i <= count; i++) - { - r = r * safmx2; - }; - } - else - { - for(i = 1; i <= -count; i++) - { - r = r * safmn2; - } - } - } - } - } -} - -template -static void call_scal(I& n, S& a, T& x_in, I& incx) -{ - bool const is_zero = (a == 0); - T* const x = &x_in; - for(I i = 0; i < n; i++) - { - auto const ip = i * incx; - if(is_zero) - { - x[ip] = 0; - } - else - { - x[ip] *= a; - } - }; -} - -template -static void call_rot(I& n, T& x_in, I& incx, T& y_in, I& incy, S& c, S& s) -{ - T* const x = &(x_in); - T* const y = &(y_in); - - for(I i = 0; i < n; i++) - { - auto const ix = i * incx; - auto const iy = i * incy; - - auto const temp = c * x[ix] + s * y[iy]; - y[iy] = c * y[iy] - s * x[ix]; - x[ix] = temp; - } -} - -// -------------------------------------------------------- -// lasv2 computes the singular value decomposition of a 2 x 2 -// triangular matrix -// [ F G ] -// [ 0 H ] -// -// on return, -// abs(ssmax) is the larger singular value, -// abs(ssmin) is the smaller singular value, -// (csl,snl) and (csr,snr) are the left and right -// singular vectors for abs(ssmax) -// -// [ csl snl] [ F G ] [ csr -snr] = [ ssmax 0 ] -// [-snl csl] [ 0 H ] [ snr csr] [ 0 ssmin ] -// -------------------------------------------------------- -template -static void call_lasv2(T& f, T& g, T& h, T& ssmin, T& ssmax, T& snr, T& csr, T& snl, T& csl) -{ - T const zero = 0; - T const one = 1; - T const two = 2; - T const four = 4; - T const half = one / two; - - bool gasmal; - bool swap; - int pmax; - char cmach; - - T a, clt, crt, d, fa, ft, ga, gt, ha, ht, l, m; - T mm, r, s, slt, srt, t, temp, tsign, tt; - T macheps; - - auto abs = [](auto x) { return (std::abs(x)); }; - auto sqrt = [](auto x) { return (std::sqrt(x)); }; - auto sign = [](auto a, auto b) { - auto const abs_a = std::abs(a); - return ((b >= 0) ? 
abs_a : -abs_a); - }; - - ft = f; - fa = abs(ft); - ht = h; - ha = abs(h); - // - // pmax points to the maximum absolute element of matrix - // pmax = 1 if f largest in absolute values - // pmax = 2 if g largest in absolute values - // pmax = 3 if h largest in absolute values - // - pmax = 1; - swap = (ha > fa); - if(swap) - { - pmax = 3; - temp = ft; - ft = ht; - ht = temp; - temp = fa; - fa = ha; - ha = temp; - // - // now fa >= ha - // - } - gt = g; - ga = abs(gt); - if(ga == zero) - { - // - // diagonal matrix - // - ssmin = ha; - ssmax = fa; - clt = one; - crt = one; - slt = zero; - srt = zero; - } - else - { - gasmal = true; - if(ga > fa) - { - pmax = 2; - - cmach = 'E'; - call_lamch(cmach, macheps); - - if((fa / ga) < macheps) - { - // - // case of very large ga - // - gasmal = false; - ssmax = ga; - if(ha > one) - { - ssmin = fa / (ga / ha); - } - else - { - ssmin = (fa / ga) * ha; - } - clt = one; - slt = ht / gt; - srt = one; - crt = ft / gt; - } - } - if(gasmal) - { - // - // normal case - // - d = fa - ha; - if(d == fa) - { - // - // copes with infinite f or h - // - l = one; - } - else - { - l = d / fa; - } - // - // note that 0 <= l <= 1 - // - m = gt / ft; - // - // note that abs(m) <= 1/macheps - // - t = two - l; - // - // note that t >= 1 - // - mm = m * m; - tt = t * t; - s = sqrt(tt + mm); - // - // note that 1 <= s <= 1 + 1/macheps - // - if(l == zero) - { - r = abs(m); - } - else - { - r = sqrt(l * l + mm); - } - // - // note that 0 <= r .le. 1 + 1/macheps - // - a = half * (s + r); - // - // note that 1 <= a .le. 1 + abs(m) - // - ssmin = ha / a; - ssmax = fa * a; - if(mm == zero) - { - // - // note that m is very tiny - // - if(l == zero) - { - t = sign(two, ft) * sign(one, gt); - } - else - { - t = gt / sign(d, ft) + m / t; - } - } - else - { - t = (m / (s + t) + m / (r + l)) * (one + a); - } - l = sqrt(t * t + four); - crt = two / l; - srt = t / l; - clt = (crt + srt * m) / a; - slt = (ht / ft) * srt / a; - } - } - if(swap) - { - csl = srt; - snl = crt; - csr = slt; - snr = clt; - } - else - { - csl = clt; - snl = slt; - csr = crt; - snr = srt; - } - // - // correct signs of ssmax and ssmin - // - if(pmax == 1) - { - tsign = sign(one, csr) * sign(one, csl) * sign(one, f); - } - if(pmax == 2) - { - tsign = sign(one, snr) * sign(one, csl) * sign(one, g); - } - if(pmax == 3) - { - tsign = sign(one, snr) * sign(one, snl) * sign(one, h); - } - ssmax = sign(ssmax, tsign); - ssmin = sign(ssmin, tsign * sign(one, f) * sign(one, h)); -} - template static void call_lasr(rocblas_side& side, rocblas_pivot& pivot, @@ -845,7 +53,11 @@ static void call_lasr(rocblas_side& side, I const i_inc = 1; lasr_body(side, pivot, direct, m, n, &c, &s, &A, lda, tid, i_inc); -}; +} + +/************************************************************************************/ +/***************** Main template functions ******************************************/ +/************************************************************************************/ template static void bdsqr_single_template(rocblas_handle handle, @@ -854,74 +66,22 @@ static void bdsqr_single_template(rocblas_handle handle, I ncvt, I nru, I ncc, - S* d_, S* e_, - T* vt_, I ldvt, T* u_, I ldu, T* c_, I ldc, - S* work_, I& info, S* dwork_ = nullptr, hipStream_t stream = 0) { - bool const use_gpu = (dwork_ != nullptr); - - // ----------------------------------- - // Lapack code used O(n^2) algorithm for sorting - // Consider turning off this and rely on - // bdsqr_sort() to perform sorting - // ----------------------------------- - 
bool constexpr need_sort = false; - - S const zero = 0; - S const one = 1; - S negone = -1; - S const hndrd = 100; - S const hndrth = one / hndrd; - S const ten = 10; - S const eight = 8; - S const meight = -one / eight; - I const maxitr = 6; - I ione = 1; - - bool const lower = (uplo == 'L') || (uplo == 'l'); - bool const upper = (uplo == 'U') || (uplo == 'u'); - /* - * rotate is true if any singular vectors desired, false otherwise - */ - bool const rotate = (ncvt > 0) || (nru > 0) || (ncc > 0); - - I i = 0, idir = 0, isub = 0, iter = 0, iterdivn = 0, j = 0, ll = 0, lll = 0, m = 0, - maxitdivn = 0, nm1 = 0, nm12 = 0, nm13 = 0, oldll = 0, oldm = 0; - - I const nrc = n; // number of rows in C matrix - I const nrvt = n; // number of rows in VT matrix - I const ncu = n; // number of columns in U matrix - - S abse = 0, abss = 0, cosl = 0, cosr = 0, cs = 0, eps = 0, f = 0, g = 0, h = 0, mu = 0, - oldcs = 0, oldsn = 0, r = 0, shift = 0, sigmn = 0, sigmx = 0, sinl = 0, sinr = 0, sll = 0, - smax = 0, smin = 0, sminl = 0, sminoa = 0, sn = 0, thresh = 0, tol = 0, tolmul = 0, unfl = 0; - - /* .. - * .. external functions .. - logical lsame - double precision dlamch - external lsame, dlamch - * .. - * .. external subroutines .. - external dlartg, dlas2, dlasq1, dlasr, dlasv2, drot, - $ dscal, dswap, xerbla - * .. - * .. intrinsic functions .. - intrinsic abs, dble, max, min, sign, sqrt - */ - + // ------------------------------------- + // Lambda expressions used as helpers + // ------------------------------------- auto call_swap_gpu = [=](I n, T& x, I incx, T& y, I incy) { swap_template(n, &x, incx, &y, incy, stream); }; @@ -962,8 +122,6 @@ static void bdsqr_single_template(rocblas_handle handle, CHECK_HIP(hipStreamSynchronize(stream)); }; - auto abs = [](auto x) { return (std::abs(x)); }; - auto indx2f = [](auto i, auto j, auto ld) -> int64_t { assert((1 <= i) && (i <= ld)); assert((1 <= j)); @@ -979,7 +137,9 @@ static void bdsqr_single_template(rocblas_handle handle, assert((1 <= i) && (i <= (n - 1))); return (e_[i - 1]); }; + auto work = [=](auto i) -> S& { return (work_[i - 1]); }; + auto dwork = [=](auto i) -> S& { return (dwork_[i - 1]); }; auto c = [=](auto i, auto j) -> T& { @@ -1000,73 +160,73 @@ static void bdsqr_single_template(rocblas_handle handle, return (vt_[indx2f(i, j, ldvt)]); }; - // --------------------------- - // emulate Fortran intrinsics - // --------------------------- auto sign = [](auto a, auto b) { auto const abs_a = std::abs(a); return ((b >= 0) ? abs_a : -abs_a); }; auto dble = [](auto x) { return (static_cast(x)); }; + // ------------------------------- - auto max = [](auto a, auto b) { return ((a > b) ? a : b); }; + // ---------------- + // Initialization + // ---------------- + bool const use_gpu = (dwork_ != nullptr); - auto min = [](auto a, auto b) { return ((a < b) ? a : b); }; + // Lapack code used O(n^2) algorithm for sorting + // Consider turning off this and rely on + // bdsqr_sort() to perform sorting + bool constexpr need_sort = false; - auto sqrt = [](auto x) { return (std::sqrt(x)); }; + S const zero = 0; + S const one = 1; + S negone = -1; + S const hndrd = 100; + S const hndrth = one / hndrd; + S const ten = 10; + S const eight = 8; + S const meight = -one / eight; + I const maxitr = 6; + I ione = 1; - /* .. - * .. executable statements .. - * - * test the input parameters. 
- * - */ + bool const lower = (uplo == 'L') || (uplo == 'l'); + bool const upper = (uplo == 'U') || (uplo == 'u'); + + //rotate is true if any singular vectors desired, false otherwise + bool const rotate = (ncvt > 0) || (nru > 0) || (ncc > 0); - info = (!upper) && (!lower) ? -1 - : (n < 0) ? -2 - : (ncvt < 0) ? -3 - : (nru < 0) ? -4 - : (ncc < 0) ? -5 - : ((ncvt == 0) && (ldvt < 1)) || ((ncvt > 0) && (ldvt < max(1, n)) ? -9 : (ldu < max(1, nru))) - ? -11 - : ((ncc == 0) && (ldc < 1)) || ((ncc > 0) && (ldc < max(1, n))) ? -13 - : 0; + I i = 0, idir = 0, isub = 0, iter = 0, iterdivn = 0, j = 0, ll = 0, lll = 0, m = 0, + maxitdivn = 0, nm1 = 0, nm12 = 0, nm13 = 0, oldll = 0, oldm = 0; - if(info != 0) - return; + I const nrc = n; // number of rows in C matrix + I const nrvt = n; // number of rows in VT matrix + I const ncu = n; // number of columns in U matrix - if(n == 0) - return; + S abse = 0, abss = 0, cosl = 0, cosr = 0, cs = 0, eps = 0, f = 0, g = 0, h = 0, mu = 0, + oldcs = 0, oldsn = 0, r = 0, shift = 0, sigmn = 0, sigmx = 0, sinl = 0, sinr = 0, sll = 0, + smax = 0, smin = 0, sminl = 0, sminoa = 0, sn = 0, thresh = 0, tol = 0, tolmul = 0, unfl = 0; bool const need_update_singular_vectors = (nru > 0) || (ncc > 0); bool constexpr use_lasr_gpu_nocopy = false; - if(n == 1) - goto L160; - nm1 = n - 1; nm12 = nm1 + nm1; nm13 = nm12 + nm1; idir = 0; - /* - * get machine constants - * - */ + + //get machine constants { char cmach_eps = 'E'; char cmach_unfl = 'S'; call_lamch(cmach_eps, eps); call_lamch(cmach_unfl, unfl); } - /* - * if matrix lower bidiagonal, rotate to be upper bidiagonal - * by applying givens rotations on the left - */ + // ----------------------------------------- + // rotate to upper bidiagonal if necesarry + // ----------------------------------------- if(lower) { - // do 10 i = 1, n - 1 for(i = 1; i <= (n - 1); i++) { call_lartg(d(i), e(i), cs, sn, r); @@ -1076,21 +236,15 @@ static void bdsqr_single_template(rocblas_handle handle, work(i) = cs; work(nm1 + i) = sn; } - L10: - - // ---------------------------------- - // update singular vectors if desired - // ---------------------------------- + // update singular vectors if desired if(use_lasr_gpu_nocopy) { CHECK_HIP(hipStreamSynchronize(stream)); if(need_update_singular_vectors) { - // -------------- // copy rotations - // -------------- size_t const nbytes = sizeof(S) * (n - 1); hipMemcpyKind const kind = hipMemcpyHostToDevice; @@ -1158,96 +312,68 @@ static void bdsqr_single_template(rocblas_handle handle, } } } - /* - * compute singular values to relative accuracy tol - * (by setting tol to be negative, algorithm will compute - * singular values to absolute accuracy abs(tol)*norm(input matrix)) - */ - tolmul = max(ten, min(hndrd, pow(eps, meight))); + // ------------------------------------------------------------- + // Compute singular values and vector to relative accuracy tol + // ------------------------------------------------------------- + tolmul = std::max(ten, std::min(hndrd, pow(eps, meight))); tol = tolmul * eps; - /* - * compute approximate maximum, minimum singular values - */ - - /* - smax = zero - do 20 i = 1, n - smax = max( smax, abs( d( i ) ) ) - L20: - do 30 i = 1, n - 1 - smax = max( smax, abs( e( i ) ) ) - L30: - */ + // compute approximate maximum, minimum singular values smax = zero; - // do 20 i = 1, n for(i = 1; i <= n; i++) { - smax = max(smax, abs(d(i))); + smax = std::max(smax, std::abs(d(i))); } L20: - // do 30 i = 1, n - 1 for(i = 1; i <= (n - 1); i++) { - smax = max(smax, abs(e(i))); + 
smax = std::max(smax, std::abs(e(i))); } -L30: + // compute tolerance +L30: sminl = zero; if(tol >= zero) { - /* - * relative accuracy desired - */ - - sminoa = abs(d(1)); + // relative accuracy desired + sminoa = std::abs(d(1)); if(sminoa == zero) goto L50; mu = sminoa; - // do 40 i = 2, n for(i = 2; i <= n; i++) { - mu = abs(d(i)) * (mu / (mu + abs(e(i - 1)))); - sminoa = min(sminoa, mu); + mu = std::abs(d(i)) * (mu / (mu + std::abs(e(i - 1)))); + sminoa = std::min(sminoa, mu); if(sminoa == zero) goto L50; } L40: L50: - - sminoa = sminoa / sqrt(dble(n)); - thresh = max(tol * sminoa, ((unfl * n) * n) * maxitr); + sminoa = sminoa / std::sqrt(dble(n)); + thresh = std::max(tol * sminoa, ((unfl * n) * n) * maxitr); } else { - /* - * absolute accuracy desired - */ - - thresh = max(abs(tol) * smax, ((unfl * n) * n) * maxitr); + //absolute accuracy desired + thresh = std::max(std::abs(tol) * smax, ((unfl * n) * n) * maxitr); } - /* - * prepare for main iteration loop for the singular values - * (maxit is the maximum number of passes through the inner - * loop permitted before nonconvergence signalled.) - */ + + /** prepare for main iteration loop for the singular values + (maxit is the maximum number of passes through the inner + loop permitted before nonconvergence signalled.) **/ maxitdivn = maxitr * n; iterdivn = 0; iter = -1; oldll = -1; oldm = -1; - /* - * m points to last element of unconverged part of matrix - */ m = n; - /* - * begin main iteration loop - */ + + /////////////////////////// + /// MAIN ITERATION LOOP /// + /////////////////////////// L60: - /* - * check for convergence or exceeding iteration count - */ + // check for convergence or exceeding iteration count if(m <= 1) goto L160; @@ -1258,60 +384,52 @@ static void bdsqr_single_template(rocblas_handle handle, if(iterdivn >= maxitdivn) goto L200; } - /* - * find diagonal block of matrix to work on - */ - if(tol < zero && abs(d(m)) <= thresh) + + // find diagonal block of matrix to work on + if(tol < zero && std::abs(d(m)) <= thresh) d(m) = zero; - smax = abs(d(m)); + smax = std::abs(d(m)); smin = smax; - // do 70 lll = 1, m - 1 for(lll = 1; lll <= (m - 1); lll++) { ll = m - lll; - abss = abs(d(ll)); - abse = abs(e(ll)); + abss = std::abs(d(ll)); + abse = std::abs(e(ll)); if(tol < zero && abss <= thresh) d(ll) = zero; if(abse <= thresh) goto L80; - smin = min(smin, abss); - smax = max(smax, max(abss, abse)); + smin = std::min(smin, abss); + smax = std::max(smax, std::max(abss, abse)); } + L70: ll = 0; goto L90; + L80: e(ll) = zero; - /* - * matrix splits since e(ll) = 0 - */ + // matrix splits since e(ll) = 0 if(ll == m - 1) { - /* - * convergence of bottom singular value, return to top of loop - */ + // convergence of bottom singular value, return to top of loop m = m - 1; goto L60; } + L90: ll = ll + 1; - /* - * e(ll) through e(m-1) are nonzero, e(ll-1) is zero - */ + // e(ll) through e(m-1) are nonzero, e(ll-1) is zero if(ll == m - 1) { - /* - * 2 by 2 block, handle separately - */ + // 2 by 2 block, handle separately call_lasv2(d(m - 1), e(m - 1), d(m), sigmn, sigmx, sinr, cosr, sinl, cosl); d(m - 1) = sigmx; e(m - 1) = zero; d(m) = sigmn; - /* - * compute singular vectors, if desired - */ + + // compute singular vectors, if desired if(ncvt > 0) { if(use_gpu) @@ -1348,37 +466,30 @@ static void bdsqr_single_template(rocblas_handle handle, m = m - 2; goto L60; } - /* - * if working on new submatrix, choose shift direction - * (from larger end diagonal element towards smaller) - */ + + /** if working on new submatrix, choose 
shift direction + (from larger end diagonal element towards smaller) **/ if(ll > oldm || m < oldll) { - if(abs(d(ll)) >= abs(d(m))) + if(std::abs(d(ll)) >= std::abs(d(m))) { - /* - * chase bulge from top (big end) to bottom (small end) - */ + // chase bulge from top (big end) to bottom (small end) idir = 1; } else { - /* - * chase bulge from bottom (big end) to top (small end) - */ + // chase bulge from bottom (big end) to top (small end) idir = 2; } } - /* - * apply convergence tests - */ + + // apply convergence test if(idir == 1) { - /* - * run convergence test in forward direction - * first apply standard test to bottom of matrix - */ - if(abs(e(m - 1)) <= abs(tol) * abs(d(m)) || (tol < zero && abs(e(m - 1)) <= thresh)) + // run convergence test in forward direction + // first apply standard test to bottom of matrix + if(std::abs(e(m - 1)) <= std::abs(tol) * std::abs(d(m)) + || (tol < zero && std::abs(e(m - 1)) <= thresh)) { e(m - 1) = zero; goto L60; @@ -1386,33 +497,28 @@ static void bdsqr_single_template(rocblas_handle handle, if(tol >= zero) { - /* - * if relative accuracy desired, - * apply convergence criterion forward - */ - mu = abs(d(ll)); + // if relative accuracy desired, + // apply convergence criterion forward + mu = std::abs(d(ll)); sminl = mu; - // do 100 lll = ll, m - 1 for(lll = ll; lll <= (m - 1); lll++) { - if(abs(e(lll)) <= tol * mu) + if(std::abs(e(lll)) <= tol * mu) { e(lll) = zero; goto L60; } - mu = abs(d(lll + 1)) * (mu / (mu + abs(e(lll)))); - sminl = min(sminl, mu); + mu = std::abs(d(lll + 1)) * (mu / (mu + std::abs(e(lll)))); + sminl = std::min(sminl, mu); } - // L100: } } else { - /* - * run convergence test in backward direction - * first apply standard test to top of matrix - */ - if(abs(e(ll)) <= abs(tol) * abs(d(ll)) || (tol < zero && abs(e(ll)) <= thresh)) + // run convergence test in backward direction + // first apply standard test to top of matrix + if(std::abs(e(ll)) <= std::abs(tol) * std::abs(d(ll)) + || (tol < zero && std::abs(e(ll)) <= thresh)) { e(ll) = zero; goto L60; @@ -1420,81 +526,65 @@ static void bdsqr_single_template(rocblas_handle handle, if(tol >= zero) { - /* - * if relative accuracy desired, - * apply convergence criterion backward - */ - mu = abs(d(m)); + // if relative accuracy desired, + // apply convergence criterion backward + mu = std::abs(d(m)); sminl = mu; - // do 110 lll = m - 1, ll, -1 for(lll = (m - 1); lll >= ll; lll--) { - if(abs(e(lll)) <= tol * mu) + if(std::abs(e(lll)) <= tol * mu) { e(lll) = zero; goto L60; } - mu = abs(d(lll)) * (mu / (mu + abs(e(lll)))); - sminl = min(sminl, mu); + mu = std::abs(d(lll)) * (mu / (mu + std::abs(e(lll)))); + sminl = std::min(sminl, mu); } - // L110: } } + + /** compute shift. first, test if shifting would ruin relative + accuracy, and if so set the shift to zero **/ oldll = ll; oldm = m; - /* - * compute shift. first, test if shifting would ruin relative - * accuracy, and if so set the shift to zero. 
- */ - if(tol >= zero && n * tol * (sminl / smax) <= max(eps, hndrth * tol)) + if(tol >= zero && n * tol * (sminl / smax) <= std::max(eps, hndrth * tol)) { - /* - * use a zero shift to avoid loss of relative accuracy - */ + //use a zero shift to avoid loss of relative accuracy shift = zero; } else { - /* - * compute the shift from 2-by-2 block at end of matrix - */ + // compute the shift from 2-by-2 block at end of matrix if(idir == 1) { - sll = abs(d(ll)); + sll = std::abs(d(ll)); call_las2(d(m - 1), e(m - 1), d(m), shift, r); } else { - sll = abs(d(m)); + sll = std::abs(d(m)); call_las2(d(ll), e(ll), d(ll + 1), shift, r); } - /* - * test if shift negligible, and if so set to zero - */ + // test if shift negligible, and if so set to zero if(sll > zero) { if((shift / sll) * (shift / sll) < eps) shift = zero; } } - /* - * increment iteration count - */ + + // increment iteration count iter = iter + m - ll; - /* - * if shift = 0, do simplified qr iteration - */ + + // if shift = 0, do simplified qr iteration if(shift == zero) { if(idir == 1) { - /* - * chase bulge from top to bottom - * save cosines and sines for later singular vector updates - */ + // chase bulge from top to bottom + // save cosines and sines for later singular vector updates cs = one; oldcs = one; - // do 120 i = ll, m - 1 for(i = ll; i <= (m - 1); i++) { auto di_cs = d(i) * cs; @@ -1509,24 +599,20 @@ static void bdsqr_single_template(rocblas_handle handle, work(i - ll + 1 + nm12) = oldcs; work(i - ll + 1 + nm13) = oldsn; } + L120: h = d(m) * cs; d(m) = h * oldcs; e(m - 1) = h * oldsn; - // - // ----------------------- - // update singular vectors - // ----------------------- + // update singular vectors if(use_lasr_gpu_nocopy) { CHECK_HIP(hipStreamSynchronize(stream)); if(rotate) { - // -------------- // copy rotations - // -------------- size_t const nbytes = sizeof(S) * (n - 1); hipMemcpyKind const kind = hipMemcpyHostToDevice; @@ -1566,8 +652,7 @@ static void bdsqr_single_template(rocblas_handle handle, if(ncvt > 0) { - // call_lasr( 'l', 'v', 'f', m-ll+1, ncvt, work( 1 ), work( n ), vt( - // ll, 1 ), ldvt ) + // call_lasr( 'l', 'v', 'f', m-ll+1, ncvt, work( 1 ), work( n ), vt(ll, 1 ), ldvt ) rocblas_side side = rocblas_side_left; rocblas_pivot pivot = rocblas_pivot_variable; rocblas_direct direct = rocblas_forward_direction; @@ -1592,8 +677,7 @@ static void bdsqr_single_template(rocblas_handle handle, } if(nru > 0) { - // call_lasr( 'r', 'v', 'f', nru, m-ll+1, work( nm12+1 ), work( nm13+1 - // ), u( 1, ll ), ldu ) + // call_lasr( 'r', 'v', 'f', nru, m-ll+1, work( nm12+1 ), work( nm13+1 ), u( 1, ll ), ldu ) rocblas_side side = rocblas_side_right; rocblas_pivot pivot = rocblas_pivot_variable; rocblas_direct direct = rocblas_forward_direction; @@ -1619,8 +703,7 @@ static void bdsqr_single_template(rocblas_handle handle, } if(ncc > 0) { - // call_lasr( 'l', 'v', 'f', m-ll+1, ncc, work( nm12+1 ), work( nm13+1 - // ), c( ll, 1 ), ldc ) + // call_lasr( 'l', 'v', 'f', m-ll+1, ncc, work( nm12+1 ), work( nm13+1 ), c( ll, 1 ), ldc ) rocblas_side side = rocblas_side_left; rocblas_pivot pivot = rocblas_pivot_variable; rocblas_direct direct = rocblas_forward_direction; @@ -1644,21 +727,17 @@ static void bdsqr_single_template(rocblas_handle handle, c(ll, 1), ldc); } } - // - // test convergence - // - if(abs(e(m - 1)) <= thresh) + + // test convergence + if(std::abs(e(m - 1)) <= thresh) e(m - 1) = zero; } else { - /* - * chase bulge from bottom to top - * save cosines and sines for later singular vector updates - */ + // chase bulge 
from bottom to top + // save cosines and sines for later singular vector updates cs = one; oldcs = one; - // do 130 i = m, ll + 1, -1 for(i = m; i >= (ll + 1); i--) { auto di_cs = d(i) * cs; @@ -1676,23 +755,20 @@ static void bdsqr_single_template(rocblas_handle handle, work(i - ll + nm12) = oldcs; work(i - ll + nm13) = -oldsn; } + L130: h = d(ll) * cs; d(ll) = h * oldcs; e(ll) = h * oldsn; - // - // update singular vectors - // + // update singular vectors if(use_lasr_gpu_nocopy) { CHECK_HIP(hipStreamSynchronize(stream)); if(rotate) { - // -------------- // copy rotations - // -------------- size_t const nbytes = sizeof(S) * (n - 1); hipMemcpyKind const kind = hipMemcpyHostToDevice; @@ -1732,9 +808,7 @@ static void bdsqr_single_template(rocblas_handle handle, if(ncvt > 0) { - // call_lasr( 'l', 'v', 'b', m-ll+1, ncvt, work( nm12+1 ), work( - // nm13+1 - // ), vt( ll, 1 ), ldvt ); + // call_lasr( 'l', 'v', 'b', m-ll+1, ncvt, work( nm12+1 ), work( nm13+1 ), vt( ll, 1 ), ldvt ); rocblas_side side = rocblas_side_left; rocblas_pivot pivot = rocblas_pivot_variable; rocblas_direct direct = rocblas_backward_direction; @@ -1760,9 +834,7 @@ static void bdsqr_single_template(rocblas_handle handle, } if(nru > 0) { - // call_lasr( 'r', 'v', 'b', nru, m-ll+1, work( 1 ), work( n ), u( 1, - // ll - // ), ldu ) + // call_lasr( 'r', 'v', 'b', nru, m-ll+1, work( 1 ), work( n ), u( 1, ll ), ldu ) rocblas_side side = rocblas_side_right; rocblas_pivot pivot = rocblas_pivot_variable; rocblas_direct direct = rocblas_backward_direction; @@ -1787,9 +859,7 @@ static void bdsqr_single_template(rocblas_handle handle, } if(ncc > 0) { - // call_lasr( 'l', 'v', 'b', m-ll+1, ncc, work( 1 ), work( n ), c( ll, - // 1 - // ), ldc ) + // call_lasr( 'l', 'v', 'b', m-ll+1, ncc, work( 1 ), work( n ), c( ll, 1 ), ldc ) rocblas_side side = rocblas_side_left; rocblas_pivot pivot = rocblas_pivot_variable; rocblas_direct direct = rocblas_backward_direction; @@ -1812,27 +882,22 @@ static void bdsqr_single_template(rocblas_handle handle, call_lasr(side, pivot, direct, mm, ncc, work(1), work(n), c(ll, 1), ldc); } } - // - // test convergence - // - if(abs(e(ll)) <= thresh) + + // test convergence + if(std::abs(e(ll)) <= thresh) e(ll) = zero; } } + + // otherwise use nonzero shift else { - // - // use nonzero shift - // if(idir == 1) { - // - // chase bulge from top to bottom - // save cosines and sines for later singular vector updates - // - f = (abs(d(ll)) - shift) * (sign(one, d(ll)) + shift / d(ll)); + // chase bulge from top to bottom + // save cosines and sines for later singular vector updates + f = (std::abs(d(ll)) - shift) * (sign(one, d(ll)) + shift / d(ll)); g = e(ll); - // do 140 i = ll, m - 1 for(i = ll; i <= (m - 1); i++) { call_lartg(f, g, cosr, sinr, r); @@ -1856,21 +921,18 @@ static void bdsqr_single_template(rocblas_handle handle, work(i - ll + 1 + nm12) = cosl; work(i - ll + 1 + nm13) = sinl; } + L140: e(m - 1) = f; - // - // update singular vectors - // + // update singular vectors if(use_lasr_gpu_nocopy) { CHECK_HIP(hipStreamSynchronize(stream)); if(rotate) { - // -------------- // copy rotations - // -------------- size_t const nbytes = sizeof(S) * (n - 1); hipMemcpyKind const kind = hipMemcpyHostToDevice; @@ -1910,8 +972,7 @@ static void bdsqr_single_template(rocblas_handle handle, if(ncvt > 0) { - // call_lasr( 'l', 'v', 'f', m-ll+1, ncvt, work( 1 ), work( n ), vt( - // ll, 1 ), ldvt ) + // call_lasr( 'l', 'v', 'f', m-ll+1, ncvt, work( 1 ), work( n ), vt(ll, 1 ), ldvt ) rocblas_side side = rocblas_side_left; 
rocblas_pivot pivot = rocblas_pivot_variable; rocblas_direct direct = rocblas_forward_direction; @@ -1937,8 +998,7 @@ static void bdsqr_single_template(rocblas_handle handle, if(nru > 0) { - // call_lasr( 'r', 'v', 'f', nru, m-ll+1, work( nm12+1 ), work( nm13+1 - // ), u( 1, ll ), ldu ) + // call_lasr( 'r', 'v', 'f', nru, m-ll+1, work( nm12+1 ), work( nm13+1 ), u( 1, ll ), ldu ) rocblas_side side = rocblas_side_right; rocblas_pivot pivot = rocblas_pivot_variable; rocblas_direct direct = rocblas_forward_direction; @@ -1964,8 +1024,7 @@ static void bdsqr_single_template(rocblas_handle handle, } if(ncc > 0) { - // call_lasr( 'l', 'v', 'f', m-ll+1, ncc, work( nm12+1 ), work( nm13+1 - // ), c( ll, 1 ), ldc ) + // call_lasr( 'l', 'v', 'f', m-ll+1, ncc, work( nm12+1 ), work( nm13+1 ), c( ll, 1 ), ldc ) rocblas_side side = rocblas_side_left; rocblas_pivot pivot = rocblas_pivot_variable; rocblas_direct direct = rocblas_forward_direction; @@ -1989,21 +1048,17 @@ static void bdsqr_single_template(rocblas_handle handle, c(ll, 1), ldc); } } - /* - * test convergence - */ - if(abs(e(m - 1)) <= thresh) + + // test convergence + if(std::abs(e(m - 1)) <= thresh) e(m - 1) = zero; } else { - /* - * chase bulge from bottom to top - * save cosines and sines for later singular vector updates - */ - f = (abs(d(m)) - shift) * (sign(one, d(m)) + shift / d(m)); + // chase bulge from bottom to top + // save cosines and sines for later singular vector updates + f = (std::abs(d(m)) - shift) * (sign(one, d(m)) + shift / d(m)); g = e(m - 1); - // do 150 i = m, ll + 1, -1 for(i = m; i >= (ll + 1); i--) { call_lartg(f, g, cosr, sinr, r); @@ -2027,26 +1082,22 @@ static void bdsqr_single_template(rocblas_handle handle, work(i - ll + nm12) = cosl; work(i - ll + nm13) = -sinl; } + L150: e(ll) = f; - // - // test convergence - // - if(abs(e(ll)) <= thresh) + + // test convergence + if(std::abs(e(ll)) <= thresh) e(ll) = zero; - // - // update singular vectors if desired - // + // update singular vectors if(use_lasr_gpu_nocopy) { CHECK_HIP(hipStreamSynchronize(stream)); if(rotate) { - // -------------- // copy rotations - // -------------- size_t const nbytes = sizeof(S) * (n - 1); hipMemcpyKind const kind = hipMemcpyHostToDevice; @@ -2086,9 +1137,7 @@ static void bdsqr_single_template(rocblas_handle handle, if(ncvt > 0) { - // call_lasr( 'l', 'v', 'b', m-ll+1, ncvt, work( nm12+1 ), work( - // nm13+1 - // ), vt( ll, 1 ), ldvt ) + // call_lasr( 'l', 'v', 'b', m-ll+1, ncvt, work( nm12+1 ), work(nm13+1), vt( ll, 1 ), ldvt ) rocblas_side side = rocblas_side_left; rocblas_pivot pivot = rocblas_pivot_variable; rocblas_direct direct = rocblas_backward_direction; @@ -2114,9 +1163,7 @@ static void bdsqr_single_template(rocblas_handle handle, } if(nru > 0) { - // call_lasr( 'r', 'v', 'b', nru, m-ll+1, work( 1 ), work( n ), u( 1, - // ll - // ), ldu ) + // call_lasr( 'r', 'v', 'b', nru, m-ll+1, work( 1 ), work( n ), u( 1, ll ), ldu ) rocblas_side side = rocblas_side_right; rocblas_pivot pivot = rocblas_pivot_variable; rocblas_direct direct = rocblas_backward_direction; @@ -2141,9 +1188,7 @@ static void bdsqr_single_template(rocblas_handle handle, } if(ncc > 0) { - // call_lasr( 'l', 'v', 'b', m-ll+1, ncc, work( 1 ), work( n ), c( ll, - // 1 - // ), ldc ) + // call_lasr( 'l', 'v', 'b', m-ll+1, ncc, work( 1 ), work( n ), c( ll, 1 ), ldc ) rocblas_side side = rocblas_side_left; rocblas_pivot pivot = rocblas_pivot_variable; rocblas_direct direct = rocblas_backward_direction; @@ -2169,24 +1214,17 @@ static void 
bdsqr_single_template(rocblas_handle handle, } } CHECK_HIP(hipStreamSynchronize(stream)); - /* - * qr iteration finished, go back and check convergence - */ + + // qr iteration finished, go back and check convergence goto L60; -/* - * all singular values converged, so make them positive - */ L160: - // do 170 i = 1, n + // all singular values converged, so make them positive for(i = 1; i <= n; i++) { if(d(i) < zero) { d(i) = -d(i); - // - // change sign of singular vectors, if desired - // if(ncvt > 0) { if(use_gpu) @@ -2200,22 +1238,17 @@ static void bdsqr_single_template(rocblas_handle handle, } } } + L170: - // - // sort the singular values into decreasing order (insertion sort on - // singular values, but only one transposition per singular vector) - // - // do 190 i = 1, n - 1 + // sort the singular values into decreasing order (insertion sort on + // singular values, but only one transposition per singular vector) if(need_sort) { for(i = 1; i <= (n - 1); i++) { - // - // scan for smallest d(i) - // + // scan for smallest d(i) isub = 1; smin = d(1); - // do 180 j = 2, n + 1 - i for(j = 2; j <= (n + 1 - i); j++) { if(d(j) <= smin) @@ -2227,9 +1260,7 @@ static void bdsqr_single_template(rocblas_handle handle, L180: if(isub != n + 1 - i) { - // - // swap singular values and vectors - // + // swap singular values and vectors d(isub) = d(n + 1 - i); d(n + 1 - i) = smin; if(ncvt > 0) @@ -2268,25 +1299,22 @@ static void bdsqr_single_template(rocblas_handle handle, } } } + L190: goto L220; -// -// maximum number of iterations exceeded, failure to converge -// + +// maximum number of iterations exceeded, failure to converge L200: info = 0; - // do 210 i = 1, n - 1 for(i = 1; i <= (n - 1); i++) { if(e(i) != zero) info = info + 1; } + L210: L220: return; - // - // end of dbdsqr - // } template @@ -2317,17 +1345,9 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, I* splits_map, S* work) { - // ------------------------- - // copy D into hD, E into hE - // ------------------------- - - hipStream_t stream; - rocblas_get_stream(handle, &stream); - - W1 V = V_arg; - W2 U = U_arg; - W3 C = C_arg; - + // ------------------------------- + // lambda expression as helper + // ------------------------------- auto is_device_pointer = [](void* ptr) -> bool { hipPointerAttribute_t dev_attributes; if(ptr == nullptr) @@ -2345,9 +1365,17 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, return (dev_attributes.type == hipMemoryTypeDevice); }; - // --------------------------------------------------- + // ------------------------- + // copy D into hD, E into hE + // ------------------------- + hipStream_t stream; + rocblas_get_stream(handle, &stream); + + W1 V = V_arg; + W2 U = U_arg; + W3 C = C_arg; + // handle batch case with array of pointers on device - // --------------------------------------------------- std::vector Vp_array(batch_count); std::vector Up_array(batch_count); std::vector Cp_array(batch_count); @@ -2357,9 +1385,7 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, bool const is_device_V_arg = is_device_pointer((void*)V_arg); if(is_device_V_arg) { - // ------------------------------------------------------------ // note "T *" and "T * const" may be considered different types - // ------------------------------------------------------------ bool constexpr is_array_of_device_pointers = !(std::is_same::value || std::is_same::value); bool constexpr need_copy_W1 = is_array_of_device_pointers; @@ -2442,7 +1468,6 @@ 
rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, // ------------------------------------------------- // transfer arrays D(:) and E(:) from Device to Host // ------------------------------------------------- - bool const use_single_copy_for_D = (batch_count == 1) || (strideD == n); if(use_single_copy_for_D) { @@ -2476,6 +1501,9 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, } } + // ------------------------------------------------- + // Execute for each instance in the batch + // ------------------------------------------------- HIP_CHECK(hipStreamSynchronize(stream)); S* dwork_ = nullptr; @@ -2486,9 +1514,7 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, if(linfo_array[bid] != 0) { continue; - }; - - // std::vector hwork(4 * n); + } char uplo = (uplo_in == rocblas_fill_lower) ? 'L' : 'U'; S* d_ = &(hD[bid * n]); @@ -2498,19 +1524,15 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, T* u_ = (nu > 0) ? load_ptr_batch(U, bid, shiftU, strideU) : nullptr; T* c_ = (nc > 0) ? load_ptr_batch(C, bid, shiftC, strideC) : nullptr; S* work_ = &(hwork[0]); - // S* dwork = &(work[bid * (4 * n)]); I info = 0; I nru = nu; I ncc = nc; - // ------------------------------------------------------- // NOTE: lapack dbdsqr() accepts "VT" and "ldvt" for transpose of V - // as input variable - // However, rocsolver bdsqr() accepts variable called "V" and "ldv" - // but may be actually holding "VT" - // ------------------------------------------------------- + // as input variable however, rocsolver bdsqr() accepts variable called "V" and "ldv" + // but it is actually holding "VT" T* vt_ = v_; I ldvt = ldv; @@ -2523,10 +1545,8 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, if(info == 0) { - // ---------------------------- // explicitly zero out "E" array // to be compatible with rocsolver bdsqr - // ---------------------------- S const zero = S(0); for(I i = 0; i < (n - 1); i++) { @@ -2543,7 +1563,6 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, // ------------------------------------------------- // transfer arrays D(:) and E(:) from host to device // ------------------------------------------------- - if(use_single_copy_for_D) { void* const src = (void*)&(hD[0]); @@ -2576,11 +1595,10 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, } } + // ------------------------------------------------------ + // copy linfo_array[] from host to info_array[] on device + // ------------------------------------------------------ { - // ------------------------------------------------------ - // copy linfo_array[] from host to info_array[] on device - // ------------------------------------------------------ - void* const src = (void*)&(linfo_array[0]); void* const dst = (void*)&(info_array[0]); size_t const nbytes = sizeof(I) * batch_count; @@ -2594,19 +1612,14 @@ rocblas_status rocsolver_bdsqr_host_batch_template(rocblas_handle handle, // ---------------------- // free allocated storage // ---------------------- - HIP_CHECK(hipHostFree(hwork)); hwork = nullptr; - HIP_CHECK(hipHostFree(hD)); hD = nullptr; - HIP_CHECK(hipHostFree(hE)); hE = nullptr; - HIP_CHECK(hipFree(dwork_)); dwork_ = nullptr; - HIP_CHECK(hipHostFree(linfo_array)); linfo_array = nullptr; diff --git a/library/src/include/lapack_device_functions.hpp b/library/src/include/lapack_device_functions.hpp index fe0979246..ca388ff11 100644 --- 
a/library/src/include/lapack_device_functions.hpp +++ b/library/src/include/lapack_device_functions.hpp @@ -1111,4 +1111,96 @@ ROCSOLVER_KERNEL void axpy_kernel(const rocblas_int n, } } +template +__global__ static void + rot_kernel(I const n, T* const x, I const incx, T* const y, I const incy, S const c, S const s) +{ + if(n <= 0) + return; + + I const i_start = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + I const i_inc = hipBlockDim_x * hipGridDim_x; + + if((incx == 1) && (incy == 1)) + { + // ------------ + // special case + // ------------ + for(I i = i_start; i < n; i += i_inc) + { + auto const temp = c * x[i] + s * y[i]; + y[i] = c * y[i] - s * x[i]; + x[i] = temp; + } + } + else + { + // --------------------------- + // code for unequal increments + // --------------------------- + + for(auto i = i_start; i < n; i += i_inc) + { + auto const ix = 0 + i * static_cast(incx); + auto const iy = 0 + i * static_cast(incy); + auto const temp = c * x[ix] + s * y[iy]; + y[iy] = c * y[iy] - s * x[ix]; + x[ix] = temp; + } + } +} + +template +static void + rot_template(I const n, T* x, I const incx, T* y, I const incy, S const c, S const s, hipStream_t stream) +{ + auto nthreads = warpSize * 2; + auto nblocks = (n - 1) / nthreads + 1; + + hipLaunchKernelGGL((rot_kernel), dim3(nblocks, 1, 1), dim3(nthreads, 1, 1), 0, stream, + n, x, incx, y, incy, c, s); +} + +template +__global__ static void scal_kernel(I const n, S const da, T* const x, I const incx) +{ + if(n <= 0) + return; + + I const i_start = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + I const i_inc = hipBlockDim_x * hipGridDim_x; + + S const zero = 0; + bool const is_da_zero = (da == zero); + if(incx == 1) + { + for(I i = i_start; i < n; i += i_inc) + { + x[i] = (is_da_zero) ? zero : da * x[i]; + } + } + else + { + // --------------------------- + // code for non-unit increments + // --------------------------- + + for(I i = i_start; i < n; i += i_inc) + { + auto const ix = 0 + i * static_cast(incx); + x[ix] = (is_da_zero) ? zero : da * x[ix]; + } + } +} + +template +static void scal_template(I const n, S const da, T* const x, I const incx, hipStream_t stream) +{ + auto nthreads = warpSize * 2; + auto nblocks = (n - 1) / nthreads + 1; + + hipLaunchKernelGGL((scal_kernel), dim3(nblocks, 1, 1), dim3(nthreads, 1, 1), 0, stream, + n, da, x, incx); +} + ROCSOLVER_END_NAMESPACE diff --git a/library/src/include/lapack_host_functions.hpp b/library/src/include/lapack_host_functions.hpp new file mode 100644 index 000000000..cb10a233e --- /dev/null +++ b/library/src/include/lapack_host_functions.hpp @@ -0,0 +1,596 @@ +/* ************************************************************************** + * Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * *************************************************************************/ + +#pragma once + +#include "lib_host_helpers.hpp" +#include "lib_macros.hpp" +#include "rocsolver/rocsolver.h" + +ROCSOLVER_BEGIN_NAMESPACE + +/* + * =========================================================================== + * common location for host functions that reproduce LAPACK + * and BLAS functionality. + * =========================================================================== + */ + +static void call_lamch(char& cmach, double& eps) +{ + eps = ((cmach == 'E') || (cmach == 'e')) ? std::numeric_limits::epsilon() + : ((cmach == 'S') || (cmach == 's')) ? std::numeric_limits::min() + : ((cmach == 'B') || (cmach == 'b')) ? FLT_RADIX + : std::numeric_limits::min(); +} + +static void call_lamch(char& cmach, float& eps) +{ + eps = ((cmach == 'E') || (cmach == 'e')) ? std::numeric_limits::epsilon() + : ((cmach == 'S') || (cmach == 's')) ? std::numeric_limits::min() + : ((cmach == 'B') || (cmach == 'b')) ? FLT_RADIX + : std::numeric_limits::min(); +} + +template +static void call_las2(T& f, T& g, T& h, T& ssmin, T& ssmax) +{ + T const zero = 0; + T const one = 1; + T const two = 2; + + T as, at, au, c, fa, fhmn, fhmx, ga, ha; + + auto square = [](auto x) { return (x * x); }; + + fa = std::abs(f); + ga = std::abs(g); + ha = std::abs(h); + fhmn = std::min(fa, ha); + fhmx = std::max(fa, ha); + if(fhmn == zero) + { + ssmin = zero; + if(fhmx == zero) + { + ssmax = ga; + } + else + { + // ssmax = max( fhmx, ga )*sqrt( one+ ( min( fhmx, ga ) / max( fhmx, ga ) )**2 ); + ssmax = std::max(fhmx, ga) + * std::sqrt(one + square(std::min(fhmx, ga) / std::max(fhmx, ga))); + } + } + else + { + if(ga < fhmx) + { + as = one + fhmn / fhmx; + at = (fhmx - fhmn) / fhmx; + au = square(ga / fhmx); + c = two / (std::sqrt(as * as + au) + std::sqrt(at * at + au)); + ssmin = fhmn * c; + ssmax = fhmx / c; + } + else + { + au = fhmx / ga; + if(au == zero) + { + // + // avoid possible harmful underflow if exponent range + // asymmetric (true ssmin may not underflow even if + // au underflows) + // + ssmin = (fhmn * fhmx) / ga; + ssmax = ga; + } + else + { + as = one + fhmn / fhmx; + at = (fhmx - fhmn) / fhmx; + // c = one / ( sqrt( one+( as*au )**2 )+ sqrt( one+( at*au )**2 ) ); + c = one / (std::sqrt(one + square(as * au)) + std::sqrt(one + square(at * au))); + ssmin = (fhmn * c) * au; + ssmin = ssmin + ssmin; + ssmax = ga / (c + c); + } + } + } +} + +template +static void call_lartg(T& f, T& g, S& cs, T& sn, T& r) +{ + // ------------------------------------------------------ + // lartg generates a plane rotation so that + // [ cs sn ] * [ f ] = [ r ] + // [ -sn cs ] [ g ] [ 0 ] + // + // where cs * cs + abs(sn)*abs(sn) == 1 + // if g == 0, then cs == 1, sn == 0 + // if f == 0, then cs = 0, sn is chosen so that r is real + // ------------------------------------------------------ + + auto dble = [](auto z) { return (static_cast(real_part(z))); }; + auto dimag = [](auto z) { return (static_cast(imag_part(z))); }; 
+ auto disnan = [](auto x) -> bool { return (isnan(x)); }; + auto dcmplx = [](auto x, auto y) -> T { + bool constexpr is_complex_type + = !(std::is_same::value || std::is_same::value); + + if constexpr(is_complex_type) + { + return (T(x, y)); + } + else + { + return (T(x)); + }; + }; + auto dconjg = [&](auto z) { return (dcmplx(dble(z), -dimag(z))); }; + + auto square = [](auto x) { return (x * x); }; + + auto abs1 = [&](auto ff) { return (std::max(std::abs(dble(ff)), std::abs(dimag(ff)))); }; + auto abssq = [&](auto ff) { return (square(dble(ff)) + square(dimag(ff))); }; + + // ----------------------------------------- + // compute sqrt( x * x + y * y ) + // without unnecessary overflow or underflow + // ----------------------------------------- + auto dlapy2 = [&](auto x, auto y) { + auto const one = 1; + auto const zero = 0; + + auto ddlapy2 = x; + bool const x_is_nan = disnan(x); + bool const y_is_nan = disnan(y); + if(x_is_nan) + ddlapy2 = x; + if(y_is_nan) + ddlapy2 = y; + + if(!(x_is_nan || y_is_nan)) + { + auto const xabs = std::abs(x); + auto const yabs = std::abs(y); + auto const w = std::max(xabs, yabs); + auto const z = std::min(xabs, yabs); + if(z == zero) + { + ddlapy2 = w; + } + else + { + ddlapy2 = w * std::sqrt(one + square(z / w)); + } + } + return (ddlapy2); + }; + + char cmach = 'E'; + S const zero = 0; + S const one = 1; + S const two = 2; + T const czero = 0; + + bool has_work; + bool first; + int count, i; + S d, di, dr, eps, f2, f2s, g2, g2s, safmin; + S safmn2, safmx2, scale; + T ff, fs, gs; + + // safmin = dlamch( 's' ) + cmach = 'S'; + call_lamch(cmach, safmin); + + // eps = dlamch( 'e' ) + cmach = 'E'; + call_lamch(cmach, eps); + + // safmn2 = dlamch( 'b' )**int( log( safmin / eps ) / log( dlamch( 'b' ) ) / two ) + cmach = 'B'; + S radix = 2; + call_lamch(cmach, radix); + + int const npow = (std::log(safmin / eps) / std::log(radix) / two); + safmn2 = std::pow(radix, npow); + safmx2 = one / safmn2; + scale = std::max(abs1(f), abs1(g)); + fs = f; + gs = g; + count = 0; + + if(scale >= safmx2) + { + do + { + count = count + 1; + fs = fs * safmn2; + gs = gs * safmn2; + scale = scale * safmn2; + has_work = ((scale >= safmx2) && (count < 20)); + } while(has_work); + } + else + { + if(scale <= safmn2) + { + if((g == czero) || disnan(std::abs(g))) + { + cs = one; + sn = czero; + r = f; + return; + } + do + { + count = count - 1; + fs = fs * safmx2; + gs = gs * safmx2; + scale = scale * safmx2; + has_work = (scale <= safmn2); + } while(has_work); + } + f2 = abssq(fs); + g2 = abssq(gs); + if(f2 <= std::max(g2, one) * safmin) + { + // + // this is a rare case: f is very small. + // + if(f == czero) + { + cs = zero; + r = dlapy2(dble(g), dimag(g)); + // do complex/real division explicitly with two real divisions + d = dlapy2(dble(gs), dimag(gs)); + sn = dcmplx(dble(gs) / d, -dimag(gs) / d); + return; + } + f2s = dlapy2(dble(fs), dimag(fs)); + // g2 and g2s are accurate + // g2 is at least safmin, and g2s is at least safmn2 + g2s = std::sqrt(g2); + // error in cs from underflow in f2s is at most + // unfl / safmn2 < sqrt(unfl*eps) .lt. 
eps + // if max(g2,one)=g2, then f2 < g2*safmin, + // and so cs < sqrt(safmin) + // if max(g2,one)=one, then f2 < safmin + // and so cs < sqrt(safmin)/safmn2 = sqrt(eps) + // therefore, cs = f2s/g2s / sqrt( 1 + (f2s/g2s)**2 ) = f2s/g2s + cs = f2s / g2s; + // make sure abs(ff) = 1 + // do complex/real division explicitly with 2 real divisions + if(abs1(f) > one) + { + d = dlapy2(dble(f), dimag(f)); + ff = dcmplx(dble(f) / d, dimag(f) / d); + } + else + { + dr = safmx2 * dble(f); + di = safmx2 * dimag(f); + d = dlapy2(dr, di); + ff = dcmplx(dr / d, di / d); + } + sn = ff * dcmplx(dble(gs) / g2s, -dimag(gs) / g2s); + r = cs * f + sn * g; + } + else + { + // + // this is the most common case. + // neither f2 nor f2/g2 are less than safmin + // f2s cannot overflow, and it is accurate + // + f2s = std::sqrt(one + g2 / f2); + // do the f2s(real)*fs(complex) multiply with two real multiplies + r = dcmplx(f2s * dble(fs), f2s * dimag(fs)); + cs = one / f2s; + d = f2 + g2; + // do complex/real division explicitly with two real divisions + sn = dcmplx(dble(r) / d, dimag(r) / d); + sn = sn * dconjg(gs); + if(count != 0) + { + if(count > 0) + { + for(i = 1; i <= count; i++) + { + r = r * safmx2; + }; + } + else + { + for(i = 1; i <= -count; i++) + { + r = r * safmn2; + } + } + } + } + } +} + +template +static void call_scal(I& n, S& a, T& x_in, I& incx) +{ + bool const is_zero = (a == 0); + T* const x = &x_in; + for(I i = 0; i < n; i++) + { + auto const ip = i * incx; + if(is_zero) + { + x[ip] = 0; + } + else + { + x[ip] *= a; + } + }; +} + +template +static void call_rot(I& n, T& x_in, I& incx, T& y_in, I& incy, S& c, S& s) +{ + T* const x = &(x_in); + T* const y = &(y_in); + + for(I i = 0; i < n; i++) + { + auto const ix = i * incx; + auto const iy = i * incy; + + auto const temp = c * x[ix] + s * y[iy]; + y[iy] = c * y[iy] - s * x[ix]; + x[ix] = temp; + } +} + +// -------------------------------------------------------- +// lasv2 computes the singular value decomposition of a 2 x 2 +// triangular matrix +// [ F G ] +// [ 0 H ] +// +// on return, +// abs(ssmax) is the larger singular value, +// abs(ssmin) is the smaller singular value, +// (csl,snl) and (csr,snr) are the left and right +// singular vectors for abs(ssmax) +// +// [ csl snl] [ F G ] [ csr -snr] = [ ssmax 0 ] +// [-snl csl] [ 0 H ] [ snr csr] [ 0 ssmin ] +// -------------------------------------------------------- +template +static void call_lasv2(T& f, T& g, T& h, T& ssmin, T& ssmax, T& snr, T& csr, T& snl, T& csl) +{ + T const zero = 0; + T const one = 1; + T const two = 2; + T const four = 4; + T const half = one / two; + + bool gasmal; + bool swap; + int pmax; + char cmach; + + T a, clt, crt, d, fa, ft, ga, gt, ha, ht, l, m; + T mm, r, s, slt, srt, t, temp, tsign, tt; + T macheps; + + auto sign = [](auto a, auto b) { + auto const abs_a = std::abs(a); + return ((b >= 0) ? 
abs_a : -abs_a); + }; + + ft = f; + fa = std::abs(ft); + ht = h; + ha = std::abs(h); + // + // pmax points to the maximum absolute element of matrix + // pmax = 1 if f largest in absolute values + // pmax = 2 if g largest in absolute values + // pmax = 3 if h largest in absolute values + // + pmax = 1; + swap = (ha > fa); + if(swap) + { + pmax = 3; + temp = ft; + ft = ht; + ht = temp; + temp = fa; + fa = ha; + ha = temp; + // + // now fa >= ha + // + } + gt = g; + ga = std::abs(gt); + if(ga == zero) + { + // + // diagonal matrix + // + ssmin = ha; + ssmax = fa; + clt = one; + crt = one; + slt = zero; + srt = zero; + } + else + { + gasmal = true; + if(ga > fa) + { + pmax = 2; + + cmach = 'E'; + call_lamch(cmach, macheps); + + if((fa / ga) < macheps) + { + // + // case of very large ga + // + gasmal = false; + ssmax = ga; + if(ha > one) + { + ssmin = fa / (ga / ha); + } + else + { + ssmin = (fa / ga) * ha; + } + clt = one; + slt = ht / gt; + srt = one; + crt = ft / gt; + } + } + if(gasmal) + { + // + // normal case + // + d = fa - ha; + if(d == fa) + { + // + // copes with infinite f or h + // + l = one; + } + else + { + l = d / fa; + } + // + // note that 0 <= l <= 1 + // + m = gt / ft; + // + // note that abs(m) <= 1/macheps + // + t = two - l; + // + // note that t >= 1 + // + mm = m * m; + tt = t * t; + s = std::sqrt(tt + mm); + // + // note that 1 <= s <= 1 + 1/macheps + // + if(l == zero) + { + r = std::abs(m); + } + else + { + r = std::sqrt(l * l + mm); + } + // + // note that 0 <= r .le. 1 + 1/macheps + // + a = half * (s + r); + // + // note that 1 <= a .le. 1 + abs(m) + // + ssmin = ha / a; + ssmax = fa * a; + if(mm == zero) + { + // + // note that m is very tiny + // + if(l == zero) + { + t = sign(two, ft) * sign(one, gt); + } + else + { + t = gt / sign(d, ft) + m / t; + } + } + else + { + t = (m / (s + t) + m / (r + l)) * (one + a); + } + l = std::sqrt(t * t + four); + crt = two / l; + srt = t / l; + clt = (crt + srt * m) / a; + slt = (ht / ft) * srt / a; + } + } + if(swap) + { + csl = srt; + snl = crt; + csr = slt; + snr = clt; + } + else + { + csl = clt; + snl = slt; + csr = crt; + snr = srt; + } + // + // correct signs of ssmax and ssmin + // + if(pmax == 1) + { + tsign = sign(one, csr) * sign(one, csl) * sign(one, f); + } + if(pmax == 2) + { + tsign = sign(one, snr) * sign(one, csl) * sign(one, g); + } + if(pmax == 3) + { + tsign = sign(one, snr) * sign(one, snl) * sign(one, h); + } + ssmax = sign(ssmax, tsign); + ssmin = sign(ssmin, tsign * sign(one, f) * sign(one, h)); +} + +ROCSOLVER_END_NAMESPACE diff --git a/library/src/include/lib_device_helpers.hpp b/library/src/include/lib_device_helpers.hpp index 74f82a15f..609097549 100644 --- a/library/src/include/lib_device_helpers.hpp +++ b/library/src/include/lib_device_helpers.hpp @@ -1237,4 +1237,53 @@ __device__ static void permute_swap(const I n, T* C, I ldc, I* map, const I nev #endif } +template +__global__ static void swap_kernel(I const n, T* const x, I const incx, T* const y, I const incy) +{ + if(n <= 0) + return; + + I const i_start = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + I const i_inc = hipBlockDim_x * hipGridDim_x; + + if((incx == 1) && (incy == 1)) + { + // ------------ + // special case + // ------------ + for(I i = i_start; i < n; i += i_inc) + { + auto const temp = y[i]; + y[i] = x[i]; + x[i] = temp; + } + } + else + { + // --------------------------- + // code for unequal increments + // --------------------------- + + for(I i = i_start; i < n; i += i_inc) + { + auto const ix = 0 + i * 
static_cast(incx); + auto const iy = 0 + i * static_cast(incy); + + auto const temp = y[iy]; + y[iy] = x[ix]; + x[ix] = temp; + } + } +} + +template +static void swap_template(I const n, T* x, I const incx, T* y, I const incy, hipStream_t stream) +{ + auto nthreads = warpSize * 2; + auto nblocks = (n - 1) / nthreads + 1; + + hipLaunchKernelGGL((swap_kernel), dim3(nblocks, 1, 1), dim3(nthreads, 1, 1), 0, stream, + n, x, incx, y, incy); +} + ROCSOLVER_END_NAMESPACE diff --git a/library/src/include/lib_host_helpers.hpp b/library/src/include/lib_host_helpers.hpp index bca4aae41..a11bf3bc2 100644 --- a/library/src/include/lib_host_helpers.hpp +++ b/library/src/include/lib_host_helpers.hpp @@ -92,6 +92,74 @@ I get_index(I* intervals, I max, I dim) return i; } +template +static void call_swap(I& n, T& x_in, I& incx, T& y_in, I& incy) +{ + T* const x = &(x_in); + T* const y = &(y_in); + for(I i = 0; i < n; i++) + { + I const ix = i * incx; + I const iy = i * incy; + + T const temp = x[ix]; + x[ix] = y[iy]; + y[iy] = temp; + } +} + +static float real_part(float z) +{ + return (z); +}; +static float real_part(std::complex z) +{ + return (z.real()); +}; +static float real_part(rocblas_complex_num z) +{ + return (z.real()); +}; + +static double real_part(double z) +{ + return (z); +}; +static double real_part(std::complex z) +{ + return (z.real()); +}; +static double real_part(rocblas_complex_num z) +{ + return (z.real()); +}; + +static float imag_part(float z) +{ + return (0); +}; +static float imag_part(std::complex z) +{ + return (z.imag()); +}; +static float imag_part(rocblas_complex_num z) +{ + return (z.imag()); +}; + +static double imag_part(double z) +{ + return (0); +}; +static double imag_part(std::complex z) +{ + return (z.imag()); +}; +static double imag_part(rocblas_complex_num z) +{ + return (z.imag()); +}; + #ifdef ROCSOLVER_VERIFY_ASSUMPTIONS // Ensure __assert_fail is declared. 
#if !__is_identifier(__assert_fail) @@ -153,4 +221,12 @@ extern "C" [[noreturn]] void __assert_fail(const char* assertion, #define ROCSOLVER_ASSUME_X(invariant, msg) __builtin_assume(invariant) #endif +#ifndef CHECK_HIP +#define CHECK_HIP(fcn) \ + { \ + hipError_t const istat = (fcn); \ + assert(istat == hipSuccess); \ + } +#endif + ROCSOLVER_END_NAMESPACE From ae7688dad135c45487a3f7dedc670406fa5ac257 Mon Sep 17 00:00:00 2001 From: Troy Alderson <58866654+tfalders@users.noreply.github.com> Date: Fri, 15 Nov 2024 10:56:10 -0700 Subject: [PATCH 32/35] Use ROCSOLVER_KERNEL keyword --- library/src/include/lapack_device_functions.hpp | 4 ++-- library/src/include/lib_device_helpers.hpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/library/src/include/lapack_device_functions.hpp b/library/src/include/lapack_device_functions.hpp index ca388ff11..693581293 100644 --- a/library/src/include/lapack_device_functions.hpp +++ b/library/src/include/lapack_device_functions.hpp @@ -1112,7 +1112,7 @@ ROCSOLVER_KERNEL void axpy_kernel(const rocblas_int n, } template -__global__ static void +ROCSOLVER_KERNEL void rot_kernel(I const n, T* const x, I const incx, T* const y, I const incy, S const c, S const s) { if(n <= 0) @@ -1162,7 +1162,7 @@ static void } template -__global__ static void scal_kernel(I const n, S const da, T* const x, I const incx) +ROCSOLVER_KERNEL void scal_kernel(I const n, S const da, T* const x, I const incx) { if(n <= 0) return; diff --git a/library/src/include/lib_device_helpers.hpp b/library/src/include/lib_device_helpers.hpp index 609097549..83622b8c6 100644 --- a/library/src/include/lib_device_helpers.hpp +++ b/library/src/include/lib_device_helpers.hpp @@ -1238,7 +1238,7 @@ __device__ static void permute_swap(const I n, T* C, I ldc, I* map, const I nev } template -__global__ static void swap_kernel(I const n, T* const x, I const incx, T* const y, I const incy) +ROCSOLVER_KERNEL void swap_kernel(I const n, T* const x, I const incx, T* const y, I const incy) { if(n <= 0) return; From 859bb2f7028660219b72d8630ca24c5fe837a62a Mon Sep 17 00:00:00 2001 From: jzuniga-amd Date: Fri, 15 Nov 2024 13:26:51 -0700 Subject: [PATCH 33/35] move kernel launchers --- .../auxiliary/rocauxiliary_bdsqr_hybrid.hpp | 37 +++++++++++++++++++ .../src/include/lapack_device_functions.hpp | 27 +++----------- library/src/include/lib_device_helpers.hpp | 13 ++----- 3 files changed, 46 insertions(+), 31 deletions(-) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp index 56870cb44..77c86542d 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp @@ -38,6 +38,43 @@ ROCSOLVER_BEGIN_NAMESPACE +/************************************************************************************/ +/***************** Kernel launchers *************************************************/ +/************************************************************************************/ + +template +static void swap_template(I const n, T* x, I const incx, T* y, I const incy, hipStream_t stream) +{ + auto nthreads = warpSize * 2; + auto nblocks = (n - 1) / nthreads + 1; + + hipLaunchKernelGGL((swap_kernel), dim3(nblocks, 1, 1), dim3(nthreads, 1, 1), 0, stream, + n, x, incx, y, incy); +} + +template +static void + rot_template(I const n, T* x, I const incx, T* y, I const incy, S const c, S const s, hipStream_t stream) +{ + auto nthreads = warpSize * 2; + auto nblocks = (n - 1) / nthreads + 1; + + 
hipLaunchKernelGGL((rot_kernel), dim3(nblocks, 1, 1), dim3(nthreads, 1, 1), 0, stream, + n, x, incx, y, incy, c, s); +} + +template +static void scal_template(I const n, S const da, T* const x, I const incx, hipStream_t stream) +{ + auto nthreads = warpSize * 2; + auto nblocks = (n - 1) / nthreads + 1; + + hipLaunchKernelGGL((scal_kernel), dim3(nblocks, 1, 1), dim3(nthreads, 1, 1), 0, stream, + n, da, x, incx); +} + +/** Call to lasr functionality. + lasr_body can be executed as a host or device function **/ template static void call_lasr(rocblas_side& side, rocblas_pivot& pivot, diff --git a/library/src/include/lapack_device_functions.hpp b/library/src/include/lapack_device_functions.hpp index 693581293..d10f68fa6 100644 --- a/library/src/include/lapack_device_functions.hpp +++ b/library/src/include/lapack_device_functions.hpp @@ -1111,6 +1111,9 @@ ROCSOLVER_KERNEL void axpy_kernel(const rocblas_int n, } } +/** ROT applies a Givens rotation between to vector x y of dimension n. + Launch this kernel with a desired number of threads organized in + NG groups in the x direction with NT threads in the x direction. **/ template ROCSOLVER_KERNEL void rot_kernel(I const n, T* const x, I const incx, T* const y, I const incy, S const c, S const s) @@ -1150,17 +1153,9 @@ ROCSOLVER_KERNEL void } } -template -static void - rot_template(I const n, T* x, I const incx, T* y, I const incy, S const c, S const s, hipStream_t stream) -{ - auto nthreads = warpSize * 2; - auto nblocks = (n - 1) / nthreads + 1; - - hipLaunchKernelGGL((rot_kernel), dim3(nblocks, 1, 1), dim3(nthreads, 1, 1), 0, stream, - n, x, incx, y, incy, c, s); -} - +/** SCAL scales a vector x of dimension n by a factor da. + Launch this kernel with a desired number of threads organized in + NG groups in the x direction with NT threads in the x direction. **/ template ROCSOLVER_KERNEL void scal_kernel(I const n, S const da, T* const x, I const incx) { @@ -1193,14 +1188,4 @@ ROCSOLVER_KERNEL void scal_kernel(I const n, S const da, T* const x, I const inc } } -template -static void scal_template(I const n, S const da, T* const x, I const incx, hipStream_t stream) -{ - auto nthreads = warpSize * 2; - auto nblocks = (n - 1) / nthreads + 1; - - hipLaunchKernelGGL((scal_kernel), dim3(nblocks, 1, 1), dim3(nthreads, 1, 1), 0, stream, - n, da, x, incx); -} - ROCSOLVER_END_NAMESPACE diff --git a/library/src/include/lib_device_helpers.hpp b/library/src/include/lib_device_helpers.hpp index 83622b8c6..dc011f03a 100644 --- a/library/src/include/lib_device_helpers.hpp +++ b/library/src/include/lib_device_helpers.hpp @@ -1237,6 +1237,9 @@ __device__ static void permute_swap(const I n, T* C, I ldc, I* map, const I nev #endif } +/** SWAP swaps the values of vectors x and y of dimension n. + Launch this kernel with a desired number of threads organized in + NG groups in the x direction with NT threads in the x direction. 
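+    For example, the swap_template launcher in rocauxiliary_bdsqr_hybrid.hpp uses
+    NT = 2 * warpSize and NG = (n - 1) / NT + 1.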
**/ template ROCSOLVER_KERNEL void swap_kernel(I const n, T* const x, I const incx, T* const y, I const incy) { @@ -1276,14 +1279,4 @@ ROCSOLVER_KERNEL void swap_kernel(I const n, T* const x, I const incx, T* const } } -template -static void swap_template(I const n, T* x, I const incx, T* y, I const incy, hipStream_t stream) -{ - auto nthreads = warpSize * 2; - auto nblocks = (n - 1) / nthreads + 1; - - hipLaunchKernelGGL((swap_kernel), dim3(nblocks, 1, 1), dim3(nthreads, 1, 1), 0, stream, - n, x, incx, y, incy); -} - ROCSOLVER_END_NAMESPACE From 30696b173bcd1f2dadb9c54a83a3492723e7b769 Mon Sep 17 00:00:00 2001 From: Troy Alderson <58866654+tfalders@users.noreply.github.com> Date: Mon, 18 Nov 2024 16:02:49 -0700 Subject: [PATCH 34/35] Addressed review comments --- .../include/rocsolver/rocsolver-functions.h | 16 +++---- .../src/include/lapack_device_functions.hpp | 10 ++-- library/src/include/lapack_host_functions.hpp | 46 ++++++++++--------- library/src/include/lib_host_helpers.hpp | 28 +++++------ 4 files changed, 52 insertions(+), 48 deletions(-) diff --git a/library/include/rocsolver/rocsolver-functions.h b/library/include/rocsolver/rocsolver-functions.h index 3f340ea70..bd9a0592d 100644 --- a/library/include/rocsolver/rocsolver-functions.h +++ b/library/include/rocsolver/rocsolver-functions.h @@ -3871,8 +3871,8 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_zunmtr(rocblas_handle handle, rocblas_handle. \note - A hybrid (CPU+GPU) approach is available for BDSQR. Use \ref rocsolver_set_alg_mode - to enable it. + A hybrid (CPU+GPU) approach is available for BDSQR, primarily intended for homogeneous architectures. + Use \ref rocsolver_set_alg_mode to enable it. @param[in] handle rocblas_handle. @@ -12613,8 +12613,8 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_zpotri_strided_batched(rocblas_handle within the rocblas_handle. \note - A hybrid (CPU+GPU) approach is available for GESVD. Use \ref rocsolver_set_alg_mode - to enable it. + A hybrid (CPU+GPU) approach is available for GESVD, primarily intended for homogeneous architectures. + Use \ref rocsolver_set_alg_mode to enable it. @param[in] handle rocblas_handle. @@ -12794,8 +12794,8 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_zgesvd(rocblas_handle handle, within the rocblas_handle. \note - A hybrid (CPU+GPU) approach is available for GESVD_BATCHED. Use \ref rocsolver_set_alg_mode - to enable it. + A hybrid (CPU+GPU) approach is available for GESVD_BATCHED, primarily intended for + homogeneous architectures. Use \ref rocsolver_set_alg_mode to enable it. @param[in] handle rocblas_handle. @@ -13016,8 +13016,8 @@ ROCSOLVER_EXPORT rocblas_status rocsolver_zgesvd_batched(rocblas_handle handle, within the rocblas_handle. \note - A hybrid (CPU+GPU) approach is available for GESVD_STRIDED_BATCHED. Use \ref rocsolver_set_alg_mode - to enable it. + A hybrid (CPU+GPU) approach is available for GESVD_STRIDED_BATCHED, primarily intended + for homogeneous architectures. Use \ref rocsolver_set_alg_mode to enable it. @param[in] handle rocblas_handle. 
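The hybrid-mode notes above all defer to rocsolver_set_alg_mode. A minimal sketch of enabling the hybrid BDSQR path on a handle follows; only the function name is confirmed by the documentation above, while the enum spellings rocsolver_function_bdsqr and rocsolver_alg_mode_hybrid are assumptions, so check the rocSOLVER headers for the exact identifiers.

    #include <rocsolver/rocsolver.h>

    // Illustrative sketch: request the CPU+GPU (hybrid) BDSQR algorithm on this handle
    // before calling rocsolver_<type>bdsqr. The enum names below are assumed, not confirmed.
    rocblas_status enable_hybrid_bdsqr(rocblas_handle handle)
    {
        return rocsolver_set_alg_mode(handle, rocsolver_function_bdsqr, rocsolver_alg_mode_hybrid);
    }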
diff --git a/library/src/include/lapack_device_functions.hpp b/library/src/include/lapack_device_functions.hpp index d10f68fa6..6e4760c44 100644 --- a/library/src/include/lapack_device_functions.hpp +++ b/library/src/include/lapack_device_functions.hpp @@ -1144,8 +1144,8 @@ ROCSOLVER_KERNEL void for(auto i = i_start; i < n; i += i_inc) { - auto const ix = 0 + i * static_cast(incx); - auto const iy = 0 + i * static_cast(incy); + auto const ix = i * static_cast(incx); + auto const iy = i * static_cast(incy); auto const temp = c * x[ix] + s * y[iy]; y[iy] = c * y[iy] - s * x[ix]; x[ix] = temp; @@ -1171,7 +1171,7 @@ ROCSOLVER_KERNEL void scal_kernel(I const n, S const da, T* const x, I const inc { for(I i = i_start; i < n; i += i_inc) { - x[i] = (is_da_zero) ? zero : da * x[i]; + x[i] = da * x[i]; } } else @@ -1182,8 +1182,8 @@ ROCSOLVER_KERNEL void scal_kernel(I const n, S const da, T* const x, I const inc for(I i = i_start; i < n; i += i_inc) { - auto const ix = 0 + i * static_cast(incx); - x[ix] = (is_da_zero) ? zero : da * x[ix]; + auto const ix = i * static_cast(incx); + x[ix] = da * x[ix]; } } } diff --git a/library/src/include/lapack_host_functions.hpp b/library/src/include/lapack_host_functions.hpp index cb10a233e..c120151ce 100644 --- a/library/src/include/lapack_host_functions.hpp +++ b/library/src/include/lapack_host_functions.hpp @@ -40,20 +40,32 @@ ROCSOLVER_BEGIN_NAMESPACE * =========================================================================== */ -static void call_lamch(char& cmach, double& eps) +static void call_lamch(char& cmach, float& eps) { - eps = ((cmach == 'E') || (cmach == 'e')) ? std::numeric_limits::epsilon() - : ((cmach == 'S') || (cmach == 's')) ? std::numeric_limits::min() - : ((cmach == 'B') || (cmach == 'b')) ? FLT_RADIX - : std::numeric_limits::min(); + switch(cmach) + { + case 'E': + case 'e': eps = std::numeric_limits::epsilon(); return; + case 'S': + case 's': eps = std::numeric_limits::min(); return; + case 'B': + case 'b': eps = FLT_RADIX; return; + default: eps = std::numeric_limits::min(); + } } -static void call_lamch(char& cmach, float& eps) +static void call_lamch(char& cmach, double& eps) { - eps = ((cmach == 'E') || (cmach == 'e')) ? std::numeric_limits::epsilon() - : ((cmach == 'S') || (cmach == 's')) ? std::numeric_limits::min() - : ((cmach == 'B') || (cmach == 'b')) ? 
FLT_RADIX - : std::numeric_limits::min(); + switch(cmach) + { + case 'E': + case 'e': eps = std::numeric_limits::epsilon(); return; + case 'S': + case 's': eps = std::numeric_limits::min(); return; + case 'B': + case 'b': eps = FLT_RADIX; return; + default: eps = std::numeric_limits::min(); + } } template @@ -141,8 +153,7 @@ static void call_lartg(T& f, T& g, S& cs, T& sn, T& r) auto dimag = [](auto z) { return (static_cast(imag_part(z))); }; auto disnan = [](auto x) -> bool { return (isnan(x)); }; auto dcmplx = [](auto x, auto y) -> T { - bool constexpr is_complex_type - = !(std::is_same::value || std::is_same::value); + bool constexpr is_complex_type = rocblas_is_complex; if constexpr(is_complex_type) { @@ -348,15 +359,8 @@ static void call_scal(I& n, S& a, T& x_in, I& incx) for(I i = 0; i < n; i++) { auto const ip = i * incx; - if(is_zero) - { - x[ip] = 0; - } - else - { - x[ip] *= a; - } - }; + x[ip] *= a; + } } template diff --git a/library/src/include/lib_host_helpers.hpp b/library/src/include/lib_host_helpers.hpp index a11bf3bc2..8dfad5014 100644 --- a/library/src/include/lib_host_helpers.hpp +++ b/library/src/include/lib_host_helpers.hpp @@ -99,8 +99,8 @@ static void call_swap(I& n, T& x_in, I& incx, T& y_in, I& incy) T* const y = &(y_in); for(I i = 0; i < n; i++) { - I const ix = i * incx; - I const iy = i * incy; + auto const ix = i * static_cast(incx); + auto const iy = i * static_cast(incy); T const temp = x[ix]; x[ix] = y[iy]; @@ -111,54 +111,54 @@ static void call_swap(I& n, T& x_in, I& incx, T& y_in, I& incy) static float real_part(float z) { return (z); -}; +} static float real_part(std::complex z) { return (z.real()); -}; +} static float real_part(rocblas_complex_num z) { return (z.real()); -}; +} static double real_part(double z) { return (z); -}; +} static double real_part(std::complex z) { return (z.real()); -}; +} static double real_part(rocblas_complex_num z) { return (z.real()); -}; +} static float imag_part(float z) { return (0); -}; +} static float imag_part(std::complex z) { return (z.imag()); -}; +} static float imag_part(rocblas_complex_num z) { return (z.imag()); -}; +} static double imag_part(double z) { return (0); -}; +} static double imag_part(std::complex z) { return (z.imag()); -}; +} static double imag_part(rocblas_complex_num z) { return (z.imag()); -}; +} #ifdef ROCSOLVER_VERIFY_ASSUMPTIONS // Ensure __assert_fail is declared. 
From a08ca4f5c6924c6af263b92312478ae521c74b83 Mon Sep 17 00:00:00 2001 From: Troy Alderson <58866654+tfalders@users.noreply.github.com> Date: Mon, 18 Nov 2024 16:12:18 -0700 Subject: [PATCH 35/35] Addressed review comment --- library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp b/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp index 77c86542d..69584cc01 100644 --- a/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp +++ b/library/src/auxiliary/rocauxiliary_bdsqr_hybrid.hpp @@ -159,12 +159,6 @@ static void bdsqr_single_template(rocblas_handle handle, CHECK_HIP(hipStreamSynchronize(stream)); }; - auto indx2f = [](auto i, auto j, auto ld) -> int64_t { - assert((1 <= i) && (i <= ld)); - assert((1 <= j)); - return ((i - 1) + (j - 1) * int64_t(ld)); - }; - auto d = [=](auto i) -> S& { assert((1 <= i) && (i <= n)); return (d_[i - 1]); @@ -182,19 +176,19 @@ static void bdsqr_single_template(rocblas_handle handle, auto c = [=](auto i, auto j) -> T& { assert((1 <= i) && (i <= nrc) && (nrc <= ldc)); assert((1 <= j) && (j <= ncc)); - return (c_[indx2f(i, j, ldc)]); + return (c_[idx2D(i - 1, j - 1, ldc)]); }; auto u = [=](auto i, auto j) -> T& { assert((1 <= i) && (i <= nru) && (nru <= ldu)); assert((1 <= j) && (j <= ncu)); - return (u_[indx2f(i, j, ldu)]); + return (u_[idx2D(i - 1, j - 1, ldu)]); }; auto vt = [=](auto i, auto j) -> T& { assert((1 <= i) && (i <= nrvt) && (nrvt <= ldvt)); assert((1 <= j) && (j <= ncvt)); - return (vt_[indx2f(i, j, ldvt)]); + return (vt_[idx2D(i - 1, j - 1, ldvt)]); }; auto sign = [](auto a, auto b) {