Skip to content

Commit

Permalink
Add dsterf calling, minor fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
mdvizov committed Sep 1, 2022
1 parent def0e2a commit 20c9487
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 8 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
option(ROCSOLVER_EMBED_FMT "Hide libfmt symbols" OFF)
option(OPTIMAL "Build specialized kernels for small matrix sizes" ON)
option(HYBRID_CPU "Build hybrid schema with CPU using" ON)
# Opt-in only: turning this on makes the hybrid path call into an external
# LAPACK library, which complicates packaging and distribution of the binary.
# Keep it OFF by default until that is sorted out.
option(LAPACK_FUNCTIONS "Build hybrid with lapack routine functions" OFF)

This comment has been minimized.

Copy link
@cgmb

cgmb Sep 2, 2022

Collaborator

This has some complications when we package and distribute the rocSOLVER binary, so please default to OFF. It's a great option to have, but there's a bunch of work to do before it can be enabled by default.

# Disabled by default; described only as "experimental parallelization".
option(EXPERIMENTAL "Experimental parallelization" OFF)
# Enabled by default; per its description, skips the module-mode search when
# locating LAPACK.
option(ROCSOLVER_FIND_PACKAGE_LAPACK_CONFIG "Skip module mode search for LAPACK" ON)

Expand Down
26 changes: 26 additions & 0 deletions library/src/auxiliary/rocauxiliary_sterf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,32 @@

#include "rocauxiliary_sterf.hpp"

#ifdef LAPACK_FUNCTIONS

#ifdef __cplusplus
extern "C" {
#endif
void dsterf(int* n, double* D, double* E, int* info);

This comment has been minimized.

Copy link
@EdDAzevedo

EdDAzevedo Sep 2, 2022

Contributor

Most Fortran compilers (such as gfortran) generate the external symbol "dsterf_" (with a trailing underscore), while a few (such as IBM XL) generate just "dsterf" (without the underscore). There is also the C-callable LAPACKE library. Perhaps the code could use "dsterf_" as the external symbol for LAPACK? Just a thought.

This comment has been minimized.

Copy link
@cgmb

cgmb Sep 2, 2022

Collaborator

It may also be "DSTERF" on Windows if using the Intel Fortran compiler. However, I think for the moment we can just start by supporting the gfortran convention (with the trailing underscore, as you mentioned). We should take care not to let this useful change get bogged down by too much scope creep, especially given that it's being submitted by a first-time contributor.

Aside from missing the trailing underscore, @mdvizov has copied the conventions we're currently using in rocBLAS and rocSOLVER. Those conventions are not ideal, but I think that following them should be sufficient for this PR.

void ssterf(int* n, float* D, float* E, int* info);
#ifdef __cplusplus
}
#endif

// Fortran LAPACK entry point, declared with the gfortran symbol convention
// (lower-case name plus a trailing underscore). Every argument is passed by
// address, as Fortran expects.
extern "C" void dsterf_(int* n, double* D, double* E, int* info);

/** Double-precision specialization: forwards to LAPACK DSTERF on the host.

    n     order of the tridiagonal matrix.
    D     host array holding the diagonal; handed to DSTERF unchanged.
    E     host array holding the off-diagonal; handed to DSTERF unchanged.
    info  receives the status code written by DSTERF.

    NOTE(review): assumes rocblas_int is a 32-bit int matching the LAPACK
    integer width (LP64 build) -- confirm before enabling with ILP64 LAPACK. **/
template <>
void lapack_sterf<double>(rocblas_int n, double* D, double* E, int &info)
{
    // n is a by-value copy, so taking its address for the Fortran call is safe.
    dsterf_(&n, D, E, &info);
}

// Fortran LAPACK entry point, declared with the gfortran symbol convention
// (lower-case name plus a trailing underscore). Every argument is passed by
// address, as Fortran expects.
extern "C" void ssterf_(int* n, float* D, float* E, int* info);

/** Single-precision specialization: forwards to LAPACK SSTERF on the host.

    n     order of the tridiagonal matrix.
    D     host array holding the diagonal; handed to SSTERF unchanged.
    E     host array holding the off-diagonal; handed to SSTERF unchanged.
    info  receives the status code written by SSTERF.

    NOTE(review): assumes rocblas_int is a 32-bit int matching the LAPACK
    integer width (LP64 build) -- confirm before enabling with ILP64 LAPACK. **/
template <>
void lapack_sterf<float>(rocblas_int n, float* D, float* E, int &info)
{
    // n is a by-value copy, so taking its address for the Fortran call is safe.
    ssterf_(&n, D, E, &info);
}

#endif


template <typename T>
rocblas_status
rocsolver_sterf_impl(rocblas_handle handle, const rocblas_int n, T* D, T* E, rocblas_int* info)
Expand Down
26 changes: 18 additions & 8 deletions library/src/auxiliary/rocauxiliary_sterf.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,12 @@
BATCH).)
***************************************************************************/

#ifdef LAPACK_FUNCTIONS
/** LAPACK_STERF calls the host LAPACK xSTERF routine directly.
    Only available when LAPACK_FUNCTIONS is defined. This is a declaration
    only; the float and double specializations are defined in
    rocauxiliary_sterf.cpp and forward n, D, E and info to the matching
    LAPACK routine. info receives the status reported by that routine. **/
template <typename T>
void lapack_sterf(rocblas_int n, T* D, T* E, int &info);
#endif

/** STERF_SQ_E squares the elements of E **/
template <typename T>
__device__ void sterf_sq_e(const rocblas_int start, const rocblas_int end, T* E)
Expand Down Expand Up @@ -684,13 +690,13 @@ ROCSOLVER_KERNEL void sterf_parallelize(T* D,
{
rocblas_int m = 0;
rocblas_int count = 0, l = -1, lend = -1;
rocblas_int l0, lend0;
rocblas_int l_orig, lend_orig;
T p, anorm;

const rocblas_int tid = hipThreadIdx_x;

l0 = l = split_ranges[2 * tid];
lend0 = lend = split_ranges[2 * tid + 1];
l_orig = l = split_ranges[2 * tid];
lend_orig = lend = split_ranges[2 * tid + 1];

if(l == -1 || lend == -1)
return;
Expand All @@ -709,7 +715,7 @@ ROCSOLVER_KERNEL void sterf_parallelize(T* D,
if(abs(D[lend]) < abs(D[l]))
{
lend = l;
l = lend0;
l = lend_orig;
}

rocblas_int iters = 0;
Expand Down Expand Up @@ -875,11 +881,11 @@ ROCSOLVER_KERNEL void sterf_parallelize(T* D,
}

if(anorm > ssfmax)
scale_tridiag(l, lend, D, E, ssfmax / anorm);
scale_tridiag(l_orig, lend_orig, D, E, ssfmax / anorm);
if(anorm < ssfmin)
scale_tridiag(l, lend, D, E, ssfmin / anorm);
scale_tridiag(l_orig, lend_orig, D, E, ssfmin / anorm);

for(int i = l; i <= lend; i++)
for(int i = l_orig; i <= lend_orig; i++)
if(E[i] != 0)
info[0]++;
}
Expand Down Expand Up @@ -1010,7 +1016,7 @@ rocblas_status rocsolver_sterf_template(rocblas_handle handle,
T* h_E = new T[n];
rocblas_int h_info = 0;

hipDeviceSynchronize();
hipStreamSynchronize(stream);

T* shD = D + i * strideD + shiftD;
T* shE = E + i * strideE + shiftE;
Expand All @@ -1019,7 +1025,11 @@ rocblas_status rocsolver_sterf_template(rocblas_handle handle,
hipMemcpy(h_D, shD, sizeof(T) * n, hipMemcpyDeviceToHost);
hipMemcpy(h_E, shE, sizeof(T) * n, hipMemcpyDeviceToHost);

#ifdef LAPACK_FUNCTIONS
lapack_sterf(n, h_D, h_E, h_info);
#else
sterf_cpu<T>(n, h_D, h_E, h_info, 30 * n, eps, ssfmin, ssfmax);
#endif

hipMemcpy(shD, h_D, sizeof(T) * n, hipMemcpyHostToDevice);
hipMemcpy(shE, h_E, sizeof(T) * n, hipMemcpyHostToDevice);
Expand Down

0 comments on commit 20c9487

Please sign in to comment.