Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 11 additions & 3 deletions source/source_base/module_container/ATen/kernels/cuda/lapack.cu
Original file line number Diff line number Diff line change
Expand Up @@ -325,14 +325,22 @@ struct lapack_hegvx<T, DEVICE_GPU> {
const char uplo = 'U';
int meig = 0;

// this hegvdx will protect the input A, B from being overwritten
// and write the eigenvectors into eigen_vec.
// cuSOLVER hegvdx overwrites A and B on exit, so copy A to
// eigen_vec and backup B to protect the origin matrices.
CHECK_CUDA(cudaMemcpy(eigen_vec, A, sizeof(T) * n * lda, cudaMemcpyDeviceToDevice));

T* d_B_backup = nullptr;
CHECK_CUDA(cudaMalloc(&d_B_backup, sizeof(T) * n * lda));
CHECK_CUDA(cudaMemcpy(d_B_backup, B, sizeof(T) * n * lda, cudaMemcpyDeviceToDevice));

cuSolverConnector::hegvdx(cusolver_handle,
itype, jobz, range, uplo,
n, lda, A, B,
n, lda, eigen_vec, d_B_backup,
Real(0), Real(0),
1, m, &meig,
eigen_val, eigen_vec);

CHECK_CUDA(cudaFree(d_B_backup));
}
};

Expand Down
13 changes: 10 additions & 3 deletions source/source_base/module_container/ATen/kernels/lapack.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,7 @@ struct lapack_hegvx<T, DEVICE_CPU> {
const int itype = 1; // ITYPE = 1: A*x = (lambda)*B*x
const char jobz = 'V';// JOBZ = 'V': Compute eigenvalues and eigenvectors.
const char range = 'I'; // RANGE = 'I': the IL-th through IU-th eigenvalues will be found.
const char uplo = 'L'; // UPLO = 'L': Lower triangles of A and B are stored.
const char uplo = 'U'; // UPLO = 'U': Upper triangles of A and B are stored.

const int il = 1;
const int iu = m;
Expand All @@ -394,6 +394,13 @@ struct lapack_hegvx<T, DEVICE_CPU> {
T work_query;
Real rwork_query;

// dummy arrays for workspace query (some LAPACK implementations
// require valid pointers even during query)
const int liwork_query = 5 * n;
const int lrwork_query = 7 * n;
std::vector<int> iwork_query(liwork_query);
std::vector<int> ifail_query(n);

// set lwork = -1 to query optimal work size
lapackConnector::hegvx(
itype, jobz, range, uplo,
Expand All @@ -409,8 +416,8 @@ struct lapack_hegvx<T, DEVICE_CPU> {
&work_query, // WORK (query)
lwork,
&rwork_query, // RWORK (query)
static_cast<int*>(nullptr), // IWORK (query)
static_cast<int*>(nullptr), // IFAIL (query)
iwork_query.data(), // IWORK (query)
ifail_query.data(), // IFAIL (query)
info);

// !> If LWORK = -1, then a workspace query is assumed; the routine
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,34 @@ struct lapack_hegvd<T, DEVICE_GPU> {
}
};

template <typename T>
struct lapack_hegvx<T, DEVICE_GPU> {
using Real = typename GetTypeReal<T>::type;
void operator()(
const int n,
const int lda,
T* A,
T* B,
const int m,
Real* eigen_val,
T* eigen_vec)
{
// Fallback to CPU: copy matrices to host, call CPU lapack_hegvx, copy results back
std::vector<T> H_A(n * lda);
std::vector<T> H_B(n * lda);
std::vector<Real> H_eigen_val(n);
std::vector<T> H_eigen_vec(n * lda);

hipErrcheck(hipMemcpy(H_A.data(), A, sizeof(T) * n * lda, hipMemcpyDeviceToHost));
hipErrcheck(hipMemcpy(H_B.data(), B, sizeof(T) * n * lda, hipMemcpyDeviceToHost));

lapack_hegvx<T, DEVICE_CPU>()(n, lda, H_A.data(), H_B.data(), m, H_eigen_val.data(), H_eigen_vec.data());

hipErrcheck(hipMemcpy(eigen_val, H_eigen_val.data(), sizeof(Real) * n, hipMemcpyHostToDevice));
hipErrcheck(hipMemcpy(eigen_vec, H_eigen_vec.data(), sizeof(T) * n * lda, hipMemcpyHostToDevice));
Comment on lines +151 to +160
Copy link

Copilot AI Apr 30, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this ROCm GPU specialization, the host buffers and memcpy sizes for outputs should match what lapack_hegvx actually produces: eigen_val is length m (see lapack_hegvx<T, DEVICE_CPU> usage in lapack_test.cpp), and eigen_vec is lda * m (Z is n-by-m). Copying n eigenvalues and n*lda eigenvector elements back can write past the caller’s allocated buffers when they size them to m (e.g., the hegvx unit test allocates E with neig entries). Please resize H_eigen_val to m and copy back only m values; likewise allocate/copy H_eigen_vec as lda * m (or at least only memcpy that many elements).

Suggested change
std::vector<Real> H_eigen_val(n);
std::vector<T> H_eigen_vec(n * lda);
hipErrcheck(hipMemcpy(H_A.data(), A, sizeof(T) * n * lda, hipMemcpyDeviceToHost));
hipErrcheck(hipMemcpy(H_B.data(), B, sizeof(T) * n * lda, hipMemcpyDeviceToHost));
lapack_hegvx<T, DEVICE_CPU>()(n, lda, H_A.data(), H_B.data(), m, H_eigen_val.data(), H_eigen_vec.data());
hipErrcheck(hipMemcpy(eigen_val, H_eigen_val.data(), sizeof(Real) * n, hipMemcpyHostToDevice));
hipErrcheck(hipMemcpy(eigen_vec, H_eigen_vec.data(), sizeof(T) * n * lda, hipMemcpyHostToDevice));
std::vector<Real> H_eigen_val(m);
std::vector<T> H_eigen_vec(lda * m);
hipErrcheck(hipMemcpy(H_A.data(), A, sizeof(T) * n * lda, hipMemcpyDeviceToHost));
hipErrcheck(hipMemcpy(H_B.data(), B, sizeof(T) * n * lda, hipMemcpyDeviceToHost));
lapack_hegvx<T, DEVICE_CPU>()(n, lda, H_A.data(), H_B.data(), m, H_eigen_val.data(), H_eigen_vec.data());
hipErrcheck(hipMemcpy(eigen_val, H_eigen_val.data(), sizeof(Real) * m, hipMemcpyHostToDevice));
hipErrcheck(hipMemcpy(eigen_vec, H_eigen_vec.data(), sizeof(T) * lda * m, hipMemcpyHostToDevice));

Copilot uses AI. Check for mistakes.
}
};

template struct set_matrix<float, DEVICE_GPU>;
template struct set_matrix<double, DEVICE_GPU>;
template struct set_matrix<std::complex<float>, DEVICE_GPU>;
Expand All @@ -158,5 +186,10 @@ template struct lapack_hegvd<double, DEVICE_GPU>;
template struct lapack_hegvd<std::complex<float>, DEVICE_GPU>;
template struct lapack_hegvd<std::complex<double>, DEVICE_GPU>;

template struct lapack_hegvx<float, DEVICE_GPU>;
template struct lapack_hegvx<double, DEVICE_GPU>;
template struct lapack_hegvx<std::complex<float>, DEVICE_GPU>;
template struct lapack_hegvx<std::complex<double>, DEVICE_GPU>;

} // namespace kernels
} // namespace container
52 changes: 8 additions & 44 deletions source/source_hsolver/diago_dav_subspace.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -673,52 +673,17 @@ void Diago_DavSubspace<T, Device>::diag_zhegvx(const int& nbase,
}
#endif
}
else
else if (this->diag_subspace == 0)
{
if (this->diag_subspace == 0)
if (this->diag_comm.rank == 0)
{
if (this->diag_comm.rank == 0)
{
std::vector<std::vector<T>> h_diag(nbase, std::vector<T>(nbase, *this->zero));
std::vector<std::vector<T>> s_diag(nbase, std::vector<T>(nbase, *this->zero));

for (size_t i = 0; i < nbase; i++)
{
for (size_t j = 0; j < nbase; j++)
{
h_diag[i][j] = hcc[i * this->nbase_x + j];
s_diag[i][j] = scc[i * this->nbase_x + j];
}
}
hegvx_op<T, Device>()(this->ctx,
nbase,
this->nbase_x,
this->hcc,
this->scc,
nband,
(*eigenvalue_iter).data(),
this->vcc);
// reset:
for (size_t i = 0; i < nbase; i++)
{
for (size_t j = 0; j < nbase; j++)
{
hcc[i * this->nbase_x + j] = h_diag[i][j];
scc[i * this->nbase_x + j] = s_diag[i][j];
}

for (size_t j = nbase; j < this->nbase_x; j++)
{
hcc[i * this->nbase_x + j] = *this->zero;
hcc[j * this->nbase_x + i] = *this->zero;
scc[i * this->nbase_x + j] = *this->zero;
scc[j * this->nbase_x + i] = *this->zero;
}
}
}
ct::kernels::lapack_hegvx<T, ct_Device>()(
nbase, this->nbase_x, this->hcc, this->scc, nband,
(*eigenvalue_iter).data(), this->vcc);
}
else
{
}
else
{
#ifdef __MPI
std::vector<T> h_diag;
std::vector<T> s_diag;
Expand Down Expand Up @@ -760,7 +725,6 @@ void Diago_DavSubspace<T, Device>::diag_zhegvx(const int& nbase,
std::cout << "Error: parallel diagonalization is not supported in serial mode." << std::endl;
exit(1);
#endif
}
}

#ifdef __MPI
Expand Down
Loading