Loading src/libnptm/magma_calls.cpp +143 −9 Original line number Diff line number Diff line Loading @@ -21,7 +21,8 @@ #ifdef USE_MAGMA #include <limits> //#include <limits> #include <cstdio> #include <string> #ifndef INCLUDE_TYPES_H_ Loading Loading @@ -60,8 +61,9 @@ using namespace std; void magma_zinvert(dcomplex **mat, np_int n, int &jer, int device_id, const RuntimeSettings& rs) { dcomplex *vec_m = mat[0]; // Vector form of the input matrix. magmaDoubleComplex *a = (magmaDoubleComplex *)mat[0]; // pointer to first element on host string message; char buffer[128]; magma_int_t err = MAGMA_SUCCESS; magma_queue_t queue = NULL; magma_device_t dev = (magma_device_t)device_id; Loading @@ -69,12 +71,12 @@ void magma_zinvert(dcomplex **mat, np_int n, int &jer, int device_id, const Runt magma_int_t info; magma_int_t m = (magma_int_t)n; // changed rows; a - mxm matrix magma_int_t mm = m * m; // size of a magmaDoubleComplex *a = (magmaDoubleComplex *)vec_m; // pointer to first element on host const magmaDoubleComplex magma_zero = MAGMA_Z_MAKE(0.0, 0.0); const magmaDoubleComplex magma_one = MAGMA_Z_MAKE(1.0, 0.0); const magmaDoubleComplex magma_mone = MAGMA_Z_MAKE(-1.0, 0.0); const magmaDoubleComplex magma_two = MAGMA_Z_MAKE(2.0, 0.0); if (rs.invert_mode == RuntimeSettings::INV_MODE_LU) { // >>> LU INVERSION <<< magmaDoubleComplex *d_a; // pointer to first element on device magmaDoubleComplex *dwork; // work space pointer on device magma_int_t ldwork = m * magma_get_zgetri_nb(m); // optimal block size Loading @@ -95,18 +97,48 @@ void magma_zinvert(dcomplex **mat, np_int n, int &jer, int device_id, const Runt magma_zgetrf_gpu(m, m, d_a, m, piv, &info); magma_zgetri_gpu(m, d_a, m, piv, dwork, ldwork, &info); delete[] piv; // delete piv created by magma_zgetrf() magma_free(dwork); if (rs.use_refinement) { err = magma_newton(rs, a, m, d_a, queue); } magma_zgetmatrix(m, m, d_a , m, a, m, queue); // copy d_a -> a magma_free(d_a); magma_free(dwork); // >>> END OF LU INVERSION <<< } else if (rs.invert_mode == RuntimeSettings::INV_MODE_GESV) { // >>> GESV INVERSION <<< magmaDoubleComplex *h_id = new magmaDoubleComplex[mm](); magmaDoubleComplex *d_a, *d_id; magma_int_t *piv = new magma_int_t[m]; for (magma_int_t i = 0; i < m; i++) { h_id[i * (m + 1)] = magma_one; } err = magma_zmalloc(&d_id, mm); if (err != MAGMA_SUCCESS) { message = "ERROR: could not allocate d_id!\n"; rs.logger->err(message); exit(1); } magma_zsetmatrix(m, m, h_id, m, d_id , m, queue); // copy identity matrix to device delete[] h_id; // allocate matrix on device err = magma_zmalloc(&d_a, mm); // device memory for a if (err != MAGMA_SUCCESS) { message = "ERROR: could not allocate d_a!\n"; rs.logger->err(message); exit(1); } magma_zsetmatrix(m, m, a, m, d_a , m, queue); // copy a -> d_a magma_zgesv_gpu(m, m, d_a, m, piv, d_id, m, &info); delete[] piv; // free host memory magma_free(d_a); if (rs.use_refinement) { // refinement block printf("DEBUG: would refine GESV inversion.\n"); err = magma_newton(rs, a, m, d_id, queue); } magma_zgetmatrix(m, m, d_id , m, a, m, queue); // copy d_id -> a magma_free(d_id); // >>> END OF GESV INVERSION <<< } else if (rs.invert_mode == RuntimeSettings::INV_MODE_RBT) { // >>> RBT INVERSION <<< magmaDoubleComplex *d_a; // pointer to first element on device magma_bool_t iterate = (magma_bool_t)rs.use_refinement; magma_int_t *piv = new magma_int_t[m]; // host mem. Loading @@ -126,7 +158,106 @@ void magma_zinvert(dcomplex **mat, np_int n, int &jer, int device_id, const Runt delete[] h_id; delete[] piv; magma_free(d_a); // >>> END OF RBT INVERSION <<< } else if (rs.invert_mode == RuntimeSettings::INV_MODE_SVD) { // >>> SVD INVERSION <<< // Step 1: compute singular value decomposition magma_int_t lwork; magmaDoubleComplex *h_work = new magmaDoubleComplex[m]; magma_zgesdd( MagmaAllVec, m, m, NULL, m, NULL, NULL, m, NULL, m, h_work, -1, NULL, NULL, &info ); if (info == 0) { lwork = (magma_int_t)(MAGMA_Z_REAL(h_work[0])); message = "DEBUG: lwork = " + to_string(lwork) + "\n"; rs.logger->log(message, LOG_DEBG); } else { message = "ERROR: magma_zgesdd failed on resource querying call!\n"; rs.logger->err(message); exit(1); } delete[] h_work; h_work = new magmaDoubleComplex[lwork]; magmaDoubleComplex *h_a, *h_U, *h_Vt; h_a = new magmaDoubleComplex[mm]; memcpy(h_a, a, mm * sizeof(magmaDoubleComplex)); h_U = new magmaDoubleComplex[mm]; h_Vt = new magmaDoubleComplex[mm]; double *h_s = new double[m]; double *h_rwork = new double[5 * mm + 7 * m]; magma_int_t *h_iwork = new magma_int_t[8 * m]; magma_zgesdd( MagmaAllVec, m, m, h_a, m, h_s, h_U, m, h_Vt, m, h_work, lwork, h_rwork, h_iwork, &info ); delete[] h_work; delete[] h_rwork; delete[] h_iwork; delete[] h_a; sprintf(buffer, "%.5le", h_s[0]); message = "DEBUG: s[0] = "; message += buffer; message += "; s["; message += to_string(m - 1); message += "] = "; sprintf(buffer, "%.5le", h_s[m - 1]); message += buffer; message += ".\n"; rs.logger->log(message, LOG_DEBG); // Step 2: Upload decomposed matix to GPU magmaDoubleComplex *d_U, *d_Vt, *d_a; err = magma_zmalloc(&d_Vt, mm); if (err != MAGMA_SUCCESS) { message = "ERROR: could not allocate d_Vt!\n"; rs.logger->err(message); exit(1); } magma_zsetmatrix(m, m, h_Vt, m, d_Vt, m, queue); err = magma_zmalloc(&d_U, mm); if (err != MAGMA_SUCCESS) { message = "ERROR: could not allocate d_U!\n"; rs.logger->err(message); exit(1); } magma_zsetmatrix(m, m, h_U, m, d_U, m, queue); err = magma_zmalloc(&d_a, mm); if (err != MAGMA_SUCCESS) { message = "ERROR: could not allocate d_a!\n"; rs.logger->err(message); exit(1); } // Step 3: compute threshold-truncated pseudo-inverse on GPU double eps_mach = 2.22e-18; double threshold = h_s[0] * m * eps_mach; for (magma_int_t i = 0; i < m; i++) { magmaDoubleComplex inv_s = (h_s[i] > threshold) ? MAGMA_Z_MAKE(1.0 / h_s[i], 0.0) : MAGMA_Z_MAKE(0.0, 0.0); magma_zscal(m, inv_s, d_Vt + i, m, queue); } delete[] h_s; // d_a = d_vt^H * d_u^H magmablas_zgemm( MagmaConjTrans, MagmaConjTrans, m, m, m, magma_one, d_Vt, m, d_U, m, magma_zero, d_a, m, queue ); // Step 4: release matrix decomposition magma_free(d_U); magma_free(d_Vt); delete[] h_U; delete[] h_Vt; // Step 5: refine inversion if (rs.use_refinement) { err = magma_newton(rs, a, m, d_a, queue); } // Step 6: get result back to host magma_zgetmatrix(m, m, d_a , m, a, m, queue); // copy d_a -> a magma_free(d_a); // >>> END OF SVD INVERSION <<< } magma_queue_destroy(queue); // destroy queue jer = (int)err; Loading @@ -138,6 +269,7 @@ magma_int_t magma_newton( ) { magma_int_t err; string message; char buffer[128]; const int max_ref_iters = rs.max_ref_iters; const magmaDoubleComplex magma_zero = MAGMA_Z_MAKE(0.0, 0.0); const magmaDoubleComplex magma_one = MAGMA_Z_MAKE(1.0, 0.0); Loading Loading @@ -195,7 +327,9 @@ magma_int_t magma_newton( // Compute the norm of R norm = magma_dznrm2(mm, d_axa, 1, queue); double relative_res = norm / normA; message = "INFO: refinement iteration " + to_string(ri) + " achieved " + to_string(relative_res) + ".\n"; sprintf(buffer, "%.5le", relative_res); message = "INFO: refinement iteration " + to_string(ri) + " achieved relative residual of " + buffer + ".\n"; rs.logger->log(message); if (norm < 0.99 * old_norm) { // Transform d_ax in 2*I - A*X Loading @@ -209,7 +343,7 @@ magma_int_t magma_newton( // Set d_a = X*(2*I - A*X) magma_zcopy(mm, d_axa, 1, d_a, 1, queue); old_norm = norm; if (relative_res < 1.0e-14) { if (relative_res < 1.0e-16) { message = "DEBUG: good news - optimal convergence achieved. Stopping.\n"; rs.logger->log(message, LOG_DEBG); break; // ri for Loading Loading
src/libnptm/magma_calls.cpp +143 −9 Original line number Diff line number Diff line Loading @@ -21,7 +21,8 @@ #ifdef USE_MAGMA #include <limits> //#include <limits> #include <cstdio> #include <string> #ifndef INCLUDE_TYPES_H_ Loading Loading @@ -60,8 +61,9 @@ using namespace std; void magma_zinvert(dcomplex **mat, np_int n, int &jer, int device_id, const RuntimeSettings& rs) { dcomplex *vec_m = mat[0]; // Vector form of the input matrix. magmaDoubleComplex *a = (magmaDoubleComplex *)mat[0]; // pointer to first element on host string message; char buffer[128]; magma_int_t err = MAGMA_SUCCESS; magma_queue_t queue = NULL; magma_device_t dev = (magma_device_t)device_id; Loading @@ -69,12 +71,12 @@ void magma_zinvert(dcomplex **mat, np_int n, int &jer, int device_id, const Runt magma_int_t info; magma_int_t m = (magma_int_t)n; // changed rows; a - mxm matrix magma_int_t mm = m * m; // size of a magmaDoubleComplex *a = (magmaDoubleComplex *)vec_m; // pointer to first element on host const magmaDoubleComplex magma_zero = MAGMA_Z_MAKE(0.0, 0.0); const magmaDoubleComplex magma_one = MAGMA_Z_MAKE(1.0, 0.0); const magmaDoubleComplex magma_mone = MAGMA_Z_MAKE(-1.0, 0.0); const magmaDoubleComplex magma_two = MAGMA_Z_MAKE(2.0, 0.0); if (rs.invert_mode == RuntimeSettings::INV_MODE_LU) { // >>> LU INVERSION <<< magmaDoubleComplex *d_a; // pointer to first element on device magmaDoubleComplex *dwork; // work space pointer on device magma_int_t ldwork = m * magma_get_zgetri_nb(m); // optimal block size Loading @@ -95,18 +97,48 @@ void magma_zinvert(dcomplex **mat, np_int n, int &jer, int device_id, const Runt magma_zgetrf_gpu(m, m, d_a, m, piv, &info); magma_zgetri_gpu(m, d_a, m, piv, dwork, ldwork, &info); delete[] piv; // delete piv created by magma_zgetrf() magma_free(dwork); if (rs.use_refinement) { err = magma_newton(rs, a, m, d_a, queue); } magma_zgetmatrix(m, m, d_a , m, a, m, queue); // copy d_a -> a magma_free(d_a); magma_free(dwork); // >>> END OF LU INVERSION <<< } else if (rs.invert_mode == RuntimeSettings::INV_MODE_GESV) { // >>> GESV INVERSION <<< magmaDoubleComplex *h_id = new magmaDoubleComplex[mm](); magmaDoubleComplex *d_a, *d_id; magma_int_t *piv = new magma_int_t[m]; for (magma_int_t i = 0; i < m; i++) { h_id[i * (m + 1)] = magma_one; } err = magma_zmalloc(&d_id, mm); if (err != MAGMA_SUCCESS) { message = "ERROR: could not allocate d_id!\n"; rs.logger->err(message); exit(1); } magma_zsetmatrix(m, m, h_id, m, d_id , m, queue); // copy identity matrix to device delete[] h_id; // allocate matrix on device err = magma_zmalloc(&d_a, mm); // device memory for a if (err != MAGMA_SUCCESS) { message = "ERROR: could not allocate d_a!\n"; rs.logger->err(message); exit(1); } magma_zsetmatrix(m, m, a, m, d_a , m, queue); // copy a -> d_a magma_zgesv_gpu(m, m, d_a, m, piv, d_id, m, &info); delete[] piv; // free host memory magma_free(d_a); if (rs.use_refinement) { // refinement block printf("DEBUG: would refine GESV inversion.\n"); err = magma_newton(rs, a, m, d_id, queue); } magma_zgetmatrix(m, m, d_id , m, a, m, queue); // copy d_id -> a magma_free(d_id); // >>> END OF GESV INVERSION <<< } else if (rs.invert_mode == RuntimeSettings::INV_MODE_RBT) { // >>> RBT INVERSION <<< magmaDoubleComplex *d_a; // pointer to first element on device magma_bool_t iterate = (magma_bool_t)rs.use_refinement; magma_int_t *piv = new magma_int_t[m]; // host mem. Loading @@ -126,7 +158,106 @@ void magma_zinvert(dcomplex **mat, np_int n, int &jer, int device_id, const Runt delete[] h_id; delete[] piv; magma_free(d_a); // >>> END OF RBT INVERSION <<< } else if (rs.invert_mode == RuntimeSettings::INV_MODE_SVD) { // >>> SVD INVERSION <<< // Step 1: compute singular value decomposition magma_int_t lwork; magmaDoubleComplex *h_work = new magmaDoubleComplex[m]; magma_zgesdd( MagmaAllVec, m, m, NULL, m, NULL, NULL, m, NULL, m, h_work, -1, NULL, NULL, &info ); if (info == 0) { lwork = (magma_int_t)(MAGMA_Z_REAL(h_work[0])); message = "DEBUG: lwork = " + to_string(lwork) + "\n"; rs.logger->log(message, LOG_DEBG); } else { message = "ERROR: magma_zgesdd failed on resource querying call!\n"; rs.logger->err(message); exit(1); } delete[] h_work; h_work = new magmaDoubleComplex[lwork]; magmaDoubleComplex *h_a, *h_U, *h_Vt; h_a = new magmaDoubleComplex[mm]; memcpy(h_a, a, mm * sizeof(magmaDoubleComplex)); h_U = new magmaDoubleComplex[mm]; h_Vt = new magmaDoubleComplex[mm]; double *h_s = new double[m]; double *h_rwork = new double[5 * mm + 7 * m]; magma_int_t *h_iwork = new magma_int_t[8 * m]; magma_zgesdd( MagmaAllVec, m, m, h_a, m, h_s, h_U, m, h_Vt, m, h_work, lwork, h_rwork, h_iwork, &info ); delete[] h_work; delete[] h_rwork; delete[] h_iwork; delete[] h_a; sprintf(buffer, "%.5le", h_s[0]); message = "DEBUG: s[0] = "; message += buffer; message += "; s["; message += to_string(m - 1); message += "] = "; sprintf(buffer, "%.5le", h_s[m - 1]); message += buffer; message += ".\n"; rs.logger->log(message, LOG_DEBG); // Step 2: Upload decomposed matix to GPU magmaDoubleComplex *d_U, *d_Vt, *d_a; err = magma_zmalloc(&d_Vt, mm); if (err != MAGMA_SUCCESS) { message = "ERROR: could not allocate d_Vt!\n"; rs.logger->err(message); exit(1); } magma_zsetmatrix(m, m, h_Vt, m, d_Vt, m, queue); err = magma_zmalloc(&d_U, mm); if (err != MAGMA_SUCCESS) { message = "ERROR: could not allocate d_U!\n"; rs.logger->err(message); exit(1); } magma_zsetmatrix(m, m, h_U, m, d_U, m, queue); err = magma_zmalloc(&d_a, mm); if (err != MAGMA_SUCCESS) { message = "ERROR: could not allocate d_a!\n"; rs.logger->err(message); exit(1); } // Step 3: compute threshold-truncated pseudo-inverse on GPU double eps_mach = 2.22e-18; double threshold = h_s[0] * m * eps_mach; for (magma_int_t i = 0; i < m; i++) { magmaDoubleComplex inv_s = (h_s[i] > threshold) ? MAGMA_Z_MAKE(1.0 / h_s[i], 0.0) : MAGMA_Z_MAKE(0.0, 0.0); magma_zscal(m, inv_s, d_Vt + i, m, queue); } delete[] h_s; // d_a = d_vt^H * d_u^H magmablas_zgemm( MagmaConjTrans, MagmaConjTrans, m, m, m, magma_one, d_Vt, m, d_U, m, magma_zero, d_a, m, queue ); // Step 4: release matrix decomposition magma_free(d_U); magma_free(d_Vt); delete[] h_U; delete[] h_Vt; // Step 5: refine inversion if (rs.use_refinement) { err = magma_newton(rs, a, m, d_a, queue); } // Step 6: get result back to host magma_zgetmatrix(m, m, d_a , m, a, m, queue); // copy d_a -> a magma_free(d_a); // >>> END OF SVD INVERSION <<< } magma_queue_destroy(queue); // destroy queue jer = (int)err; Loading @@ -138,6 +269,7 @@ magma_int_t magma_newton( ) { magma_int_t err; string message; char buffer[128]; const int max_ref_iters = rs.max_ref_iters; const magmaDoubleComplex magma_zero = MAGMA_Z_MAKE(0.0, 0.0); const magmaDoubleComplex magma_one = MAGMA_Z_MAKE(1.0, 0.0); Loading Loading @@ -195,7 +327,9 @@ magma_int_t magma_newton( // Compute the norm of R norm = magma_dznrm2(mm, d_axa, 1, queue); double relative_res = norm / normA; message = "INFO: refinement iteration " + to_string(ri) + " achieved " + to_string(relative_res) + ".\n"; sprintf(buffer, "%.5le", relative_res); message = "INFO: refinement iteration " + to_string(ri) + " achieved relative residual of " + buffer + ".\n"; rs.logger->log(message); if (norm < 0.99 * old_norm) { // Transform d_ax in 2*I - A*X Loading @@ -209,7 +343,7 @@ magma_int_t magma_newton( // Set d_a = X*(2*I - A*X) magma_zcopy(mm, d_axa, 1, d_a, 1, queue); old_norm = norm; if (relative_res < 1.0e-14) { if (relative_res < 1.0e-16) { message = "DEBUG: good news - optimal convergence achieved. Stopping.\n"; rs.logger->log(message, LOG_DEBG); break; // ri for Loading