use less memory for identity subtraction in iterative refinement (35cd635f) · Commits · Giacomo Mulas / NP_TMcode

src/libnptm/cublas_calls.cpp

+6 −6

Original line number	Diff line number	Diff line
		@@ -143,12 +143,12 @@ void cublas_zinvert_and_refine(dcomplex **mat, np_int n, int &maxiters, double &
		cudacall(cudaMalloc<cuDoubleComplex>(&d_a_residual, mmsizeof(cuDoubleComplex)));
		cudacall(cudaMalloc<cuDoubleComplex>(&d_a_refine, mmsizeof(cuDoubleComplex)));
		// allocate memory for the temporary matrix products
		dcomplex *native_id = new dcomplex[m];
		for (np_int i=0; i<m; i++) native_id[i] = 1;
		dcomplex *native_id = new dcomplex[1];
		native_id[0] = 1;
		cuDoubleComplex m_id = (cuDoubleComplex ) &(native_id[0]);
		// fill it with 1
		cudacall(cudaMalloc<cuDoubleComplex>(&d_id, m*sizeof(cuDoubleComplex)));
		cudacall(cudaMemcpy(d_id, m_id, m*sizeof(cuDoubleComplex),cudaMemcpyHostToDevice)); // copy identity to device vector
		cudacall(cudaMalloc<cuDoubleComplex>(&d_id, sizeof(cuDoubleComplex)));
		cudacall(cudaMemcpy(d_id, m_id, sizeof(cuDoubleComplex),cudaMemcpyHostToDevice)); // copy identity to device vector
		delete[] native_id; // free identity vector on host
		}
		bool iteraterefine = true;
		@@ -167,7 +167,7 @@ void cublas_zinvert_and_refine(dcomplex **mat, np_int n, int &maxiters, double &
		// Fortran-style arrays, whereas our arrays are C-style row-major.
		cublascall(CUZGEMM(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, m, m, &cu_mone, d_c, m, d_a, m, &cu_zero, d_a_residual, m));
		// add the identity to the product
		cublascall(CUZAXPY(handle, m, &cu_one, d_id, 1, d_a_residual, m+1));
		cublascall(CUZAXPY(handle, m, &cu_one, d_id, 0, d_a_residual, m+1));
		double oldmax=0;
		if (refinemode >0) {
		np_int maxindex;
		@@ -191,7 +191,7 @@ void cublas_zinvert_and_refine(dcomplex **mat, np_int n, int &maxiters, double &
		// multiply minus the original matrix times the new inverse matrix
		cublascall(CUZGEMM(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, m, m, &cu_mone, d_c, m, d_a, m, &cu_zero, d_a_residual, m));
		// add the identity to the product
		cublascall(CUZAXPY(handle, m, &cu_one, d_id, 1, d_a_residual, m+1));
		cublascall(CUZAXPY(handle, m, &cu_one, d_id, 0, d_a_residual, m+1));
		if ((refinemode==2) \|\| ((refinemode==1) && (iter == (maxiters-1)))) {
		// find the maximum absolute value of the residual
		np_int maxindex;

src/libnptm/lapack_calls.cpp

+8 −9

Original line number	Diff line number	Diff line
		@@ -92,6 +92,7 @@ void zinvert_and_refine(dcomplex **mat, np_int n, int &jer, int &maxiters, doubl
		#endif
		np_int nn = n*n;
		np_int incx = 1;
		np_int incx0 = 0;
		#ifdef USE_MKL
		MKL_Complex16 *arr_orig = NULL;
		MKL_Complex16 *arr_residual = NULL;
		@@ -108,17 +109,15 @@ void zinvert_and_refine(dcomplex **mat, np_int n, int &jer, int &maxiters, doubl
		arr_orig = new MKL_Complex16[nn];
		arr_residual = new MKL_Complex16[nn];
		arr_refine = new MKL_Complex16[nn];
		id = new MKL_Complex16[n];
		for (np_int i=0; i<n ; i++) {
		id[i].real = 1;
		id[i].imag = 0;
		}
		id = new MKL_Complex16[1];
		id[0].real = 1;
		id[0].imag = 0;
		#else
		arr_orig = new dcomplex[nn];
		arr_residual = new dcomplex[nn];
		arr_refine = new dcomplex[nn];
		id = new dcomplex[n];
		for (np_int i=0; i<n ; i++) id[i] = (dcomplex) 1;
		id = new dcomplex[1];
		id[0] = (dcomplex) 1;
		#endif
		zcopy_(&nn, arr, &incx, arr_orig, &incx);
		}
		@@ -153,7 +152,7 @@ void zinvert_and_refine(dcomplex **mat, np_int n, int &jer, int &maxiters, doubl
		// Fortran-style arrays, whereas our arrays are C-style row-major.
		zgemm_(&transa, &transa, &n, &n, &n, &dcmone, arr, &n, arr_orig, &n, &dczero, arr_residual, &n);
		np_int incy = n+1;
		zaxpy_(&n, &dcone, id, &incx, arr_residual, &incy);
		zaxpy_(&n, &dcone, id, &incx0, arr_residual, &incy);
		double oldmax = 0;
		if (refinemode >0) {
		np_int maxindex = izamax_(&nn, arr_residual, &incx);
		@@ -171,7 +170,7 @@ void zinvert_and_refine(dcomplex **mat, np_int n, int &jer, int &maxiters, doubl
		zaxpy_(&nn, &dcone, arr_refine, &incx, arr, &incx);
		// zcopy_(&nn, arr_refine, &incx, arr, &incx);
		zgemm_(&transa, &transa, &n, &n, &n, &dcmone, arr, &n, arr_orig, &n, &dczero, arr_residual, &n);
		zaxpy_(&n, &dcone, id, &incx, arr_residual, &incy);
		zaxpy_(&n, &dcone, id, &incx0, arr_residual, &incy);
		if ((refinemode==2) \|\| ((refinemode==1) && (iter == (maxiters-1)))) {
		np_int maxindex = izamax_(&nn, arr_residual, &incx);
		#ifdef USE_MKL

src/libnptm/magma_calls.cpp

+6 −6

Original line number	Diff line number	Diff line
		@@ -132,16 +132,16 @@ void magma_zinvert_and_refine(dcomplex **mat, np_int n, int &jer, int &maxiters,
		}
		// allocate memory for the identity vector on the host
		{
		dcomplex *native_id = new dcomplex[m];
		for (magma_int_t i=0; i<m; i++) native_id[i] = 1;
		dcomplex *native_id = new dcomplex[1];
		native_id[0] = 1;
		magmaDoubleComplex id = (magmaDoubleComplex ) &(native_id[0]);
		// fill it with 1
		err = magma_zmalloc(&d_id, m);
		err = magma_zmalloc(&d_id, 1);
		if (err != MAGMA_SUCCESS) {
		printf("Error allocating d_id\n");
		exit(1);
		}
		magma_zsetvector(m, id, 1, d_id, 1, queue); // copy identity to device vector
		magma_zsetvector(1, id, 1, d_id, 1, queue); // copy identity to device vector
		delete[] native_id; // free identity vector on host
		}
		}
		@@ -161,7 +161,7 @@ void magma_zinvert_and_refine(dcomplex **mat, np_int n, int &jer, int &maxiters,
		// Fortran-style arrays, whereas our arrays are C-style row-major.
		magma_zgemm(MagmaNoTrans, MagmaNoTrans, m, m, m, magma_mone, d_a, m, d_a_orig, m, magma_zero, d_a_residual, m, queue);
		// add the identity to the product
		magma_zaxpy (m, magma_one, d_id, 1, d_a_residual, m+1, queue);
		magma_zaxpy (m, magma_one, d_id, 0, d_a_residual, m+1, queue);
		double oldmax=0;
		if (refinemode >0) {
		// find the maximum absolute value of the residual
		@@ -184,7 +184,7 @@ void magma_zinvert_and_refine(dcomplex **mat, np_int n, int &jer, int &maxiters,
		// multiply minus the original matrix times the new inverse matrix
		magma_zgemm(MagmaNoTrans, MagmaNoTrans, m, m, m, magma_mone, d_a, m, d_a_orig, m, magma_zero, d_a_residual, m, queue);
		// add the identity to the product
		magma_zaxpy (m, magma_one, d_id, 1, d_a_residual, m+1, queue);
		magma_zaxpy (m, magma_one, d_id, 0, d_a_residual, m+1, queue);
		if ((refinemode==2) \|\| ((refinemode==1) && (iter == (maxiters-1)))) {
		// find the maximum absolute value of the residual
		magma_int_t maxindex = magma_izamax(mm, d_a_residual, 1, queue);