Modify CLUSTER to perform multi-centered matrix initialization and inversion... (29407bf2) · Commits · Giacomo Mulas / NP_TMcode

src/cluster/cluster.cpp

+64 −7

Original line number	Diff line number	Diff line
		@@ -886,16 +886,72 @@ int cluster_jxi488_cycle(
		delete outam0;
		#endif // DEBUG_AM
		#ifdef USE_TARGET_OFFLOAD
		if (rs.use_offload) {
		#ifdef USE_MAGMA
		magmaDoubleComplex* vec_am = (magmaDoubleComplex *)(cid->am[0]);
		int device_id = cid->proc_device;
		#pragma omp target data map(tofrom: vec_am[0:ndit*ndit]) device(device_id)
		magma_queue_t queue = NULL;
		magma_device_t device_id = (magma_device_t)(cid->proc_device);
		magma_queue_create(device_id, &queue);
		#pragma omp target data map(to: vec_am[0:ndit*ndit]) device(cid->proc_device)
		{
		magma_cms(vec_am, cid->c1, cid->proc_device);
		magma_cms(vec_am, cid->c1, cid->proc_device); // Initialize uninverted matrix on GPU

		/*
		// DEBUG STEP
		#pragma omp target update from(vec_am[0:ndit*ndit]) device(cid->proc_device)
		message = "c_magma_cms_jxi_" + to_string(jxi488) + ".ppm";
		write_matrix_as_ppm(cid->am[0], ndit, ndit, message, "MAG", 1, 1);
		{
		VirtualAsciiFile *outam1 = new VirtualAsciiFile();
		string outam1_name = output_path + "/c_MAGMA_AM1_JXI" + to_string(jxi488) + ".txt";
		sprintf(virtual_line, " AM matrix after CMS before LUCIN\n");
		outam1->append_line(virtual_line);
		sprintf(virtual_line, " %d\n", ndit);
		outam1->append_line(virtual_line);
		sprintf(virtual_line, " I1+1 I2+1 Real Imag\n");
		outam1->append_line(virtual_line);
		write_dcomplex_matrix(outam1, cid->am, ndit, ndit, " %5d %5d (%17.8lE,%17.8lE)\n", 1);
		outam1->write_to_disk(outam1_name);
		delete outam1;
		}
		*/

		#pragma omp target data use_device_ptr(vec_am) device(cid->proc_device)
		{
		magma_zinvert_resident(vec_am, (magma_int_t)ndit, jer, queue, cid->proc_device, rs);
		magma_queue_sync(queue);
		}
		if (jer == 0) {
		// Get the final inverted matrix.
		#pragma omp target update from(vec_am[0:ndit*ndit]) device(cid->proc_device)
		} else {
		sprintf(virtual_line, "ERROR: matrix inversion returned code %d!\n", jer);
		message = virtual_line;
		logger->err(message);
		}
		}
		magma_queue_destroy(queue);
		#else // NO_USE_MAGMA
		cms_flat(cid->am[0], cid->c1);
		#endif // USE_MAGMA
		#endif // USE_TARGET_OFFLOAD
		cms_gpu(cid->am[0], cid->c1);
		}
		#else // NO_USE_TARGET_OFFLOAD
		cms(cid->am[0], cid->c1);
		message = "c_cpu_cms_jxi_" + to_string(jxi488) + ".ppm";
		write_matrix_as_ppm(cid->am[0], ndit, ndit, message, "MAG", 1, 1);
		{
		VirtualAsciiFile *outam1 = new VirtualAsciiFile();
		string outam1_name = output_path + "/c_AM1_JXI" + to_string(jxi488) + ".txt";
		sprintf(virtual_line, " AM matrix after CMS before LUCIN\n");
		outam1->append_line(virtual_line);
		sprintf(virtual_line, " %d\n", ndit);
		outam1->append_line(virtual_line);
		sprintf(virtual_line, " I1+1 I2+1 Real Imag\n");
		outam1->append_line(virtual_line);
		write_dcomplex_matrix(outam1, cid->am, ndit, ndit, " %5d %5d (%17.8lE,%17.8lE)\n", 1);
		outam1->write_to_disk(outam1_name);
		delete outam1;
		}
		#ifdef DEBUG_AM
		VirtualAsciiFile *outam1 = new VirtualAsciiFile();
		string outam1_name = output_path + "/c_AM1_JXI" + to_string(jxi488) + ".txt";
		@@ -947,6 +1003,7 @@ int cluster_jxi488_cycle(
		return jer;
		// break; // jxi488 loop: goes to memory clean
		}
		#endif // USE_TARGET_OFFLOAD
		interval_start = chrono::high_resolution_clock::now();
		#ifdef USE_NVTX
		nvtxRangePush("Average calculation");

src/include/clu_subs.h

+4 −4

Original line number	Diff line number	Diff line
		@@ -104,10 +104,10 @@ double cgev(int ipamo, int mu, int l, int m);
		* This function constructs the multi-centered M-matrix of the cluster, according
		* to Eq. (5.28) of Borghese, Denti & Saija (2007).
		*
		* \param am: `complex double **`
		* \param c1: `ParticleDescriptor *`
		* \param[in,out] am: `complex double *`
		* \param[in] c1: `ParticleDescriptor *`
		*/
		void cms(dcomplex *am, ParticleDescriptor c1);
		void cms(dcomplex am, ParticleDescriptor c1);

		#ifdef USE_TARGET_OFFLOAD
		/**
		@@ -119,7 +119,7 @@ void cms(dcomplex *am, ParticleDescriptor c1);
		* \param am: `complex double **`
		* \param c1: `ParticleDescriptor *`
		*/
		void cms_gpu(dcomplex am, ParticleDescriptor c1);
		void cms_flat(dcomplex am, ParticleDescriptor c1);
		#endif // USE_TARGET_OFFLOAD

		/**

src/include/magma_calls.h

+19 −0

Original line number	Diff line number	Diff line
		@@ -105,11 +105,30 @@ template<int IHI> magmaDoubleComplex magma_ghit(
		void magma_r3jjr(int j2, int j3, int m2, int m3, double rac3j[]);
		#pragma omp end declare target

		/**
		* \brief Invert a pre-existing GPU complex matrix with double precision elements.
		*
		* \param[in,out] mat: Matrix of complex. The matrix to be inverted.
		* \param[in] n: `magma_int_t` The number of rows and columns of the [n x n] matrix.
		* \param[out] jer: `int &` Reference to an integer return flag.
		* \param[in] queue: `magma_queue_t` The MAGMA device communication queue.
		* \param[in] device_id: `int` ID of the device for matrix inversion offloading (optional, default 0).
		* \param[in] rs: `const RuntimeSettings &` Runtime settings instance (optional).
		*/
		void magma_zinvert_resident(
		magmaDoubleComplex *mat, magma_int_t n, int &jer, magma_queue_t queue, int device_id=0,
		const RuntimeSettings& rs=RuntimeSettings()
		);
		#endif // USE_TARGET_OFFLOAD

		/**
		* \brief Invert a complex matrix with double precision elements.
		*
		* This function is a first implementation of GPU use. The data transfer
		* steps are handled internally, via standard MAGMA calls, so that the matrix
		* to be inverted is copied from the host memory to the GPU, inverted, possibly
		* refined and finally taken back to the host, leaving the GPU free.
		*
		* \param[in,out] mat: Matrix of complex. The matrix to be inverted.
		* \param[in] n: `np_int` The number of rows and columns of the [n x n] matrix.
		* \param[out] jer: `int &` Reference to an integer return flag.

src/include/utils.h

+9 −9

Original line number	Diff line number	Diff line
		@@ -56,19 +56,19 @@ int write_dcomplex_matrix(
		/**
		* \brief Draw a PPM representation of a matrix.
		*
		* \param A: `const dcomplex *` Vector representation of the matrix. [IN]
		* \param m: `int` Number of rows in the matrix. [IN]
		* \param n: `int` Number of columns in the matrix. [IN]
		* \param file_name: `const string&` Reference to a string with the name of the output file. [IN]
		* \param mode: `const string&` Reference to a string for the value to be represented ("MAG" to
		* \param[in] A: `const dcomplex *` Vector representation of the matrix.
		* \param[in] m: `int64_t` Number of rows in the matrix.
		* \param[in] n: `int64_t` Number of columns in the matrix.
		* \param[in] file_name: `const string&` Reference to a string with the name of the output file.
		* \param[in] mode: `const string&` Reference to a string for the value to be represented ("MAG" to
		* draw magnitudes, "RE" to draw real parts, "IM" to draw imaginary parts. Optional, default is
		* "MAG"). [IN]
		* \param row_bin: `int` Number of rows to bin together (optional, default is 1). [IN]
		* \param col_bin: `int` Number of columns to bin together (optional, default is 1). [IN]
		* "MAG").
		* \param[in] row_bin: `int` Number of rows to bin together (optional, default is 1).
		* \param[in] col_bin: `int` Number of columns to bin together (optional, default is 1).
		* \return result: `int` An exit code (0 if successful).
		*/
		int write_matrix_as_ppm(
		const dcomplex *A, int m, int n, const std::string& file_name, const std::string& mode="MAG",
		dcomplex *A, int64_t m, int64_t n, const std::string& file_name, const std::string& mode="MAG",
		int row_bin=1, int col_bin=1
		);
		#endif

src/libnptm/clu_subs.cpp

+23 −21

Original line number	Diff line number	Diff line
		@@ -457,13 +457,14 @@ double cgev(int ipamo, int mu, int l, int m) {
		}
		*/

		void cms(dcomplex *am, ParticleDescriptor c1) {
		void cms(dcomplex am, ParticleDescriptor c1) {
		const int nsph = c1->nsph;
		const int nlim = c1->nlim;
		const int li = c1->li;
		const int ndi = nsph * nlim;
		const int ndit = ndi + ndi;
		const int nsphmo = nsph - 1;
		double *rac3j = c1->rac3j;
		const int lmtpo = c1->lmtpo;

		// int nbl = 0;
		for (int n1 = 1; n1 <= nsphmo; n1++) {
		@@ -492,6 +493,7 @@ void cms(dcomplex *am, ParticleDescriptor c1) {
		double rsh = ((l2 + l1) % 2 == 0) ? 1.0 : -1.0;
		double rsk = -rsh;
		for (int im2 = 1; im2 <= l2tpo; im2++) {
		double rac3j[lmtpo];
		int m2 = im2 - l2po;
		int ilm2 = il2 + m2;
		int ilm2e = ilm2 + ndi;
		@@ -501,14 +503,14 @@ void cms(dcomplex *am, ParticleDescriptor c1) {
		int j2e = in1 + ilm2e;
		dcomplex cgh = ghit(0, 0, nbl, l1, m1, l2, m2, c1, rac3j);
		dcomplex cgk = ghit(0, 1, nbl, l1, m1, l2, m2, c1, rac3j);
		am[i1 - 1][i2 - 1] = cgh;
		am[i1 - 1][i2e - 1] = cgk;
		am[i1e - 1][i2 - 1] = cgk;
		am[i1e - 1][i2e - 1] = cgh;
		am[j1 - 1][j2 - 1] = cgh * rsh;
		am[j1 - 1][j2e - 1] = cgk * rsk;
		am[j1e - 1][j2 - 1] = cgk * rsk;
		am[j1e - 1][j2e - 1] = cgh * rsh;
		am[(i1 - 1) * ndit + i2 - 1] = cgh;
		am[(i1 - 1) * ndit + i2e - 1] = cgk;
		am[(i1e - 1) * ndit + i2 - 1] = cgk;
		am[(i1e - 1) * ndit + i2e - 1] = cgh;
		am[(j1 - 1) * ndit + j2 - 1] = cgh * rsh;
		am[(j1 - 1) * ndit + j2e - 1] = cgk * rsk;
		am[(j1e - 1) * ndit + j2 - 1] = cgk * rsk;
		am[(j1e - 1) * ndit + j2e - 1] = cgh * rsh;
		} // im2 loop
		} // l2 loop
		} // im1 loop
		@@ -516,7 +518,7 @@ void cms(dcomplex *am, ParticleDescriptor c1) {
		} // n2 loop
		} // n1 loop
		#pragma omp parallel for collapse(2)
		for (int n1 = 1; n1 <= nsph; n1++) { // GPU portable?
		for (int n1 = 1; n1 <= nsph; n1++) {
		for (int l1 = 1; l1 <= li; l1++) {
		int in1 = (n1 - 1) * nlim;
		dcomplex dm = c1->rmi[l1 - 1][n1 - 1];
		@@ -533,20 +535,20 @@ void cms(dcomplex *am, ParticleDescriptor c1) {
		// for (int ilm2 = 1; ilm2 <= c1->nlim; ilm2++) {
		// int i2 = in1 + ilm2;
		// int i2e = i2 + ndi;
		// am[i1 - 1][i2 - 1] = cc0;
		// am[i1 - 1][i2e - 1] = cc0;
		// am[i1e - 1][i2 - 1] = cc0;
		// am[i1e - 1][i2e - 1] = cc0;
		// am[(i1 - 1) * ndit + i2 - 1] = cc0;
		// am[(i1 - 1) * ndit + i2e - 1] = cc0;
		// am[(i1e - 1) * ndit + i2 - 1] = cc0;
		// am[(i1e - 1) * ndit + i2e - 1] = cc0;
		// }
		am[i1 - 1][i1 - 1] = dm;
		am[i1e - 1][i1e - 1] = de;
		am[(i1 - 1) * ndit + i1 - 1] = dm;
		am[(i1e - 1) * ndit + i1e - 1] = de;
		} // im1 loop
		} // l1 loop
		} // n1 loop
		}

		#ifdef USE_TARGET_OFFLOAD
		void cms_gpu(dcomplex am, ParticleDescriptor c1) {
		void cms_flat(dcomplex am, ParticleDescriptor c1) {
		const int nsph = c1->nsph;
		const int nlim = c1->nlim;
		const int li = c1->li;