Commit 93521048 authored by Mulas, Giacomo's avatar Mulas, Giacomo
Browse files

fix races in ztm

parent 704e038d
Loading
Loading
Loading
Loading
+19 −10
Original line number Original line Diff line number Diff line
@@ -2534,7 +2534,7 @@ void tqr(
}
}


void ztm(dcomplex **am, ParticleDescriptor *c1) {
void ztm(dcomplex **am, ParticleDescriptor *c1) {
  dcomplex gie, gle, a1, a2, a3, a4, sum1, sum2, sum3, sum4;
  // dcomplex gie, gle, a1, a2, a3, a4, sum1, sum2, sum3, sum4;
  const dcomplex cc0 = 0.0 + 0.0 * I;
  const dcomplex cc0 = 0.0 + 0.0 * I;
  // int i2 = 0; // old implementation
  // int i2 = 0; // old implementation
#ifdef USE_NVTX
#ifdef USE_NVTX
@@ -2546,7 +2546,6 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
  // C9 *c9_para = new C9(*c9);
  // C9 *c9_para = new C9(*c9);
  dcomplex *gis_v = c1->gis[0];
  dcomplex *gis_v = c1->gis[0];
  dcomplex *gls_v = c1->gls[0];
  dcomplex *gls_v = c1->gls[0];
  double *rac3j_local = (double *) malloc(c1->lmtpo*sizeof(double));
  int k2max = c1->li*(c1->li+2);
  int k2max = c1->li*(c1->li+2);
  int k3max = c1->le*(c1->le+2);
  int k3max = c1->le*(c1->le+2);
  // To parallelise, I run a linearised loop directly over k
  // To parallelise, I run a linearised loop directly over k
@@ -2559,6 +2558,8 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
  // im = im -(2*l+1) and l = l+1 (there was a rounding error in a nearly exact root)
  // im = im -(2*l+1) and l = l+1 (there was a rounding error in a nearly exact root)
#ifdef USE_TARGET_OFFLOAD
#ifdef USE_TARGET_OFFLOAD
#pragma omp target teams distribute parallel for simd collapse(3)
#pragma omp target teams distribute parallel for simd collapse(3)
#else
#pragma omp parallel for simd collapse(3)
#endif
#endif
  for (int n2 = 1; n2 <= c1->nsph; n2++) { // GPU portable?
  for (int n2 = 1; n2 <= c1->nsph; n2++) { // GPU portable?
    for (int k2 = 1; k2<=k2max; k2++) {
    for (int k2 = 1; k2<=k2max; k2++) {
@@ -2590,12 +2591,13 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
	int i3 = l3 * l3 + im3 - 1;
	int i3 = l3 * l3 + im3 - 1;
	int m3 = -l3 - 1 + im3;
	int m3 = -l3 - 1 + im3;
	int vecindex = (i2 - 1) * c1->nlem + i3 - 1;
	int vecindex = (i2 - 1) * c1->nlem + i3 - 1;
	double *rac3j_local = (double *) malloc(c1->lmtpo*sizeof(double));
	gis_v[vecindex] = ghit_d(2, 0, n2, l2, m2, l3, m3, c1, rac3j_local);
	gis_v[vecindex] = ghit_d(2, 0, n2, l2, m2, l3, m3, c1, rac3j_local);
	gls_v[vecindex] = ghit_d(2, 1, n2, l2, m2, l3, m3, c1, rac3j_local);
	gls_v[vecindex] = ghit_d(2, 1, n2, l2, m2, l3, m3, c1, rac3j_local);
	free(rac3j_local);
      } // close k3 loop, former l3 + im3 loops
      } // close k3 loop, former l3 + im3 loops
    } // close k2 loop, former l2 + im2 loops
    } // close k2 loop, former l2 + im2 loops
  } // close n2 loop
  } // close n2 loop
  free(rac3j_local);
#ifdef USE_NVTX
#ifdef USE_NVTX
  nvtxRangePop();
  nvtxRangePop();
#endif
#endif
@@ -2606,6 +2608,8 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
  dcomplex *sam_v = c1->sam[0];
  dcomplex *sam_v = c1->sam[0];
#ifdef USE_TARGET_OFFLOAD
#ifdef USE_TARGET_OFFLOAD
#pragma omp target teams distribute parallel for simd collapse(2)
#pragma omp target teams distribute parallel for simd collapse(2)
#else
#pragma omp parallel for simd collapse(2)
#endif
#endif
  for (int i1 = 1; i1 <= c1->ndi; i1++) { // GPU portable?
  for (int i1 = 1; i1 <= c1->ndi; i1++) { // GPU portable?
    for (int i3 = 1; i3 <= c1->nlem; i3++) {
    for (int i3 = 1; i3 <= c1->nlem; i3++) {
@@ -2615,7 +2619,6 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
      dcomplex sum4 = cc0;
      dcomplex sum4 = cc0;
      int i1e = i1 + c1->ndi;
      int i1e = i1 + c1->ndi;
      int i3e = i3 + c1->nlem;
      int i3e = i3 + c1->nlem;
#pragma parallel for simd reduction(+:sum1,sum2,sum3,sum4)
      for (int i2 = 1; i2 <= c1->ndi; i2++) {
      for (int i2 = 1; i2 <= c1->ndi; i2++) {
	int i2e = i2 + c1->ndi;
	int i2e = i2 + c1->ndi;
	int vecindg_23 = (i2 - 1) * c1->nlem + i3 - 1;
	int vecindg_23 = (i2 - 1) * c1->nlem + i3 - 1;
@@ -2640,7 +2643,11 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
      sam_v[vecind1e + i3e - 1] = sum4;
      sam_v[vecind1e + i3e - 1] = sum4;
    } // i3 loop
    } // i3 loop
  } // i1 loop
  } // i1 loop
#pragma omp parallel for collapse(2)
#ifdef USE_TARGET_OFFLOAD
#pragma omp target teams distribute parallel for simd collapse(2)
#else
#pragma omp parallel for simd collapse(2)
#endif 
  for (int i1 = 1; i1 <= c1->ndi; i1++) {
  for (int i1 = 1; i1 <= c1->ndi; i1++) {
    for (int i0 = 1; i0 <= c1->nlem; i0++) {
    for (int i0 = 1; i0 <= c1->nlem; i0++) {
      int vecindex = (i1 - 1) * c1->nlem + i0 - 1;
      int vecindex = (i1 - 1) * c1->nlem + i0 - 1;
@@ -2650,7 +2657,9 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
  } // i1 loop
  } // i1 loop
  dcomplex *vec_am0m = c1->am0m[0];
  dcomplex *vec_am0m = c1->am0m[0];
#ifdef USE_TARGET_OFFLOAD
#ifdef USE_TARGET_OFFLOAD
#pragma omp target parallel for collapse(2)
#pragma omp target teams distribute parallel for simd collapse(2)
#else
#pragma omp parallel for simd collapse(2)
#endif
#endif
  for (int i0 = 1; i0 <= c1->nlem; i0++) {
  for (int i0 = 1; i0 <= c1->nlem; i0++) {
    for (int i3 = 1; i3 <= c1->nlemt; i3++) {
    for (int i3 = 1; i3 <= c1->nlemt; i3++) {
@@ -2661,11 +2670,11 @@ void ztm(dcomplex **am, ParticleDescriptor *c1) {
	int i1e = i1 + c1->ndi;
	int i1e = i1 + c1->ndi;
	int vecind1 = (i1 - 1) * c1->nlemt;
	int vecind1 = (i1 - 1) * c1->nlemt;
	int vecind1e = (i1e - 1) * c1->nlemt;
	int vecind1e = (i1e - 1) * c1->nlemt;
	a1 = sam_v[vecind1 + i3 - 1];
	dcomplex a1 = sam_v[vecind1 + i3 - 1];
	a2 = sam_v[vecind1e + i3 - 1];
	dcomplex a2 = sam_v[vecind1e + i3 - 1];
	int vecindex = (i1 - 1) * c1->nlem + i0 - 1;
	int vecindex = (i1 - 1) * c1->nlem + i0 - 1;
	gie = gis_v[vecindex];
	dcomplex gie = gis_v[vecindex];
	gle = gls_v[vecindex];
	dcomplex gle = gls_v[vecindex];
	sum1 += (a1 * gie + a2 * gle);
	sum1 += (a1 * gie + a2 * gle);
	sum2 += (a1 * gle + a2 * gie);
	sum2 += (a1 * gle + a2 * gie);
      } // i1 loop
      } // i1 loop