Commit b1fe094b authored by Mulas, Giacomo's avatar Mulas, Giacomo
Browse files

- do not allocate local variables for thread 0, only for additional ones

- remove redundant loop when reuniting outputs from all threads
parent c727f38b
Loading
Loading
Loading
Loading
+326 −269
Original line number Diff line number Diff line
@@ -46,54 +46,7 @@

using namespace std;

// I would like to put it all in a struct, but then I'd have to write a constructor for it, due to members defined as
// references, creating a worse nightmare than the one I'd like to simplify...
//
// struct clu_jxi488_cycle_args {
//   int jxi488;
//   ScattererConfiguration *sconf;
//   GeometryConfiguration *gconf;
//   C1 *c1;
//   C1_AddOns *c1ao;
//   C2 *c2;
//   C3 *c3;
//   C4 *c4;
//   C6 *c6;
//   C9 *c9;
//   FILE *output;
//   string output_path;
//   double *gaps;
//   double **tqse;
//   dcomplex **tqspe;
//   double **tqss;
//   dcomplex **tqsps;
//   double ****zpv;
//   double **gapm;
//   dcomplex **gappm;
//   int nth, int nths;
//   int nph;
//   int nphs;
//   int nk;
//   int nks;
//   int nkks;
//   double *argi;
//   double *args;
//   double **gap;
//   dcomplex **gapp;
//   double **tqce;
//   dcomplex **tqcpe;
//   double **tqcs;
//   dcomplex **tqcps;
//   double *duk;
//   fstream &tppoan;
//   double **cextlr;
//   double **cext;
//   double **cmullr;
//   double **cmul;
//   double *gapv;
//   double *tqev;
//   double *tqsv;
// }
// I would like to put it all in a struct, but then I'd have to write a constructor for it, due to members defined as references, creating a worse nightmare than the one I'd like to simplify...

int cluster_jxi488_cycle(int jxi488, ScattererConfiguration *sconf, GeometryConfiguration *gconf, C1 *c1, C1_AddOns *c1ao, C2 *c2, C3 *c3, C4 *c4, C6 *c6, C9 *c9, FILE *output, string output_path, double *gaps, double **tqse, dcomplex **tqspe, double **tqss, dcomplex **tqsps, double ****zpv, double **gapm, dcomplex **gappm, int nth, int nths, int nph, int nphs, int nk, int nks, int nkks, double *argi, double *args, double **gap, dcomplex **gapp, double **tqce, dcomplex **tqcpe, double **tqcs, dcomplex **tqcps, double *duk, fstream &tppoan, double **cextlr, double **cext, double **cmullr, double **cmul, double *gapv, double *tqev, double *tqsv, int nxi, int nsph, np_int mxndm, int inpol, int iavm, int npnt, int npntts, int isam, int lm, double th, double thstp, double thlst, double ths, double thsstp, double thslst, double ph, double phstp, double phlst, double phs, double phsstp, double phslst, double th1, double ph1, double ths1, double phs1, double thsca, double *u, double *us, double *un, double *uns, double *up, double *ups, double *unmp, double *unsmp, double *upmp, double *upsmp, double &scan, double &cfmp, double &sfmp, double &cfsp, double &sfsp, double sqsfi, double exri, int lcalc, dcomplex arg, double wn, double vk, np_int ndit, dcomplex **am, int isq, int ibf);

@@ -178,6 +131,7 @@ void cluster(string config_file, string data_file, string output_path) {
    }
    C2 *c2 = new C2(nsph, configurations, npnt, npntts);
    np_int ndit = 2 * nsph * c4->nlim;
    printf("Size of matrices to invert: %ld x %ld\n", (int64_t) ndit, (int64_t) ndit);
    dcomplex *am_vector = new dcomplex[ndit * ndit]();
    dcomplex **am = new dcomplex*[ndit];
    for (int ai = 0; ai < ndit; ai++) {
@@ -318,7 +272,8 @@ void cluster(string config_file, string data_file, string output_path) {
    double exri = sqrt(sconf->exdc);
    double vk = 0.0; // NOTE: Not really sure it should be initialized at 0
    fprintf(output, "  REFR. INDEX OF EXTERNAL MEDIUM=%15.7lE\n", exri);
    fstream tppoan;
    fstream *tppoanp = new fstream;
    fstream &tppoan = *tppoanp;
    string tppoan_name = output_path + "/c_TPPOAN";
    tppoan.open(tppoan_name.c_str(), ios::out | ios::binary);
    if (tppoan.is_open()) {
@@ -351,39 +306,180 @@ void cluster(string config_file, string data_file, string output_path) {

#pragma omp parallel
      {
	// To test parallelism, I will now start feeding this function with "clean" copies of the parameters, so that they will not be changed by previous iterations, and each one will behave as the first one
	ScattererConfiguration *sconf_2 = new ScattererConfiguration(*sconf);
	GeometryConfiguration *gconf_2 = new GeometryConfiguration(*gconf);
	C1 *c1_2 = new C1(*c1);
	C1_AddOns *c1ao_2 = new C1_AddOns(*c1ao);
	C2 *c2_2 = new C2(*c2);
	C3 *c3_2 = new C3(*c3);
	C4 *c4_2 = new C4(*c4);
	C6 *c6_2 = new C6(*c6);
	C9 *c9_2 = new C9(*c9);
	// Create and initialise this variable here, so that if OpenMP is enabled it is local to the thread, and if OpenMP is not enabled it has a well-defiled value anyway
	int myompthread = 0;
#ifdef _OPENMP
	// If OpenMP is enabled, give actual values to myompthread and ompnumthreads, and open thread-local output files
	myompthread = omp_get_thread_num();
	if (myompthread == 0) ompnumthreads = omp_get_num_threads();
	FILE *output_2 = fopen((output_path + "/c_OCLU_" + to_string(myompthread)).c_str(), "w");
	fstream tppoan_2;
	tppoan_2.open((output_path + "/c_TPPOAN_" + to_string(myompthread)).c_str(), ios::out | ios::binary);
#else
	// If OpenMP is not enabled, just keep doing the output to the global files
	FILE *output_2 = output;
	fstream &tppoan_2 = tppoan;  
#endif
	double *gaps_2 = new double[nsph]();
	double **tqse_2 = new double*[2];
	double **tqce_2 = new double*[2];
	double **tqcs_2 = new double*[2];
	dcomplex **tqspe_2 = new dcomplex*[2];
	dcomplex **tqcpe_2 = new dcomplex*[2];
	dcomplex **tqcps_2 = new dcomplex*[2];
	double **tqss_2 = new double*[2];
	dcomplex **tqsps_2 = new dcomplex*[2];
	// To test parallelism, I will now start feeding this function with "clean" copies of the parameters, so that they will not be changed by previous iterations, and each one will behave as the first one. Define all (empty) variables here, so they have the correct scope, then they get different definitions depending on thread number
	ScattererConfiguration *sconf_2 = NULL;
	GeometryConfiguration *gconf_2 = NULL;
	C1 *c1_2 = NULL;
	C1_AddOns *c1ao_2 = NULL;
	C2 *c2_2 = NULL;
	C3 *c3_2 = NULL;
	C4 *c4_2 = NULL;
	C6 *c6_2 = NULL;
	C9 *c9_2 = NULL;
	FILE *output_2 = NULL;
	fstream *tppoanp_2 = NULL;
	double *gaps_2 = NULL;
	double **tqse_2 = NULL;
	double **tqce_2 = NULL;
	double **tqcs_2 = NULL;
	dcomplex **tqspe_2 = NULL;
	dcomplex **tqcpe_2 = NULL;
	dcomplex **tqcps_2 = NULL;
	double **tqss_2 = NULL;
	dcomplex **tqsps_2 = NULL;
	double ****zpv_2 = NULL;
	double **gapm_2 = NULL;
	dcomplex **gappm_2 = NULL;
	double **gap_2 = NULL;
	dcomplex **gapp_2 = NULL;
	double *argi_2 = NULL;
	double *args_2 = NULL;
	double *duk_2 = NULL;
	double **cextlr_2 = NULL;
	double **cext_2 = NULL;
	double **cmullr_2 = NULL;
	double **cmul_2 = NULL;
	double *gapv_2 = NULL;
	double *tqev_2 = NULL;
	double *tqsv_2 = NULL;
	int nxi_2 = nxi;
	int nsph_2 = nsph;
	np_int mxndm_2 = mxndm;
	int inpol_2 = inpol;
	int iavm_2 = iavm;
	int npnt_2 = npnt;
	int npntts_2 = npntts;
	int isam_2 = isam;
	int lm_2 = lm;
	double th_2 = th;
	double thstp_2 = thstp;
	double thlst_2 = thlst;
	double ths_2 = ths;
	double thsstp_2 = thsstp;
	double thslst_2 = thslst;
	double ph_2 = ph;
	double phstp_2 = phstp;
	double phlst_2 = phlst;
	double phs_2 = phs;
	double phsstp_2 = phsstp;
	double phslst_2 = phslst;
	double th1_2 = th1;
	double ph1_2 = ph1;
	double ths1_2 = ths1;
	double phs1_2 = phs1;
	double thsca_2 = thsca;
	double *u_2 = NULL;
	double *us_2 = NULL;
	double *un_2 = NULL;
	double *uns_2 = NULL;
	double *up_2 = NULL;
	double *ups_2 = NULL;
	double *unmp_2 = NULL;
	double *unsmp_2 = NULL;
	double *upmp_2 = NULL;
	double *upsmp_2 = NULL;
	double scan_2 = scan;
	double cfmp_2 = cfmp;
	double sfmp_2 = sfmp;
	double cfsp_2 = cfsp;
	double sfsp_2 = sfsp;
	double sqsfi_2 = sqsfi;
	double exri_2 = exri;
	int lcalc_2 = lcalc;
	dcomplex arg_2 = arg;
	double wn_2 = wn;
	double vk_2 = vk;
	np_int ndit_2 = ndit;
	dcomplex **am_2 = NULL;
	dcomplex *am_vector_2 = NULL;
	int isq_2 = isq;
	int ibf_2 = ibf;
	int nth_2 = nth;
	int nths_2 = nths;
	int nph_2 = nph;
	int nphs_2 = nph;
	int nk_2 = nk;
	int nks_2 = nks;
	int nkks_2 = nkks;
	// for threads other than the 0, create distinct copies of all relevant data, while for thread 0 just define new references / pointers to the original ones
	if (myompthread == 0) {
	  sconf_2 = sconf;
	  gconf_2 = gconf;
	  c1_2 = c1;
	  c1ao_2 = c1ao;
	  c2_2 = c2;
	  c3_2 = c3;
	  c4_2 = c4;
	  c6_2 = c6;
	  c9_2 = c9;
	  output_2 = output;
	  tppoanp_2 = tppoanp;  
	  gaps_2 = gaps;
	  tqse_2 = tqse;
	  tqce_2 = tqce;
	  tqcs_2 = tqcs;
	  tqspe_2 = tqspe;
	  tqcpe_2 = tqcpe;
	  tqcps_2 = tqcps;
	  tqss_2 = tqss;
	  tqsps_2 = tqsps;
	  zpv_2 = zpv;
	  gapm_2 = gapm;
	  gappm_2 = gappm;
	  gap_2 = gap;
	  gapp_2 = gapp;
	  argi_2 = argi;
	  args_2 = args;
	  duk_2 = duk;
	  cextlr_2 = cextlr;
	  cext_2 = cext;
	  cmullr_2 = cmullr;
	  cmul_2 = cmul;
	  gapv_2 = gapv;
	  tqev_2 = tqev;
	  tqsv_2 = tqsv;
	  u_2 = u;
	  us_2 = us;
	  un_2 = un;
	  uns_2 = uns;
	  up_2 = up;
	  ups_2 = ups;
	  unmp_2 = unmp;
	  unsmp_2 = unsmp;
	  upmp_2 = upmp;
	  upsmp_2 = upsmp;
	  am_2 = am;
	}
	else {
	  // this is not thread 0, so do create fresh copies of all local variables
	  sconf_2 = new ScattererConfiguration(*sconf);
	  gconf_2 = new GeometryConfiguration(*gconf);
	  c1_2 = new C1(*c1);
	  c1ao_2 = new C1_AddOns(*c1ao);
	  c2_2 = new C2(*c2);
	  c3_2 = new C3(*c3);
	  c4_2 = new C4(*c4);
	  c6_2 = new C6(*c6);
	  c9_2 = new C9(*c9);
	  output_2 = fopen((output_path + "/c_OCLU_" + to_string(myompthread)).c_str(), "w");
	  tppoanp_2 = new fstream;
	  tppoanp_2->open((output_path + "/c_TPPOAN_" + to_string(myompthread)).c_str(), ios::out | ios::binary);
	  gaps_2 = new double[nsph]();
	  tqse_2 = new double*[2];
	  tqce_2 = new double*[2];
	  tqcs_2 = new double*[2];
	  tqspe_2 = new dcomplex*[2];
	  tqcpe_2 = new dcomplex*[2];
	  tqcps_2 = new dcomplex*[2];
	  tqss_2 = new double*[2];
	  tqsps_2 = new dcomplex*[2];
	  for (int ti = 0; ti < 2; ti++) {
	    tqse_2[ti] = new double[nsph]();
	    tqce_2[ti] = new double[3]();
@@ -406,7 +502,7 @@ void cluster(string config_file, string data_file, string output_path) {
	      tqcps_2[ti][tj] = tqcps[ti][tj];
	    }
	  }
	double ****zpv_2 = new double***[c4->lm]; //[gconf->l_max][3][2][2]; // Matrix: dim[LM x 3 x 2 x 2]
	  zpv_2 = new double***[c4->lm]; //[gconf->l_max][3][2][2]; // Matrix: dim[LM x 3 x 2 x 2]
	  for (int zi = 0; zi < c4->lm; zi++) {
	    zpv_2[zi] = new double**[3];
	    for (int zj = 0; zj < 3; zj++) {
@@ -417,10 +513,10 @@ void cluster(string config_file, string data_file, string output_path) {
	      }
	    }
	  }
	double **gapm_2 = new double*[3];
	dcomplex **gappm_2 = new dcomplex*[3];
	double **gap_2 = new double*[3];
	dcomplex **gapp_2 = new dcomplex*[3];
	  gapm_2 = new double*[3];
	  gappm_2 = new dcomplex*[3];
	  gap_2 = new double*[3];
	  gapp_2 = new dcomplex*[3];
	  for (int gi = 0; gi < 3; gi++) {
	    gap_2[gi] = new double[2]();
	    gapm_2[gi] = new double[2]();
@@ -433,16 +529,16 @@ void cluster(string config_file, string data_file, string output_path) {
	      gappm_2[gi][gj] = gappm[gi][gj];
	    }
	  }
	double *argi_2 = new double[1]();
	  argi_2 = new double[1]();
	  argi_2[0] = argi[0];
	double *args_2 = new double[1]();
	  args_2 = new double[1]();
	  args_2[0] = args[0];
	double *duk_2 = new double[3]();
	  duk_2 = new double[3]();
	  for (int di=0; di<3; di++) duk_2[di] = duk[di];
	double **cextlr_2 = new double*[4];
	double **cext_2 = new double*[4];
	double **cmullr_2 = new double*[4];
	double **cmul_2 = new double*[4];
	  cextlr_2 = new double*[4];
	  cext_2 = new double*[4];
	  cmullr_2 = new double*[4];
	  cmul_2 = new double*[4];
	  for (int ci = 0; ci < 4; ci++) {
	    cextlr_2[ci] = new double[4]();
	    cext_2[ci] = new double[4]();
@@ -455,50 +551,24 @@ void cluster(string config_file, string data_file, string output_path) {
	      cmul_2[ci][cj] = cmul[ci][cj];
	    }
	  }
	double *gapv_2 = new double[3]();
	  gapv_2 = new double[3]();
	  for (int gi=0; gi<3; gi++) gapv_2[gi] = gapv[gi];
	double *tqev_2 = new double[3]();
	double *tqsv_2 = new double[3]();
	  tqev_2 = new double[3]();
	  tqsv_2 = new double[3]();
	  for (int ti=0; ti<3; ti++) {
	    tqev_2[ti] = tqev[ti];
	    tqsv_2[ti] = tqsv[ti];
	  }
	int nxi_2 = nxi;
	int nsph_2 = nsph;
	np_int mxndm_2 = mxndm;
	int inpol_2 = inpol;
	int iavm_2 = iavm;
	int npnt_2 = npnt;
	int npntts_2 = npntts;
	int isam_2 = isam;
	int lm_2 = lm;
	double th_2 = th;
	double thstp_2 = thstp;
	double thlst_2 = thlst;
	double ths_2 = ths;
	double thsstp_2 = thsstp;
	double thslst_2 = thslst;
	double ph_2 = ph;
	double phstp_2 = phstp;
	double phlst_2 = phlst;
	double phs_2 = phs;
	double phsstp_2 = phsstp;
	double phslst_2 = phslst;
	double th1_2 = th1;
	double ph1_2 = ph1;
	double ths1_2 = ths1;
	double phs1_2 = phs1;
	double thsca_2 = thsca;
	double *u_2 = new double[3]();
	double *us_2 = new double[3]();
	double *un_2 = new double[3]();
	double *uns_2 = new double[3]();
	double *up_2 = new double[3]();
	double *ups_2 = new double[3]();
	double *unmp_2 = new double[3]();
	double *unsmp_2 = new double[3]();
	double *upmp_2 = new double[3]();
	double *upsmp_2 = new double[3]();
	  u_2 = new double[3]();
	  us_2 = new double[3]();
	  un_2 = new double[3]();
	  uns_2 = new double[3]();
	  up_2 = new double[3]();
	  ups_2 = new double[3]();
	  unmp_2 = new double[3]();
	  unsmp_2 = new double[3]();
	  upmp_2 = new double[3]();
	  upsmp_2 = new double[3]();
	  for (int ti=0; ti<3; ti++) {
	    u_2[ti] = u[ti];
	    us_2[ti] = us[ti];
@@ -511,41 +581,25 @@ void cluster(string config_file, string data_file, string output_path) {
	    upmp_2[ti] = upmp[ti];
	    upsmp_2[ti] = upsmp[ti];
	  }
	double scan_2 = scan;
	double cfmp_2 = cfmp;
	double sfmp_2 = sfmp;
	double cfsp_2 = cfsp;
	double sfsp_2 = sfsp;
	double sqsfi_2 = sqsfi;
	double exri_2 = exri;
	int lcalc_2 = lcalc;
	dcomplex arg_2 = arg;
	double wn_2 = wn;
	double vk_2 = vk;
	np_int ndit_2 = ndit;
	dcomplex **am_2 = new dcomplex*[ndit];
	dcomplex *am_vector_2 = new dcomplex[ndit * ndit]();
	  am_2 = new dcomplex*[ndit];
	  am_vector_2 = new dcomplex[ndit * ndit]();
	  for (int ai = 0; ai < ndit; ai++) {
	    am_2[ai] = (am_vector_2 + ai * ndit);
	  }
	int isq_2 = isq;
	int ibf_2 = ibf;
	int nth_2 = nth;
	int nths_2 = nths;
	int nph_2 = nph;
	int nphs_2 = nph;
	int nk_2 = nk;
	int nks_2 = nks;
	int nkks_2 = nkks;

	
	}
	fstream &tppoan_2 = *tppoanp_2;
	// make sure all threads align here: I don't want the following loop to accidentally start for thread 0, possibly modifying some variables before they are copied by all other threads
#pragma omp barrier
	if (myompthread==0) printf("Syncing OpenMP threads and starting the loop on wavelengths\n");
	// ok, now I can actually start the parallel calculations
#pragma omp for
	for (jxi488 = 2; jxi488 <= nxi; jxi488++) {
	  jer = cluster_jxi488_cycle(jxi488, sconf_2, gconf_2, c1_2, c1ao_2, c2_2, c3_2, c4_2, c6_2, c9_2, output_2, output_path, gaps_2, tqse_2, tqspe_2, tqss_2, tqsps_2, zpv_2, gapm_2, gappm_2, nth_2, nths_2, nph_2, nphs_2, nk_2, nks_2, nkks_2, argi_2, args_2, gap_2, gapp_2, tqce_2, tqcpe_2, tqcs_2, tqcps_2, duk_2, tppoan_2, cextlr_2, cext_2, cmullr_2, cmul_2, gapv_2, tqev_2, tqsv_2, nxi_2, nsph_2, mxndm_2, inpol_2, iavm_2, npnt_2, npntts_2, isam_2, lm_2, th_2, thstp_2, thlst_2, ths_2, thsstp_2, thslst_2, ph_2, phstp_2, phlst_2, phs_2, phsstp_2, phslst_2, th1_2, ph1_2, ths1_2, phs1_2, thsca_2, u_2, us_2, un_2, uns_2, up_2, ups_2, unmp_2, unsmp_2, upmp_2, upsmp_2, scan_2, cfmp_2, sfmp_2, cfsp_2, sfsp_2, sqsfi_2, exri_2, lcalc_2, arg_2, wn_2, vk_2, ndit_2, am_2, isq_2, ibf_2);
	}

#pragma omp barrier
	// only threads different from 0 have to free local copies of variables and close local files
	if (myompthread != 0) {
	  delete sconf_2;
	  delete gconf_2;
	  delete c1_2;
@@ -555,14 +609,9 @@ void cluster(string config_file, string data_file, string output_path) {
	  delete c4_2;
	  delete c6_2;
	  delete c9_2;
#ifdef _OPENMP
	  fclose(output_2);
	tppoan_2.close();
#pragma omp barrier
	{
	  printf("Closing thread-local output files of thread %d and syncing threads\n", myompthread);
	}
#endif
	  tppoanp_2->close();
	  delete tppoanp_2;
	  delete[] gaps_2;
	  for (int ti = 0; ti <2 -1; ti++) {
	    delete[] tqse_2[ti];
@@ -630,11 +679,17 @@ void cluster(string config_file, string data_file, string output_path) {
	  delete[] upsmp_2;
	  delete[] am_vector_2;
	  delete[] am_2;
	}
#pragma omp barrier
	{
	  printf("Closed thread-local output files of thread %d, syncing threads\n", myompthread);
	}
      } // closes pragma omp parallel
#ifdef _OPENMP
#pragma omp barrier
      {
	for (int ri = 0; ri < ompnumthreads; ri++) {
	// thread 0 already wrote on global files, skip it and take care of appending the others
	for (int ri = 1; ri < ompnumthreads; ri++) {
	  // Giovanni, please add here in this loop code to reopen the temporary files, reread them and append them respectively to the global output and tppoan, before closing them
	  string partial_file_name = output_path + "/c_OCLU_" + to_string(ri);
	  printf("Copying ASCII output of thread %d of %d... ", ri, ompnumthreads - 1);
@@ -647,9 +702,10 @@ void cluster(string config_file, string data_file, string output_path) {
	  fclose(partial_output);
	  remove(partial_file_name.c_str());
	  printf("done.\n");
	}
	for (int ri = 0; ri < ompnumthreads; ri++) {
	  string partial_file_name = output_path + "/c_TPPOAN_" + to_string(ri);
	  //	}
	  //for (int ri = 1; ri < ompnumthreads; ri++) {
	  //string partial_file_name = output_path + "/c_TPPOAN_" + to_string(ri);
	  partial_file_name = output_path + "/c_TPPOAN_" + to_string(ri);
	  printf("Copying binary output of thread %d of %d... ", ri, ompnumthreads - 1);
	  fstream partial_tppoan;
	  partial_tppoan.open(partial_file_name.c_str(), ios::in | ios::binary);
@@ -666,7 +722,8 @@ void cluster(string config_file, string data_file, string output_path) {
	}
      }
#endif
      tppoan.close();
      tppoanp->close();
      delete tppoanp;
    } else { // In case TPPOAN could not be opened. Should never happen.
      printf("\nERROR: failed to open TPPOAN file.\n");
    }