Loading cuda-omp/omp/miscellaneous/globals/Makefile +2 −2 Original line number Diff line number Diff line COMPILER_CXX ?= clang++-18 COMPILER_CXX ?= clang++ DEBUG ?= YES FLAGS ?= -fopenmp --offload-arch=native -fopenmp-targets=nvptx64-nvidia-cuda FLAGS ?= # -fopenmp # --offload-arch=native -fopenmp-targets=nvptx64-nvidia-cuda # executable name EXEC ?= globals Loading cuda-omp/omp/miscellaneous/globals/allvars.hpp +1 −1 Original line number Diff line number Diff line Loading @@ -5,7 +5,7 @@ constexpr std::size_t X = 3; constexpr std::size_t Y = 6; constexpr std::size_t Z = 65536; using MyData = double; using MyData = int; // Global pointer declared in target region #pragma omp declare target Loading cuda-omp/omp/miscellaneous/globals/globals.cpp +32 −37 Original line number Diff line number Diff line Loading @@ -3,6 +3,7 @@ #include <omp.h> #include <cassert> #include <new> #include <algorithm> #include "allvars.hpp" Loading @@ -23,54 +24,46 @@ int main() for (std::size_t z=0 ; z<Z ; z++) { global_ptr[x][y][z] = static_cast<MyData>(1); global_ptr[x][y][z] = static_cast<MyData>(z + 1); } // loop over Z } // loop over Y } // loop over X std::cout << "\n\t global_ptr allocated on the host \n" << std::endl; // // Allocate memory on the device and set the global pointer // #pragma omp target enter data map(alloc: global_ptr[0:1][0:6][0:SIZE]) // Allocate memory on the device and set the global pointer #pragma omp target enter data map(to: global_ptr[0:X][0:Y][0:Z]) // for // // Copy data from host to device // #pragma omp target data map(to: host_data[0: SIZE]) // { // #pragma omp target teams distribute parallel for // for (int index=0 ; index<SIZE ; index++) // { // const int tid = omp_get_thread_num(); // const int team = omp_get_team_num(); // const int nthr = omp_get_num_threads(); // const int whoAmI = tid + (team * nthr); // MyData diff[6]; // for (std::size_t i=0 ; i<6 : i++) // { // diff[i] = global_ptr[0][i][index] * ; // } #pragma omp target teams distribute parallel for for (std::size_t index=0 ; index<Z ; index++) { MyData diff = 0; for (std::size_t x=0 ; x<X ; x++) for (std::size_t y=0 ; y<Y ; y++) diff += global_ptr[x][y][index]; // { // global_ptr[i] = (host_data[i] * 2); // } // } // kernel for (std::size_t x=0 ; x<X ; x++) for (std::size_t y=0 ; y<Y ; y++) global_ptr[x][y][index] = (diff / (X * Y * (index + 1))); } // kernel // // Copy data back from device to host using the global pointer // #pragma omp target update from(global_ptr[0: SIZE]) // } // Device-host synchronization #pragma omp target update from(global_ptr[0:X][0:Y][0:Z]) // std::cout << "\n\t Result after device computation:" << std::endl; // for (std::size_t i=0 ; i<SIZE ; i++) // { // std::cout << global_ptr[i] << " "; // } // std::cout << std::endl; // Check if any element along Z is equal to 1 for (std::size_t x=0 ; x<X ; x++) for (std::size_t y=0 ; y<Y ; y++) { const bool One = std::all_of(&global_ptr[x][y][0], &global_ptr[x][y][Z], [](const MyData x) {return (x == 1);}); if (One == false) { std::cout << "\n\t Test failed \n" << std::endl; return -1; } } // Deallocate memory on the device //#pragma omp target exit data map(delete: global_ptr) #pragma omp target exit data map(delete: global_ptr[0:X][0:Y][0:Z]) // deallocate host memory for (std::size_t x=0 ; x<X ; x++) Loading @@ -83,5 +76,7 @@ int main() } delete[] global_ptr; std::cout << "\n\t Test OK! \n" << std::endl; return 0; } Loading
cuda-omp/omp/miscellaneous/globals/Makefile +2 −2 Original line number Diff line number Diff line COMPILER_CXX ?= clang++-18 COMPILER_CXX ?= clang++ DEBUG ?= YES FLAGS ?= -fopenmp --offload-arch=native -fopenmp-targets=nvptx64-nvidia-cuda FLAGS ?= # -fopenmp # --offload-arch=native -fopenmp-targets=nvptx64-nvidia-cuda # executable name EXEC ?= globals Loading
cuda-omp/omp/miscellaneous/globals/allvars.hpp +1 −1 Original line number Diff line number Diff line Loading @@ -5,7 +5,7 @@ constexpr std::size_t X = 3; constexpr std::size_t Y = 6; constexpr std::size_t Z = 65536; using MyData = double; using MyData = int; // Global pointer declared in target region #pragma omp declare target Loading
cuda-omp/omp/miscellaneous/globals/globals.cpp +32 −37 Original line number Diff line number Diff line Loading @@ -3,6 +3,7 @@ #include <omp.h> #include <cassert> #include <new> #include <algorithm> #include "allvars.hpp" Loading @@ -23,54 +24,46 @@ int main() for (std::size_t z=0 ; z<Z ; z++) { global_ptr[x][y][z] = static_cast<MyData>(1); global_ptr[x][y][z] = static_cast<MyData>(z + 1); } // loop over Z } // loop over Y } // loop over X std::cout << "\n\t global_ptr allocated on the host \n" << std::endl; // // Allocate memory on the device and set the global pointer // #pragma omp target enter data map(alloc: global_ptr[0:1][0:6][0:SIZE]) // Allocate memory on the device and set the global pointer #pragma omp target enter data map(to: global_ptr[0:X][0:Y][0:Z]) // for // // Copy data from host to device // #pragma omp target data map(to: host_data[0: SIZE]) // { // #pragma omp target teams distribute parallel for // for (int index=0 ; index<SIZE ; index++) // { // const int tid = omp_get_thread_num(); // const int team = omp_get_team_num(); // const int nthr = omp_get_num_threads(); // const int whoAmI = tid + (team * nthr); // MyData diff[6]; // for (std::size_t i=0 ; i<6 : i++) // { // diff[i] = global_ptr[0][i][index] * ; // } #pragma omp target teams distribute parallel for for (std::size_t index=0 ; index<Z ; index++) { MyData diff = 0; for (std::size_t x=0 ; x<X ; x++) for (std::size_t y=0 ; y<Y ; y++) diff += global_ptr[x][y][index]; // { // global_ptr[i] = (host_data[i] * 2); // } // } // kernel for (std::size_t x=0 ; x<X ; x++) for (std::size_t y=0 ; y<Y ; y++) global_ptr[x][y][index] = (diff / (X * Y * (index + 1))); } // kernel // // Copy data back from device to host using the global pointer // #pragma omp target update from(global_ptr[0: SIZE]) // } // Device-host synchronization #pragma omp target update from(global_ptr[0:X][0:Y][0:Z]) // std::cout << "\n\t Result after device computation:" << std::endl; // for (std::size_t i=0 ; i<SIZE ; i++) // { // std::cout << global_ptr[i] << " "; // } // std::cout << std::endl; // Check if any element along Z is equal to 1 for (std::size_t x=0 ; x<X ; x++) for (std::size_t y=0 ; y<Y ; y++) { const bool One = std::all_of(&global_ptr[x][y][0], &global_ptr[x][y][Z], [](const MyData x) {return (x == 1);}); if (One == false) { std::cout << "\n\t Test failed \n" << std::endl; return -1; } } // Deallocate memory on the device //#pragma omp target exit data map(delete: global_ptr) #pragma omp target exit data map(delete: global_ptr[0:X][0:Y][0:Z]) // deallocate host memory for (std::size_t x=0 ; x<X ; x++) Loading @@ -83,5 +76,7 @@ int main() } delete[] global_ptr; std::cout << "\n\t Test OK! \n" << std::endl; return 0; }