Loading cuda-omp/hybrid/hybrid_cuda_omp.c 0 → 100644 +159 −0 Original line number Diff line number Diff line //////////////////////////////////////////////////////////////////////////////////////////////////// // // Passing OpenMP data to cuBlas. // // Author: David Goz // mail : david.goz@inaf.it // date : 02.09.2024 // code tested using nvhpc // // - Compile the code: // $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v // hybrid_cuda_omp.c -o hybrid_cuda_omp -lm -lcudart -lcublas // - Run the code: // $ export OMP_TARGET_OFFLOAD=mandatory // $ ./hybrid_cuda_omp //////////////////////////////////////////////////////////////////////////////////////////////////// #include <cuda_runtime.h> #include <cublas_v2.h> #include <assert.h> #include <float.h> #include <math.h> #include <stdio.h> #include <stdlib.h> #define N 512 #define SIZE ((N) * (N)) #define ALPHA 1.0 #define BETA 0.0 typedef double MyData; void InitHost(MyData *const restrict A, MyData *const restrict B, MyData *const restrict C) { //#pragma omp parallel for collapse(2) for (int i=0 ; i<N ; i++) for (int j=0 ; j<N ; j++) { A[(i * N) + j] = 1.0; B[(i * N) + j] = 2.0; C[(i * N) + j] = 0.0; } } void InitDev(MyData *const restrict A, MyData *const restrict B, MyData *const restrict C) { #pragma omp target teams loop collapse(2) for (int i=0 ; i<N ; i++) for (int j=0 ; j<N ; j++) { A[(i * N) + j] = 1.0; B[(i * N) + j] = 2.0; C[(i * N) + j] = 0.0; } return; } void HostDgemm(MyData *const restrict A, MyData *const restrict B, MyData *const restrict C, const MyData alpha, const MyData beta) { // C = alpha * A * B + beta * C; // naive calculation // #pragma omp parallel for collapse(2) for (int i=0 ; i<N ; i++) for (int j=0 ; j<N ; j++) { MyData sum = 0.0; for (int k=0 ; k<N ; k++) sum += A[(i * N) + k] * B[(k * N) + j]; C[(i * N) + j] = (alpha * sum) + (beta * C[(i * N) + j]); } return; } void check(MyData *const restrict host_array, MyData *const restrict dev_array) { int flag = 0; for 
(size_t i=0 ; i<SIZE ; i++) flag = ((fabs(host_array[i] - dev_array[i]) > FLT_EPSILON) ? 1 : flag); if (!flag) printf("\n\t Result OK"); else printf("\n\t Result wrong"); return; } int main() { // Host allocation MyData *buffer = (MyData *)malloc(4 * SIZE * sizeof(MyData)); assert(buffer != NULL); MyData *const restrict A = buffer; MyData *const restrict B = A + SIZE; MyData *const restrict C = B + SIZE; MyData *const restrict CC = C + SIZE; // Spawning 2 host threads #pragma omp parallel num_threads(2) { // Evaluate the Dgemm on the host #pragma omp single nowait { InitHost(A, B, CC); HostDgemm(A, B, CC, ALPHA, BETA); } // omp single #pragma omp single nowait { // Initialize cuBLAS library cublasHandle_t handle; cublasCreate(&handle); // Allocate A, B, C on the device #pragma omp target enter data map(alloc: A[0:SIZE], B[0:SIZE], C[0:SIZE]) // Init device with blocking omp target directive InitDev(A, B, C); // Define a target data region where A, B, and C pointers // refer to device's address space #pragma omp target data use_device_addr(A, B, C) { MyData const alpha = ALPHA; MyData const beta = BETA; cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, A, N, B, N, &beta, C, N); // CUDA synchronization point cudaDeviceSynchronize(); } // Fetch data from the device and deallocate #pragma omp target exit data map(from: C[0:SIZE]) map(delete: A[0:SIZE], B[0:SIZE]) cublasDestroy(handle); } // omp single } // synchronization point check(CC, C); free(buffer); return 0; } cuda-omp/omp/miscellaneous/asynchronous.c 0 → 100644 +110 −0 Original line number Diff line number Diff line //////////////////////////////////////////////////////////////////////////////////////////////////// // // Splitting the asynchronous vector addition task graph across four devices // // Author: David Goz // mail : david.goz@inaf.it // date : 28.08.2024 // code tested using nvhpc // // - Compile the code: // $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v 
asynchronous.c -o asynchronous_omp // - Run the code: // $ export OMP_TARGET_OFFLOAD=mandatory // $ ./asynchronous_omp //////////////////////////////////////////////////////////////////////////////////////////////////// #include <stdio.h> #include <stdlib.h> #include <omp.h> #include <assert.h> typedef int MyData; #define NDEBUG void check(const MyData *const C, const size_t size) { int flag = 0; for (size_t i=0 ; i<size ; i++) flag = ((C[i] != 98) ? 1 : flag); if (flag) printf("\n\t Result wrong \n"); else printf("\n\t Result OK \n"); return; } int main() { // alloc data on the device const int NumDev = omp_get_num_devices(); if (NumDev != 4) { printf("\n\t The program runs using 4 GPUs... aborting...\n"); exit(EXIT_FAILURE); } const int size = 1000000; MyData *buffer = (MyData *)malloc(3 * size * sizeof(MyData)); assert(buffer != NULL); MyData *const restrict A = buffer; MyData *const restrict B = A + size; MyData *const restrict C = B + size; // init A on GPU 0 #pragma omp target nowait \ map(tofrom: A[0:size]) \ depend(out: A[0:size]) \ device(0) { #pragma omp loop for (int i=0 ; i<size ; i++) A[i] = (MyData)i; } // device 0 // init B on GPU 1 #pragma omp target nowait \ map(tofrom: B[0:size]) \ depend(out: B[0:size]) \ device(1) { #pragma omp loop for (int i=0 ; i<size ; i++) B[i] = (MyData)-i; } // device 1 // init C on GPU 2 #pragma omp target nowait \ map(tofrom: C[0:size]) \ depend(out: C[0:size]) \ device(2) { #pragma omp loop for (int i=0 ; i<size ; i++) C[i] = 99; } // device 2 // perform the calculation on GPU 3 #pragma omp target nowait \ map(to: A[0:size], B[0:size]) map(tofrom: C[0:size]) \ depend(in: A[0:size], B[0:size]) depend(inout: C[0:size]) \ device(3) { #pragma omp loop for (int i=0 ; i<size ; i++) C[i] += (A[i] + B[i] - 1); } // device 3 #pragma omp task depend(in: C[0:size]) check(C, size); free(buffer); return 0; } cuda-omp/omp/miscellaneous/dependencies.c 0 → 100644 +95 −0 Original line number Diff line number Diff line 
//////////////////////////////////////////////////////////////////////////////////////////////////// // // Task dependencies: // example of synchronization by defining explicit dependencies between the tasks. // // Author: David Goz // mail : david.goz@inaf.it // date : 28.08.2024 // code tested using nvhpc // // - Compile the code: // $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v dependencies.c -o dependencies_omp // - Run the code: // $ export OMP_TARGET_OFFLOAD=mandatory // $ ./dependencies_omp //////////////////////////////////////////////////////////////////////////////////////////////////// #include <stdio.h> #include <stdlib.h> #include <omp.h> #include <assert.h> typedef int MyData; #define NDEBUG void check(const MyData *const C, const size_t size) { int flag = 0; for (size_t i=0 ; i<size ; i++) flag = ((C[i] != 0) ? 1 : flag); if (flag) printf("\n\t Result wrong \n"); else printf("\n\t Result OK \n"); return; } int main() { const int size = 1000000; MyData *C = (MyData *)malloc(size * sizeof(MyData)); assert(C != NULL); // alloc data on the device const int dev_gpu = omp_get_default_device(); MyData *gpu_buffer = (MyData *)omp_target_alloc(2 * size * sizeof(MyData), dev_gpu); assert(gpu_buffer != NULL); MyData *A = gpu_buffer; MyData *B = A + size; // init C with random number for (int i=0 ; i<size ; i++) C[i] = rand() % size; #pragma omp target enter data map(to: C[0:size]) // init A #pragma omp target nowait depend(out: A[0:size]) is_device_ptr(A) { #pragma omp loop for (int i=0 ; i<size; i++) A[i] = i; } // init B #pragma omp target nowait depend(out: B[0:size]) is_device_ptr(B) { #pragma omp loop for (int i=0 ; i<size; i++) B[i] = -i; } // vector add #pragma omp target nowait depend(in: A[0:size], B[0:size]) depend(out: C[0:size]) is_device_ptr(A, B) { #pragma omp loop for (int i=0 ; i<size; i++) C[i] = A[i] + B[i]; } #pragma omp target update from (C[0:size]) depend(in: C[0:size]) check(C, size); omp_target_free(gpu_buffer, 
dev_gpu); free(C); #pragma omp target exit data map(release: C[0:size]) return 0; } cuda-omp/omp/miscellaneous/multiple_devices.c +50 −36 Original line number Diff line number Diff line //////////////////////////////////////////////////////////////////////////////////////////////////// // // Offload to multiple devices within the same node: // - using one OMP host-thread per device synchronously; // - using one OMP host-thread asynchronously. // // Author: David Goz // mail : david.goz@inaf.it // date : 27.08.2024 // code tested using nvhpc // // - Compile the code: // $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v multiple_devices.c -o multiple_devices_omp // - Run the code: // $ export OMP_TARGET_OFFLOAD=mandatory // $ ./multiple_devices_omp //////////////////////////////////////////////////////////////////////////////////////////////////// #include <stdio.h> #include <stdlib.h> #include <omp.h> Loading @@ -15,30 +34,10 @@ typedef int MyData; #error "N_PER_DEV < BLOCKSIZE" #endif #define NDEBUG void check(const MyData *const restrict vector_cpu, const MyData *const restrict vector_gpu, const size_t size) { int flag = 0; for (size_t i=0 ; i<size ; i++) { #if !defined(NDEBUG) printf("\n\t vector_cpu[%zu] = %d - vector_gpu[%zu] = %d", i, vector_cpu[i], i, vector_gpu[i]); #endif flag = ((vector_cpu[i] != vector_gpu[i]) ? 
1 : flag); } if (flag) printf("\n\t Result wrong \n"); else printf("\n\t Result OK \n"); #define TRUE 1 #define FALSE 0 return; } #define NDEBUG void VectorAdd(const MyData *const restrict A, const MyData *const restrict B, Loading @@ -46,7 +45,8 @@ void VectorAdd(const MyData *const restrict A, const int offset, const int size, const int dev, const int nblocks) const int nblocks, const int async) { #pragma omp target \ teams num_teams(nblocks) thread_limit(BLOCKSIZE) \ Loading Loading @@ -84,6 +84,8 @@ int main() // get the number of the available devices const int NumDev = omp_get_num_devices(); const int nblocks = ((N_PER_DEV + BLOCKSIZE - 1) / BLOCKSIZE); // global vector size const int size = (NumDev * N_PER_DEV); assert(size > 0); Loading @@ -103,6 +105,7 @@ int main() C_CPU[i] = A[i] + B[i]; } // each device is managed by a single OMP thread #pragma omp parallel num_threads(NumDev) { // check Loading @@ -119,11 +122,22 @@ int main() const int tid = omp_get_thread_num(); const int offset = (tid * N_PER_DEV); const int nblocks = ((N_PER_DEV + BLOCKSIZE - 1) / BLOCKSIZE); VectorAdd(A, B, C_GPU, offset, N_PER_DEV, tid, nblocks); VectorAdd(A, B, C_GPU, offset, N_PER_DEV, tid, nblocks, FALSE); } // omp parallel check(C_CPU, C_GPU, size); memset(C_GPU, 0, (size * sizeof(MyData))); // one OMP thread manages asynchronously all the devices for (int dev=0 ; dev<NumDev ; dev++) { const int offset = (dev * N_PER_DEV); VectorAdd(A, B, C_GPU, offset, N_PER_DEV, dev, nblocks, TRUE); } // host-devices synchronization #pragma omp taskwait check(C_CPU, C_GPU, size); free(buffer); Loading cuda-omp/omp/miscellaneous/task_reductions.c 0 → 100644 +83 −0 Original line number Diff line number Diff line //////////////////////////////////////////////////////////////////////////////////////////////////// // // // Author: David Goz // mail : david.goz@inaf.it // date : 28.08.2024 // code tested using nvhpc // // The code does not compile with nvc // The code compiles with clang 
//////////////////////////////////////////////////////////////////////////////////////////////////// #include <stdio.h> #include <stdlib.h> #include <omp.h> #include <assert.h> typedef int MyData; #define N_PER_DEV 1000000 #define NDEBUG int main() { // get the number of the available devices const int NumDev = omp_get_num_devices(); // global vector size const int size = (NumDev * N_PER_DEV); assert(size > 0); MyData *buffer = (MyData *)malloc(2 * size * sizeof(MyData)); assert(buffer != NULL); MyData *const restrict A = buffer; MyData *const restrict B = A + size; MyData sum_cpu = (MyData)0; #pragma omp parallel for simd reduction(+: sum_cpu) for (int i=0 ; i<size ; i++) { A[i] = rand() % N_PER_DEV; B[i] = rand() % N_PER_DEV; sum_cpu += A[i] + B[i]; } MyData sum_gpu = (MyData)0; #pragma omp parallel num_threads(NumDev) reduction(task, +:sum_gpu) { #pragma omp single { if (NumDev != omp_get_num_threads()) exit(EXIT_FAILURE); else { printf("\n\t Using %d GPUs \n", NumDev); fflush(stdout); } } // implicit barrier const int tid = omp_get_thread_num(); const int offset = (tid * N_PER_DEV); #pragma omp target \ map(to: A[offset:N_PER_DEV], B[offset:N_PER_DEV]) \ device(tid) \ in_reduction(+: sum_gpu) #pragma omp loop reduction(+: sum_gpu) for (int i=offset ; i<(offset + N_PER_DEV) ; i++) sum_gpu += (A[i] + B[i]); } // omp parallel if (sum_cpu == sum_gpu) printf("\n\t Result OK \n"); else printf("\n\t Result wrong \n"); free(buffer); return 0; } Loading
cuda-omp/hybrid/hybrid_cuda_omp.c 0 → 100644 +159 −0 Original line number Diff line number Diff line //////////////////////////////////////////////////////////////////////////////////////////////////// // // Passing OpenMP data to cuBlas. // // Author: David Goz // mail : david.goz@inaf.it // date : 02.09.2024 // code tested using nvhpc // // - Compile the code: // $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v // hybrid_cuda_omp.c -o hybrid_cuda_omp -lm -lcudart -lcublas // - Run the code: // $ export OMP_TARGET_OFFLOAD=mandatory // $ ./hybrid_cuda_omp //////////////////////////////////////////////////////////////////////////////////////////////////// #include <cuda_runtime.h> #include <cublas_v2.h> #include <assert.h> #include <float.h> #include <math.h> #include <stdio.h> #include <stdlib.h> #define N 512 #define SIZE ((N) * (N)) #define ALPHA 1.0 #define BETA 0.0 typedef double MyData; void InitHost(MyData *const restrict A, MyData *const restrict B, MyData *const restrict C) { //#pragma omp parallel for collapse(2) for (int i=0 ; i<N ; i++) for (int j=0 ; j<N ; j++) { A[(i * N) + j] = 1.0; B[(i * N) + j] = 2.0; C[(i * N) + j] = 0.0; } } void InitDev(MyData *const restrict A, MyData *const restrict B, MyData *const restrict C) { #pragma omp target teams loop collapse(2) for (int i=0 ; i<N ; i++) for (int j=0 ; j<N ; j++) { A[(i * N) + j] = 1.0; B[(i * N) + j] = 2.0; C[(i * N) + j] = 0.0; } return; } void HostDgemm(MyData *const restrict A, MyData *const restrict B, MyData *const restrict C, const MyData alpha, const MyData beta) { // C = alpha * A * B + beta * C; // naive calculation // #pragma omp parallel for collapse(2) for (int i=0 ; i<N ; i++) for (int j=0 ; j<N ; j++) { MyData sum = 0.0; for (int k=0 ; k<N ; k++) sum += A[(i * N) + k] * B[(k * N) + j]; C[(i * N) + j] = (alpha * sum) + (beta * C[(i * N) + j]); } return; } void check(MyData *const restrict host_array, MyData *const restrict dev_array) { int flag = 0; for (size_t i=0 ; 
i<SIZE ; i++) flag = ((fabs(host_array[i] - dev_array[i]) > FLT_EPSILON) ? 1 : flag); if (!flag) printf("\n\t Result OK"); else printf("\n\t Result wrong"); return; } int main() { // Host allocation MyData *buffer = (MyData *)malloc(4 * SIZE * sizeof(MyData)); assert(buffer != NULL); MyData *const restrict A = buffer; MyData *const restrict B = A + SIZE; MyData *const restrict C = B + SIZE; MyData *const restrict CC = C + SIZE; // Spawning 2 host threads #pragma omp parallel num_threads(2) { // Evaluate the Dgemm on the host #pragma omp single nowait { InitHost(A, B, CC); HostDgemm(A, B, CC, ALPHA, BETA); } // omp single #pragma omp single nowait { // Initialize cuBLAS library cublasHandle_t handle; cublasCreate(&handle); // Allocate A, B, C on the device #pragma omp target enter data map(alloc: A[0:SIZE], B[0:SIZE], C[0:SIZE]) // Init device with blocking omp target directive InitDev(A, B, C); // Define a target data region where A, B, and C pointers // refer to device's address space #pragma omp target data use_device_addr(A, B, C) { MyData const alpha = ALPHA; MyData const beta = BETA; cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, A, N, B, N, &beta, C, N); // CUDA synchronization point cudaDeviceSynchronize(); } // Fetch data from the device and deallocate #pragma omp target exit data map(from: C[0:SIZE]) map(delete: A[0:SIZE], B[0:SIZE]) cublasDestroy(handle); } // omp single } // synchronization point check(CC, C); free(buffer); return 0; }
/* ==================== cuda-omp/omp/miscellaneous/asynchronous.c ==================== */
////////////////////////////////////////////////////////////////////////////////////////////////////
//
// Splitting the asynchronous vector addition task graph across four devices
//
// Author: David Goz
// mail : david.goz@inaf.it
// date : 28.08.2024
// code tested using nvhpc
//
// - Compile the code:
//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v asynchronous.c -o asynchronous_omp
// - Run the code:
//   $ export OMP_TARGET_OFFLOAD=mandatory
//   $ ./asynchronous_omp
////////////////////////////////////////////////////////////////////////////////////////////////////

#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <assert.h>

typedef int MyData;

#define NDEBUG

// Verify the result: every element must be 98 = 99 + i + (-i) - 1.
void check(const MyData *const C, const size_t size)
{
  int flag = 0;
  for (size_t i=0 ; i<size ; i++)
    flag = ((C[i] != 98) ? 1 : flag);

  if (flag) printf("\n\t Result wrong \n");
  else      printf("\n\t Result OK \n");

  return;
}

int main()
{
  // alloc data on the device
  const int NumDev = omp_get_num_devices();
  if (NumDev != 4)
    {
      printf("\n\t The program runs using 4 GPUs... aborting...\n");
      exit(EXIT_FAILURE);
    }

  const int size = 1000000;
  MyData *buffer = (MyData *)malloc(3 * size * sizeof(MyData));
  assert(buffer != NULL);

  MyData *const restrict A = buffer;
  MyData *const restrict B = A + size;
  MyData *const restrict C = B + size;

  // init A on GPU 0
#pragma omp target nowait                       \
  map(tofrom: A[0:size])                        \
  depend(out: A[0:size])                        \
  device(0)
  {
#pragma omp loop
    for (int i=0 ; i<size ; i++)
      A[i] = (MyData)i;
  } // device 0

  // init B on GPU 1
#pragma omp target nowait                       \
  map(tofrom: B[0:size])                        \
  depend(out: B[0:size])                        \
  device(1)
  {
#pragma omp loop
    for (int i=0 ; i<size ; i++)
      B[i] = (MyData)-i;
  } // device 1

  // init C on GPU 2
#pragma omp target nowait                       \
  map(tofrom: C[0:size])                        \
  depend(out: C[0:size])                        \
  device(2)
  {
#pragma omp loop
    for (int i=0 ; i<size ; i++)
      C[i] = 99;
  } // device 2

  // perform the calculation on GPU 3
#pragma omp target nowait                                       \
  map(to: A[0:size], B[0:size]) map(tofrom: C[0:size])          \
  depend(in: A[0:size], B[0:size]) depend(inout: C[0:size])     \
  device(3)
  {
#pragma omp loop
    for (int i=0 ; i<size ; i++)
      C[i] += (A[i] + B[i] - 1);
  } // device 3

#pragma omp task depend(in: C[0:size])
  check(C, size);

  // BUGFIX: wait for the check task (and, transitively, the whole
  // device task graph) to finish before releasing the buffer;
  // without this barrier free() races with the deferred tasks
  // (use-after-free).
#pragma omp taskwait

  free(buffer);

  return 0;
}
/* ==================== cuda-omp/omp/miscellaneous/dependencies.c ==================== */
////////////////////////////////////////////////////////////////////////////////////////////////////
//
// Task dependencies:
// example of synchronization by defining explicit dependencies between the tasks.
//
// Author: David Goz
// mail : david.goz@inaf.it
// date : 28.08.2024
// code tested using nvhpc
//
// - Compile the code:
//   $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v dependencies.c -o dependencies_omp
// - Run the code:
//   $ export OMP_TARGET_OFFLOAD=mandatory
//   $ ./dependencies_omp
////////////////////////////////////////////////////////////////////////////////////////////////////

#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <assert.h>

typedef int MyData;

#define NDEBUG

// Verify the result: C = A + B = i + (-i) = 0 everywhere.
void check(const MyData *const C, const size_t size)
{
  int flag = 0;
  for (size_t i=0 ; i<size ; i++)
    flag = ((C[i] != 0) ? 1 : flag);

  if (flag) printf("\n\t Result wrong \n");
  else      printf("\n\t Result OK \n");

  return;
}

int main()
{
  const int size = 1000000;
  MyData *C = (MyData *)malloc(size * sizeof(MyData));
  assert(C != NULL);

  // alloc data on the device
  const int dev_gpu = omp_get_default_device();
  MyData *gpu_buffer = (MyData *)omp_target_alloc(2 * size * sizeof(MyData), dev_gpu);
  assert(gpu_buffer != NULL);

  MyData *A = gpu_buffer;
  MyData *B = A + size;

  // init C with random number
  for (int i=0 ; i<size ; i++)
    C[i] = rand() % size;

#pragma omp target enter data map(to: C[0:size])

  // init A
#pragma omp target nowait depend(out: A[0:size]) is_device_ptr(A)
  {
#pragma omp loop
    for (int i=0 ; i<size; i++)
      A[i] = i;
  }

  // init B
#pragma omp target nowait depend(out: B[0:size]) is_device_ptr(B)
  {
#pragma omp loop
    for (int i=0 ; i<size; i++)
      B[i] = -i;
  }

  // vector add
#pragma omp target nowait depend(in: A[0:size], B[0:size]) depend(out: C[0:size]) is_device_ptr(A, B)
  {
#pragma omp loop
    for (int i=0 ; i<size; i++)
      C[i] = A[i] + B[i];
  }

  // Blocking update: waits for the producer task of C (depend(in)),
  // then copies the device C back to the host.
#pragma omp target update from (C[0:size]) depend(in: C[0:size])

  check(C, size);

  omp_target_free(gpu_buffer, dev_gpu);

  // BUGFIX: release the device mapping of C *before* the host pointer
  // is freed.  The original code called free(C) first, leaving the
  // OpenMP mapping table keyed on a dangling host address.
#pragma omp target exit data map(release: C[0:size])

  free(C);

  return 0;
}
cuda-omp/omp/miscellaneous/multiple_devices.c +50 −36 Original line number Diff line number Diff line //////////////////////////////////////////////////////////////////////////////////////////////////// // // Offload to multiple devices within the same node: // - using one OMP host-thread per device synchronously; // - using one OMP host-thread asynchronously. // // Author: David Goz // mail : david.goz@inaf.it // date : 27.08.2024 // code tested using nvhpc // // - Compile the code: // $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v multiple_devices.c -o multiple_devices_omp // - Run the code: // $ export OMP_TARGET_OFFLOAD=mandatory // $ ./multiple_devices_omp //////////////////////////////////////////////////////////////////////////////////////////////////// #include <stdio.h> #include <stdlib.h> #include <omp.h> Loading @@ -15,30 +34,10 @@ typedef int MyData; #error "N_PER_DEV < BLOCKSIZE" #endif #define NDEBUG void check(const MyData *const restrict vector_cpu, const MyData *const restrict vector_gpu, const size_t size) { int flag = 0; for (size_t i=0 ; i<size ; i++) { #if !defined(NDEBUG) printf("\n\t vector_cpu[%zu] = %d - vector_gpu[%zu] = %d", i, vector_cpu[i], i, vector_gpu[i]); #endif flag = ((vector_cpu[i] != vector_gpu[i]) ? 
1 : flag); } if (flag) printf("\n\t Result wrong \n"); else printf("\n\t Result OK \n"); #define TRUE 1 #define FALSE 0 return; } #define NDEBUG void VectorAdd(const MyData *const restrict A, const MyData *const restrict B, Loading @@ -46,7 +45,8 @@ void VectorAdd(const MyData *const restrict A, const int offset, const int size, const int dev, const int nblocks) const int nblocks, const int async) { #pragma omp target \ teams num_teams(nblocks) thread_limit(BLOCKSIZE) \ Loading Loading @@ -84,6 +84,8 @@ int main() // get the number of the available devices const int NumDev = omp_get_num_devices(); const int nblocks = ((N_PER_DEV + BLOCKSIZE - 1) / BLOCKSIZE); // global vector size const int size = (NumDev * N_PER_DEV); assert(size > 0); Loading @@ -103,6 +105,7 @@ int main() C_CPU[i] = A[i] + B[i]; } // each device is managed by a single OMP thread #pragma omp parallel num_threads(NumDev) { // check Loading @@ -119,11 +122,22 @@ int main() const int tid = omp_get_thread_num(); const int offset = (tid * N_PER_DEV); const int nblocks = ((N_PER_DEV + BLOCKSIZE - 1) / BLOCKSIZE); VectorAdd(A, B, C_GPU, offset, N_PER_DEV, tid, nblocks); VectorAdd(A, B, C_GPU, offset, N_PER_DEV, tid, nblocks, FALSE); } // omp parallel check(C_CPU, C_GPU, size); memset(C_GPU, 0, (size * sizeof(MyData))); // one OMP thread manages asynchronously all the devices for (int dev=0 ; dev<NumDev ; dev++) { const int offset = (dev * N_PER_DEV); VectorAdd(A, B, C_GPU, offset, N_PER_DEV, dev, nblocks, TRUE); } // host-devices synchronization #pragma omp taskwait check(C_CPU, C_GPU, size); free(buffer); Loading
cuda-omp/omp/miscellaneous/task_reductions.c 0 → 100644 +83 −0 Original line number Diff line number Diff line //////////////////////////////////////////////////////////////////////////////////////////////////// // // // Author: David Goz // mail : david.goz@inaf.it // date : 28.08.2024 // code tested using nvhpc // // The code does not compile with nvc // The code compiles with clang //////////////////////////////////////////////////////////////////////////////////////////////////// #include <stdio.h> #include <stdlib.h> #include <omp.h> #include <assert.h> typedef int MyData; #define N_PER_DEV 1000000 #define NDEBUG int main() { // get the number of the available devices const int NumDev = omp_get_num_devices(); // global vector size const int size = (NumDev * N_PER_DEV); assert(size > 0); MyData *buffer = (MyData *)malloc(2 * size * sizeof(MyData)); assert(buffer != NULL); MyData *const restrict A = buffer; MyData *const restrict B = A + size; MyData sum_cpu = (MyData)0; #pragma omp parallel for simd reduction(+: sum_cpu) for (int i=0 ; i<size ; i++) { A[i] = rand() % N_PER_DEV; B[i] = rand() % N_PER_DEV; sum_cpu += A[i] + B[i]; } MyData sum_gpu = (MyData)0; #pragma omp parallel num_threads(NumDev) reduction(task, +:sum_gpu) { #pragma omp single { if (NumDev != omp_get_num_threads()) exit(EXIT_FAILURE); else { printf("\n\t Using %d GPUs \n", NumDev); fflush(stdout); } } // implicit barrier const int tid = omp_get_thread_num(); const int offset = (tid * N_PER_DEV); #pragma omp target \ map(to: A[offset:N_PER_DEV], B[offset:N_PER_DEV]) \ device(tid) \ in_reduction(+: sum_gpu) #pragma omp loop reduction(+: sum_gpu) for (int i=offset ; i<(offset + N_PER_DEV) ; i++) sum_gpu += (A[i] + B[i]); } // omp parallel if (sum_cpu == sum_gpu) printf("\n\t Result OK \n"); else printf("\n\t Result wrong \n"); free(buffer); return 0; }