Loading cuda-omp/cuda/7/mat_mult_block.cu +2 −2 Original line number Diff line number Diff line Loading @@ -257,9 +257,9 @@ int main() const dim3 block = {(BLOCK * BLOCK), 1, 1}; /////////////////////////// GPU naive block algorithm //////////////////////////////////////// time = 0.0; GPU_mat_mult_block<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up cudaDeviceSynchronize(); time = 0.0; for (unsigned short int loop=0 ; loop<LOOP ; loop++) { const double start = wall_time(); Loading @@ -274,9 +274,9 @@ int main() ///////////////////////////////////////////////////////////////////////////////////// /////////////////////////// GPU block shared memory algorithm /////////////////////// time = 0.0; GPU_mat_mult_block_shared<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up cudaDeviceSynchronize(); time = 0.0; for (unsigned short int loop=0 ; loop<LOOP ; loop++) { const double start = wall_time(); Loading cuda-omp/hybrid/hybrid_cublas_omp.c 0 → 100644 +282 −0 Original line number Diff line number Diff line //////////////////////////////////////////////////////////////////////////////////////////////////// // // Using CUDA data in OpenMP // // Author: David Goz // mail : david.goz@inaf.it // date : 03.09.2024 // code tested using nvhpc // // - Compile the code: // - using nvc // $ nvc -O3 -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v // hybrid_cuda_omp.c -o hybrid_cuda_omp -lm -lcudart -lcublas // - using clang // $ clang -O3 -v -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda // hybrid_cuda_omp.c -o hybrid_cuda_omp -lm -lcudart -lcublas // // - Run the code: // $ export OMP_TARGET_OFFLOAD=mandatory // $ ./hybrid_cuda_omp //////////////////////////////////////////////////////////////////////////////////////////////////// #include <cuda_runtime.h> #include <cublas_v2.h> #include <assert.h> #include <float.h> #include <math.h> #include <stdio.h> #include <stdlib.h> #include <time.h> #include <omp.h> #define N 2048 #define SIZE ((N) * (N)) #define ALPHA 1.0 #define BETA 0.0 #define HOST 0 #define DEV 1 #define LOOP 10 #define INIT 0 #define KERNEL 1 #define DATA 2 typedef double MyData; static double _time[2][3]; static int thr[2]; double process_time() { struct timespec ts; clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9; return ret; } double thread_time() { struct timespec ts; clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9; return ret; } void InitHost(MyData *const restrict A, MyData *const restrict B, MyData *const restrict C, int *const restrict thr) { double start; #pragma omp parallel { #pragma omp barrier #pragma omp master { *thr = omp_get_num_threads(); start = thread_time(); } #pragma omp barrier #pragma omp for collapse(2) for (int i=0 ; i<N ; i++) for (int j=0 ; j<N ; j++) { A[(i * N) + j] = 1.0; B[(i * N) + j] = 2.0; C[(i * N) + j] = 0.0; } #pragma omp master { _time[HOST][INIT] += (thread_time() - start); } } // omp parallel return; } void InitDev(MyData *const restrict A, MyData *const restrict B, MyData *const restrict C) { const double start = thread_time(); #pragma omp target teams loop collapse(2) for (int i=0 ; i<N ; i++) for (int j=0 ; j<N ; j++) { A[(i * N) + j] = 1.0; B[(i * N) + j] = 2.0; C[(i * N) + j] = 0.0; } _time[DEV][INIT] += (thread_time() - start); return; } void HostDgemm(MyData *const restrict A, MyData *const restrict B, MyData *const restrict C, const MyData alpha, const MyData beta, int *const restrict thr) { // C = alpha * A * B + beta * C; double start; // naive calculation #pragma omp parallel { #pragma omp barrier #pragma omp master { *thr = omp_get_num_threads(); start = thread_time(); } #pragma omp barrier #pragma omp for collapse(2) for (int i=0 ; i<N ; i++) for (int j=0 ; j<N ; j++) { MyData sum = 0.0; for (int k=0 ; k<N ; k++) sum += A[(i * N) + k] * B[(k * N) + j]; C[(i * N) + j] = (alpha * sum) + (beta * C[(i * N) + j]); } #pragma omp master { _time[HOST][KERNEL] += (thread_time() - start); } } // omp parallel return; } void check(MyData *const restrict host_array, MyData *const restrict dev_array) { int flag = 0; for (size_t i=0 ; i<SIZE ; i++) flag = ((fabs(host_array[i] - dev_array[i]) > FLT_EPSILON) ? 1 : flag); if (!flag) printf("\n\t Result OK \n"); else printf("\n\t Result wrong \n"); return; } int main() { // Host allocation MyData *buffer = (MyData *)malloc(4 * SIZE * sizeof(MyData)); assert(buffer != NULL); MyData *const restrict A = buffer; MyData *const restrict B = A + SIZE; MyData *const restrict C_CPU = B + SIZE; MyData *const restrict C_GPU = C_CPU + SIZE; // Spawning 2 host threads #pragma omp parallel num_threads(2) { // Evaluate the Dgemm on the host #pragma omp single nowait { // allowing nested parallelism omp_set_max_active_levels(2); for (int loop=0 ; loop<LOOP ; loop++) { InitHost(A, B, C_CPU, &thr[0]); HostDgemm(A, B, C_CPU, ALPHA, BETA, &thr[1]); } } // omp single #pragma omp single nowait { // Initialize cuBLAS library cublasHandle_t handle; cublasCreate(&handle); // Allocate A, B, C on the device using CUDA API MyData *d_buffer = NULL; cudaMalloc((void **)&d_buffer, (3 * SIZE * sizeof(MyData))); assert(d_buffer != NULL); MyData *const restrict d_A = d_buffer; MyData *const restrict d_B = d_A + SIZE; MyData *const restrict d_C = d_B + SIZE; // get the default device const int dev = omp_get_default_device(); // Associate external device pointers omp_target_associate_ptr(A, d_A, (SIZE * sizeof(MyData)), 0, dev); omp_target_associate_ptr(B, d_B, (SIZE * sizeof(MyData)), 0, dev); omp_target_associate_ptr(C_GPU, d_C, (SIZE * sizeof(MyData)), 0, dev); for (int loop=0 ; loop<LOOP ; loop++) { // Init device with blocking omp target directive InitDev(A, B, C_GPU); // Apply DGEMM using device pointers directly const MyData alpha = ALPHA; const MyData beta = BETA; double start = thread_time(); cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A, N, d_B, N, &beta, d_C, N); // CUDA synchronization point cudaDeviceSynchronize(); _time[DEV][KERNEL] += (thread_time() - start); // Fetch data from the device and deallocate start = thread_time(); cudaMemcpy(C_GPU, d_C, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost); _time[DEV][DATA] += (thread_time() - start); } // LOOP // release pointer association omp_target_disassociate_ptr(A, dev); omp_target_disassociate_ptr(B, dev); omp_target_disassociate_ptr(C_GPU, dev); // deallocate device's memory cudaFree(d_buffer); cublasDestroy(handle); } // omp single } // synchronization point check(C_CPU, C_GPU); free(buffer); printf("\n\t Matrix size: %d x %d\n", N, N); printf("\n\t Host execution time:"); printf("\n\t\t Init : %lg [s] - threads: %d", _time[HOST][INIT]/LOOP, thr[0]); printf("\n\t\t Dgemm : %lg [s] - threads: %d\n", _time[HOST][KERNEL]/LOOP, thr[1]); printf("\n\t Device execution time:"); printf("\n\t\t Init : %lg [s]", _time[DEV][INIT]/LOOP); printf("\n\t\t Dgemm : %lg [s]", _time[DEV][KERNEL]/LOOP); printf("\n\t\t Fetch data: %lg [s]\n\n", _time[DEV][DATA]/LOOP); return 0; } cuda-omp/hybrid/hybrid_cuda_omp.c→cuda-omp/hybrid/hybrid_omp_cublas.c +268 −0 Original line number Diff line number Diff line //////////////////////////////////////////////////////////////////////////////////////////////////// // // Passing OpenMP data to cuBlas. // Passing OpenMP data to foreign runtime (cuBLAS library). // // Author: David Goz // mail : david.goz@inaf.it Loading @@ -8,11 +8,16 @@ // code tested using nvhpc // // - Compile the code: // $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v // hybrid_cuda_omp.c -o hybrid_cuda_omp -lm -lcudart -lcublas // - using nvc // $ nvc -O3 -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v // hybrid_omp_cuda.c -o hybrid_omp_cuda -lm -lcudart -lcublas // - using clang // $ clang -O3 -v -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda // hybrid_omp_cuda.c -o hybrid_omp_cuda -lm -lcudart -lcublas // // - Run the code: // $ export OMP_TARGET_OFFLOAD=mandatory // $ ./hybrid_cuda_omp // $ ./hybrid_omp_cuda //////////////////////////////////////////////////////////////////////////////////////////////////// #include <cuda_runtime.h> Loading @@ -22,19 +27,61 @@ #include <math.h> #include <stdio.h> #include <stdlib.h> #include <time.h> #include <omp.h> #define N 512 #define N 2048 #define SIZE ((N) * (N)) #define ALPHA 1.0 #define BETA 0.0 #define HOST 0 #define DEV 1 #define LOOP 10 #define INIT 0 #define KERNEL 1 #define DATA 2 typedef double MyData; static double _time[2][3]; static int thr[2]; double process_time() { struct timespec ts; clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9; return ret; } double thread_time() { struct timespec ts; clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9; return ret; } void InitHost(MyData *const restrict A, MyData *const restrict B, MyData *const restrict C) MyData *const restrict C, int *const restrict thr) { double start; #pragma omp parallel { #pragma omp barrier #pragma omp master { //#pragma omp parallel for collapse(2) *thr = omp_get_num_threads(); start = thread_time(); } #pragma omp barrier #pragma omp for collapse(2) for (int i=0 ; i<N ; i++) for (int j=0 ; j<N ; j++) { Loading @@ -42,12 +89,22 @@ void InitHost(MyData *const restrict A, B[(i * N) + j] = 2.0; C[(i * N) + j] = 0.0; } #pragma omp master { _time[HOST][INIT] += (thread_time() - start); } } // omp parallel return; } void InitDev(MyData *const restrict A, MyData *const restrict B, MyData *const restrict C) { const double start = thread_time(); #pragma omp target teams loop collapse(2) for (int i=0 ; i<N ; i++) for (int j=0 ; j<N ; j++) Loading @@ -57,6 +114,8 @@ void InitDev(MyData *const restrict A, C[(i * N) + j] = 0.0; } _time[DEV][INIT] += (thread_time() - start); return; } Loading @@ -64,12 +123,25 @@ void HostDgemm(MyData *const restrict A, MyData *const restrict B, MyData *const restrict C, const MyData alpha, const MyData beta) const MyData beta, int *const restrict thr) { // C = alpha * A * B + beta * C; double start; // naive calculation // #pragma omp parallel for collapse(2) #pragma omp parallel { #pragma omp barrier #pragma omp master { *thr = omp_get_num_threads(); start = thread_time(); } #pragma omp barrier #pragma omp for collapse(2) for (int i=0 ; i<N ; i++) for (int j=0 ; j<N ; j++) { Loading @@ -80,6 +152,12 @@ void HostDgemm(MyData *const restrict A, C[(i * N) + j] = (alpha * sum) + (beta * C[(i * N) + j]); } #pragma omp master { _time[HOST][KERNEL] += (thread_time() - start); } } // omp parallel return; } Loading @@ -91,9 +169,9 @@ void check(MyData *const restrict host_array, flag = ((fabs(host_array[i] - dev_array[i]) > FLT_EPSILON) ? 1 : flag); if (!flag) printf("\n\t Result OK"); printf("\n\t Result OK \n"); else printf("\n\t Result wrong"); printf("\n\t Result wrong \n"); return; } Loading @@ -114,8 +192,14 @@ int main() // Evaluate the Dgemm on the host #pragma omp single nowait { InitHost(A, B, CC); HostDgemm(A, B, CC, ALPHA, BETA); // allowing nested parallelism omp_set_max_active_levels(2); for (int loop=0 ; loop<LOOP ; loop++) { InitHost(A, B, CC, &thr[0]); HostDgemm(A, B, CC, ALPHA, BETA, &thr[1]); } } // omp single #pragma omp single nowait Loading @@ -127,25 +211,39 @@ int main() // Allocate A, B, C on the device #pragma omp target enter data map(alloc: A[0:SIZE], B[0:SIZE], C[0:SIZE]) for (int loop=0 ; loop<LOOP ; loop++) { // Init device with blocking omp target directive InitDev(A, B, C); // Define a target data region where A, B, and C pointers // refer to device's address space #pragma omp target data use_device_addr(A, B, C) #pragma omp target data use_device_ptr(A, B, C) { MyData const alpha = ALPHA; MyData const beta = BETA; const MyData alpha = ALPHA; const MyData beta = BETA; const double start = thread_time(); cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, A, N, B, N, &beta, C, N); &alpha, A, N, B, N, &beta, C, N); // CUDA synchronization point cudaDeviceSynchronize(); _time[DEV][KERNEL] += (thread_time() - start); } // Fetch data from the device and deallocate #pragma omp target exit data map(from: C[0:SIZE]) map(delete: A[0:SIZE], B[0:SIZE]) const double start = thread_time(); #pragma omp target update from (C[0:SIZE]) _time[DEV][DATA] += (thread_time() - start); } // LOOP // deallocate device's memory #pragma omp target exit data map(delete: A[0:SIZE], B[0:SIZE], C[0:SIZE]) cublasDestroy(handle); } // omp single Loading @@ -155,5 +253,16 @@ int main() free(buffer); printf("\n\t Matrix size: %d x %d\n", N, N); printf("\n\t Host execution time:"); printf("\n\t\t Init : %lg [s] - threads: %d", _time[HOST][INIT]/LOOP, thr[0]); printf("\n\t\t Dgemm : %lg [s] - threads: %d\n", _time[HOST][KERNEL]/LOOP, thr[1]); printf("\n\t Device execution time:"); printf("\n\t\t Init : %lg [s]", _time[DEV][INIT]/LOOP); printf("\n\t\t Dgemm : %lg [s]", _time[DEV][KERNEL]/LOOP); printf("\n\t\t Fetch data: %lg [s]\n\n", _time[DEV][DATA]/LOOP); return 0; } cuda-omp/hybrid/hybrid_omp_cuda.cu 0 → 100644 +281 −0 Original line number Diff line number Diff line //////////////////////////////////////////////////////////////////////////////////////////////////// // // Passing OpenMP data to foreign runtime (cuBLAS library). // // Author: David Goz // mail : david.goz@inaf.it // date : 03.09.2024 // code tested using nvhpc // // - Compile the code: // - using nvc // $ nvc++ -O3 -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v // hybrid_omp_cuda.cu -o hybrid_omp_cuda -lm && ./hybrid_omp_cuda // // - using clang // $ clang -O3 -v -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda // hybrid_omp_cuda.c -o hybrid_omp_cuda -lm // // - Run the code: // $ export OMP_TARGET_OFFLOAD=mandatory // $ ./hybrid_omp_cuda //////////////////////////////////////////////////////////////////////////////////////////////////// #include <cuda.h> #include <math.h> #include <assert.h> #include <float.h> #include <stdio.h> #include <stdlib.h> #include <time.h> #include <omp.h> #define N 1024 #define SIZE ((N) * (N)) #define HOST 0 #define DEV 1 #define LOOP 10 #define INIT 0 #define KERNEL 1 #define DATA 2 #define BLOCKSIZE 1024 typedef double MyData; static double _time[2][3]; static int thr[2]; double process_time() { struct timespec ts; clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9; return ret; } double thread_time() { struct timespec ts; clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9; return ret; } void InitHost(MyData *const restrict A, MyData *const restrict B, MyData *const restrict C, int *const restrict thr) { double start; #pragma omp parallel { #pragma omp barrier #pragma omp master { *thr = omp_get_num_threads(); start = thread_time(); } #pragma omp barrier #pragma omp for collapse(2) for (int i=0 ; i<N ; i++) for (int j=0 ; j<N ; j++) { A[(i * N) + j] = 1.0; B[(i * N) + j] = (1.0 / M_PI); C[(i * N) + j] = 0.0; } #pragma omp master { _time[HOST][INIT] += (thread_time() - start); } } // omp parallel return; } void InitDev(MyData *const restrict A, MyData *const restrict B, MyData *const restrict C) { const double start = thread_time(); #pragma omp target teams loop collapse(2) is_device_ptr(A, B, C) for (int i=0 ; i<N ; i++) for (int j=0 ; j<N ; j++) { A[(i * N) + j] = 1.0; B[(i * N) + j] = (1.0 / M_PI); C[(i * N) + j] = 0.0; } _time[DEV][INIT] += (thread_time() - start); return; } void HostMM(MyData *const restrict A, MyData *const restrict B, MyData *const restrict C, int *const restrict thr) { // C = alpha * A * B + beta * C; double start; // naive calculation #pragma omp parallel { #pragma omp barrier #pragma omp master { *thr = omp_get_num_threads(); start = thread_time(); } #pragma omp barrier #pragma omp for collapse(2) for (int i=0 ; i<N ; i++) for (int j=0 ; j<N ; j++) { MyData sum = 0.0; for (int k=0 ; k<N ; k++) sum += A[(i * N) + k] * B[(k * N) + j]; C[(i * N) + j] = sum; } #pragma omp master { _time[HOST][KERNEL] += (thread_time() - start); } } // omp parallel return; } __global__ void DevMM(MyData *const restrict A, MyData *const restrict B, MyData *const restrict C, const int n) { const int size = (n * n); const int globalID = threadIdx.x + (blockIdx.x * blockDim.x); if (globalID >= size) return; const int i = (globalID / N); const int j = (globalID % N); MyData sum = 0.0; for (int k=0 ; k<N ; k++) sum += (A[(i * N) + k] * B[(k * N) + j]); C[(i * N) + j] = sum; return; } void check(MyData *const restrict host_array, MyData *const restrict dev_array) { int flag = 0; for (size_t i=0 ; i<SIZE ; i++) flag = ((fabs(host_array[i] - dev_array[i]) > FLT_EPSILON) ? 1 : flag); if (!flag) printf("\n\t Result OK \n"); else printf("\n\t Result wrong \n"); return; } int main() { // Host allocation MyData *h_buffer = (MyData *)malloc(2 * SIZE * sizeof(MyData)); assert(h_buffer != NULL); MyData *const restrict C_HOST = h_buffer; MyData *const restrict C_DEV = C_HOST + SIZE; // Spawning 2 host threads #pragma omp parallel num_threads(2) { // Evaluate the Dgemm on the host #pragma omp single nowait { // allowing nested parallelism omp_set_max_active_levels(2); MyData *tmp = (MyData *)malloc(2 * SIZE * sizeof(MyData)); MyData *const restrict A = tmp; MyData *const restrict B = A + SIZE; for (int loop=0 ; loop<LOOP ; loop++) { InitHost(A, B, C_HOST, &thr[0]); HostMM(A, B, C_HOST, &thr[1]); } free(tmp); } // omp single #pragma omp single nowait { // Device allocation const int dev = omp_get_default_device(); const int host = omp_get_initial_device(); MyData *d_buffer = (MyData *)omp_target_alloc((3 * SIZE * sizeof(MyData)), dev); assert(d_buffer != NULL); MyData *const restrict d_A = d_buffer; MyData *const restrict d_B = d_A + SIZE; MyData *const restrict d_C = d_B + SIZE; const dim3 nblock = {((SIZE + BLOCKSIZE - 1) / BLOCKSIZE), 1, 1}; const dim3 block = {BLOCKSIZE, 1, 1}; for (int loop=0 ; loop<LOOP ; loop++) { // Init device with blocking omp target directive InitDev(d_A, d_B, d_C); double start = thread_time(); DevMM<<< nblock, block >>>(d_A, d_B, d_C, N); // CUDA synchronization point cudaDeviceSynchronize(); _time[DEV][KERNEL] += (thread_time() - start); // Fetch data from the device and deallocate start = thread_time(); omp_target_memcpy(C_DEV, d_C, (SIZE * sizeof(MyData)), 0, 0, host, dev); _time[DEV][DATA] += (thread_time() - start); } // LOOP // deallocate device's memory omp_target_free(d_buffer, dev); } // omp single } // synchronization point check(C_HOST, C_DEV); free(h_buffer); printf("\n\t Matrix size: %d x %d\n", N, N); printf("\n\t Host execution time:"); printf("\n\t\t Init : %lg [s] - threads: %d", _time[HOST][INIT]/LOOP, thr[0]); printf("\n\t\t Dgemm : %lg [s] - threads: %d\n", _time[HOST][KERNEL]/LOOP, thr[1]); printf("\n\t Device execution time:"); printf("\n\t\t Init : %lg [s]", _time[DEV][INIT]/LOOP); printf("\n\t\t Dgemm : %lg [s]", _time[DEV][KERNEL]/LOOP); printf("\n\t\t Fetch data: %lg [s]\n\n", _time[DEV][DATA]/LOOP); return 0; } cuda-omp/omp/miscellaneous/host_device_address_space.c 0 → 100644 +76 −0 Original line number Diff line number Diff line ////////////////////////////////////////////////////////////////////////////////////////////////// // Author: David Goz // mail : david.goz@inaf.it // date : 03.09.2024 // code tested using nvhpc // // - Compile the code: // - nvc complains about the '#pragma omp target has_device_addr' // - clang works ////////////////////////////////////////////////////////////////////////////////////////////////// #include <stdio.h> #include <stdlib.h> #include <string.h> #include <assert.h> static int answer = 42; #define SIZE 3 int main() { int *A = (int *)malloc(SIZE * sizeof(int)); assert(A != NULL); memset(A, 0, (SIZE * sizeof(int))); printf("\n\t Host address of 'answer' [value: %d] is: %p", answer, &answer); printf("\n\t Host address of 'A' is: %p", A); // Map 'answer' to the device #pragma omp target data map(alloc: answer, A[0:SIZE]) { #pragma omp target update to(answer, A[0:SIZE]) // Update 'answer' on the device #pragma omp target map(answer) { answer += 1; for (int i=0 ; i<SIZE ; i++) A[i] += (i + 1); } #pragma omp target update from(answer, A[0:SIZE]) printf("\n\t Address of 'answer' [value: %d] in target data region is: %p", answer, &answer); printf("\n\t Address of 'A' in target data region is: %p", A); printf("\n\t Value of A:"); for (int i=0 ; i<SIZE ; i++) printf("\n\t\t A[%d] = %d", i, A[i]); printf("\n"); #pragma omp target data use_device_addr(answer) use_device_ptr(A) { #pragma omp target has_device_addr(answer) { answer += 1; for (int i=0 ; i<SIZE ; i++) A[i] += (i + 1); } printf("\n\t Device address of 'answer' is: %p", &answer); printf("\n\t Device address of 'A' is: %p", A); } #pragma omp target update from(answer, A[0:SIZE]) printf("\n\t Last value of 'answer' is: %d", answer); printf("\n\t Last value if 'A' is:"); for (int i=0 ; i<SIZE ; i++) printf("\n\t\t A[%d] = %d", i, A[i]); printf("\n\n"); } // omp target data free(A); return 0; } Loading
cuda-omp/cuda/7/mat_mult_block.cu +2 −2 Original line number Diff line number Diff line Loading @@ -257,9 +257,9 @@ int main() const dim3 block = {(BLOCK * BLOCK), 1, 1}; /////////////////////////// GPU naive block algorithm //////////////////////////////////////// time = 0.0; GPU_mat_mult_block<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up cudaDeviceSynchronize(); time = 0.0; for (unsigned short int loop=0 ; loop<LOOP ; loop++) { const double start = wall_time(); Loading @@ -274,9 +274,9 @@ int main() ///////////////////////////////////////////////////////////////////////////////////// /////////////////////////// GPU block shared memory algorithm /////////////////////// time = 0.0; GPU_mat_mult_block_shared<<< nblocks, block >>>(A_GPU, B_GPU, C_GPU, N); // warm-up cudaDeviceSynchronize(); time = 0.0; for (unsigned short int loop=0 ; loop<LOOP ; loop++) { const double start = wall_time(); Loading
cuda-omp/hybrid/hybrid_cublas_omp.c 0 → 100644 +282 −0 Original line number Diff line number Diff line //////////////////////////////////////////////////////////////////////////////////////////////////// // // Using CUDA data in OpenMP // // Author: David Goz // mail : david.goz@inaf.it // date : 03.09.2024 // code tested using nvhpc // // - Compile the code: // - using nvc // $ nvc -O3 -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v // hybrid_cuda_omp.c -o hybrid_cuda_omp -lm -lcudart -lcublas // - using clang // $ clang -O3 -v -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda // hybrid_cuda_omp.c -o hybrid_cuda_omp -lm -lcudart -lcublas // // - Run the code: // $ export OMP_TARGET_OFFLOAD=mandatory // $ ./hybrid_cuda_omp //////////////////////////////////////////////////////////////////////////////////////////////////// #include <cuda_runtime.h> #include <cublas_v2.h> #include <assert.h> #include <float.h> #include <math.h> #include <stdio.h> #include <stdlib.h> #include <time.h> #include <omp.h> #define N 2048 #define SIZE ((N) * (N)) #define ALPHA 1.0 #define BETA 0.0 #define HOST 0 #define DEV 1 #define LOOP 10 #define INIT 0 #define KERNEL 1 #define DATA 2 typedef double MyData; static double _time[2][3]; static int thr[2]; double process_time() { struct timespec ts; clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9; return ret; } double thread_time() { struct timespec ts; clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9; return ret; } void InitHost(MyData *const restrict A, MyData *const restrict B, MyData *const restrict C, int *const restrict thr) { double start; #pragma omp parallel { #pragma omp barrier #pragma omp master { *thr = omp_get_num_threads(); start = thread_time(); } #pragma omp barrier #pragma omp for collapse(2) for (int i=0 ; i<N ; i++) for (int j=0 ; j<N ; j++) { A[(i * N) + j] = 1.0; B[(i * N) + j] = 2.0; C[(i * N) + j] = 0.0; } #pragma omp master { _time[HOST][INIT] += (thread_time() - start); } } // omp parallel return; } void InitDev(MyData *const restrict A, MyData *const restrict B, MyData *const restrict C) { const double start = thread_time(); #pragma omp target teams loop collapse(2) for (int i=0 ; i<N ; i++) for (int j=0 ; j<N ; j++) { A[(i * N) + j] = 1.0; B[(i * N) + j] = 2.0; C[(i * N) + j] = 0.0; } _time[DEV][INIT] += (thread_time() - start); return; } void HostDgemm(MyData *const restrict A, MyData *const restrict B, MyData *const restrict C, const MyData alpha, const MyData beta, int *const restrict thr) { // C = alpha * A * B + beta * C; double start; // naive calculation #pragma omp parallel { #pragma omp barrier #pragma omp master { *thr = omp_get_num_threads(); start = thread_time(); } #pragma omp barrier #pragma omp for collapse(2) for (int i=0 ; i<N ; i++) for (int j=0 ; j<N ; j++) { MyData sum = 0.0; for (int k=0 ; k<N ; k++) sum += A[(i * N) + k] * B[(k * N) + j]; C[(i * N) + j] = (alpha * sum) + (beta * C[(i * N) + j]); } #pragma omp master { _time[HOST][KERNEL] += (thread_time() - start); } } // omp parallel return; } void check(MyData *const restrict host_array, MyData *const restrict dev_array) { int flag = 0; for (size_t i=0 ; i<SIZE ; i++) flag = ((fabs(host_array[i] - dev_array[i]) > FLT_EPSILON) ? 1 : flag); if (!flag) printf("\n\t Result OK \n"); else printf("\n\t Result wrong \n"); return; } int main() { // Host allocation MyData *buffer = (MyData *)malloc(4 * SIZE * sizeof(MyData)); assert(buffer != NULL); MyData *const restrict A = buffer; MyData *const restrict B = A + SIZE; MyData *const restrict C_CPU = B + SIZE; MyData *const restrict C_GPU = C_CPU + SIZE; // Spawning 2 host threads #pragma omp parallel num_threads(2) { // Evaluate the Dgemm on the host #pragma omp single nowait { // allowing nested parallelism omp_set_max_active_levels(2); for (int loop=0 ; loop<LOOP ; loop++) { InitHost(A, B, C_CPU, &thr[0]); HostDgemm(A, B, C_CPU, ALPHA, BETA, &thr[1]); } } // omp single #pragma omp single nowait { // Initialize cuBLAS library cublasHandle_t handle; cublasCreate(&handle); // Allocate A, B, C on the device using CUDA API MyData *d_buffer = NULL; cudaMalloc((void **)&d_buffer, (3 * SIZE * sizeof(MyData))); assert(d_buffer != NULL); MyData *const restrict d_A = d_buffer; MyData *const restrict d_B = d_A + SIZE; MyData *const restrict d_C = d_B + SIZE; // get the default device const int dev = omp_get_default_device(); // Associate external device pointers omp_target_associate_ptr(A, d_A, (SIZE * sizeof(MyData)), 0, dev); omp_target_associate_ptr(B, d_B, (SIZE * sizeof(MyData)), 0, dev); omp_target_associate_ptr(C_GPU, d_C, (SIZE * sizeof(MyData)), 0, dev); for (int loop=0 ; loop<LOOP ; loop++) { // Init device with blocking omp target directive InitDev(A, B, C_GPU); // Apply DGEMM using device pointers directly const MyData alpha = ALPHA; const MyData beta = BETA; double start = thread_time(); cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A, N, d_B, N, &beta, d_C, N); // CUDA synchronization point cudaDeviceSynchronize(); _time[DEV][KERNEL] += (thread_time() - start); // Fetch data from the device and deallocate start = thread_time(); cudaMemcpy(C_GPU, d_C, (SIZE * sizeof(MyData)), cudaMemcpyDeviceToHost); _time[DEV][DATA] += (thread_time() - start); } // LOOP // release pointer association omp_target_disassociate_ptr(A, dev); omp_target_disassociate_ptr(B, dev); omp_target_disassociate_ptr(C_GPU, dev); // deallocate device's memory cudaFree(d_buffer); cublasDestroy(handle); } // omp single } // synchronization point check(C_CPU, C_GPU); free(buffer); printf("\n\t Matrix size: %d x %d\n", N, N); printf("\n\t Host execution time:"); printf("\n\t\t Init : %lg [s] - threads: %d", _time[HOST][INIT]/LOOP, thr[0]); printf("\n\t\t Dgemm : %lg [s] - threads: %d\n", _time[HOST][KERNEL]/LOOP, thr[1]); printf("\n\t Device execution time:"); printf("\n\t\t Init : %lg [s]", _time[DEV][INIT]/LOOP); printf("\n\t\t Dgemm : %lg [s]", _time[DEV][KERNEL]/LOOP); printf("\n\t\t Fetch data: %lg [s]\n\n", _time[DEV][DATA]/LOOP); return 0; }
cuda-omp/hybrid/hybrid_cuda_omp.c→cuda-omp/hybrid/hybrid_omp_cublas.c +268 −0 Original line number Diff line number Diff line //////////////////////////////////////////////////////////////////////////////////////////////////// // // Passing OpenMP data to cuBlas. // Passing OpenMP data to foreign runtime (cuBLAS library). // // Author: David Goz // mail : david.goz@inaf.it Loading @@ -8,11 +8,16 @@ // code tested using nvhpc // // - Compile the code: // $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v // hybrid_cuda_omp.c -o hybrid_cuda_omp -lm -lcudart -lcublas // - using nvc // $ nvc -O3 -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v // hybrid_omp_cuda.c -o hybrid_omp_cuda -lm -lcudart -lcublas // - using clang // $ clang -O3 -v -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda // hybrid_omp_cuda.c -o hybrid_omp_cuda -lm -lcudart -lcublas // // - Run the code: // $ export OMP_TARGET_OFFLOAD=mandatory // $ ./hybrid_cuda_omp // $ ./hybrid_omp_cuda //////////////////////////////////////////////////////////////////////////////////////////////////// #include <cuda_runtime.h> Loading @@ -22,19 +27,61 @@ #include <math.h> #include <stdio.h> #include <stdlib.h> #include <time.h> #include <omp.h> #define N 512 #define N 2048 #define SIZE ((N) * (N)) #define ALPHA 1.0 #define BETA 0.0 #define HOST 0 #define DEV 1 #define LOOP 10 #define INIT 0 #define KERNEL 1 #define DATA 2 typedef double MyData; static double _time[2][3]; static int thr[2]; double process_time() { struct timespec ts; clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9; return ret; } double thread_time() { struct timespec ts; clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9; return ret; } void InitHost(MyData *const restrict A, MyData *const restrict B, MyData *const restrict C) MyData *const restrict C, int *const restrict thr) { double start; #pragma omp parallel { #pragma omp barrier #pragma omp master { //#pragma omp parallel for collapse(2) *thr = omp_get_num_threads(); start = thread_time(); } #pragma omp barrier #pragma omp for collapse(2) for (int i=0 ; i<N ; i++) for (int j=0 ; j<N ; j++) { Loading @@ -42,12 +89,22 @@ void InitHost(MyData *const restrict A, B[(i * N) + j] = 2.0; C[(i * N) + j] = 0.0; } #pragma omp master { _time[HOST][INIT] += (thread_time() - start); } } // omp parallel return; } void InitDev(MyData *const restrict A, MyData *const restrict B, MyData *const restrict C) { const double start = thread_time(); #pragma omp target teams loop collapse(2) for (int i=0 ; i<N ; i++) for (int j=0 ; j<N ; j++) Loading @@ -57,6 +114,8 @@ void InitDev(MyData *const restrict A, C[(i * N) + j] = 0.0; } _time[DEV][INIT] += (thread_time() - start); return; } Loading @@ -64,12 +123,25 @@ void HostDgemm(MyData *const restrict A, MyData *const restrict B, MyData *const restrict C, const MyData alpha, const MyData beta) const MyData beta, int *const restrict thr) { // C = alpha * A * B + beta * C; double start; // naive calculation // #pragma omp parallel for collapse(2) #pragma omp parallel { #pragma omp barrier #pragma omp master { *thr = omp_get_num_threads(); start = thread_time(); } #pragma omp barrier #pragma omp for collapse(2) for (int i=0 ; i<N ; i++) for (int j=0 ; j<N ; j++) { Loading @@ -80,6 +152,12 @@ void HostDgemm(MyData *const restrict A, C[(i * N) + j] = (alpha * sum) + (beta * C[(i * N) + j]); } #pragma omp master { _time[HOST][KERNEL] += (thread_time() - start); } } // omp parallel return; } Loading @@ -91,9 +169,9 @@ void check(MyData *const restrict host_array, flag = ((fabs(host_array[i] - dev_array[i]) > FLT_EPSILON) ? 1 : flag); if (!flag) printf("\n\t Result OK"); printf("\n\t Result OK \n"); else printf("\n\t Result wrong"); printf("\n\t Result wrong \n"); return; } Loading @@ -114,8 +192,14 @@ int main() // Evaluate the Dgemm on the host #pragma omp single nowait { InitHost(A, B, CC); HostDgemm(A, B, CC, ALPHA, BETA); // allowing nested parallelism omp_set_max_active_levels(2); for (int loop=0 ; loop<LOOP ; loop++) { InitHost(A, B, CC, &thr[0]); HostDgemm(A, B, CC, ALPHA, BETA, &thr[1]); } } // omp single #pragma omp single nowait Loading @@ -127,25 +211,39 @@ int main() // Allocate A, B, C on the device #pragma omp target enter data map(alloc: A[0:SIZE], B[0:SIZE], C[0:SIZE]) for (int loop=0 ; loop<LOOP ; loop++) { // Init device with blocking omp target directive InitDev(A, B, C); // Define a target data region where A, B, and C pointers // refer to device's address space #pragma omp target data use_device_addr(A, B, C) #pragma omp target data use_device_ptr(A, B, C) { MyData const alpha = ALPHA; MyData const beta = BETA; const MyData alpha = ALPHA; const MyData beta = BETA; const double start = thread_time(); cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, A, N, B, N, &beta, C, N); &alpha, A, N, B, N, &beta, C, N); // CUDA synchronization point cudaDeviceSynchronize(); _time[DEV][KERNEL] += (thread_time() - start); } // Fetch data from the device and deallocate #pragma omp target exit data map(from: C[0:SIZE]) map(delete: A[0:SIZE], B[0:SIZE]) const double start = thread_time(); #pragma omp target update from (C[0:SIZE]) _time[DEV][DATA] += (thread_time() - start); } // LOOP // deallocate device's memory #pragma omp target exit data map(delete: A[0:SIZE], B[0:SIZE], C[0:SIZE]) cublasDestroy(handle); } // omp single Loading @@ -155,5 +253,16 @@ int main() free(buffer); printf("\n\t Matrix size: %d x %d\n", N, N); printf("\n\t Host execution time:"); printf("\n\t\t Init : %lg [s] - threads: %d", _time[HOST][INIT]/LOOP, thr[0]); printf("\n\t\t Dgemm : %lg [s] - threads: %d\n", _time[HOST][KERNEL]/LOOP, thr[1]); printf("\n\t Device execution time:"); printf("\n\t\t Init : %lg [s]", _time[DEV][INIT]/LOOP); printf("\n\t\t Dgemm : %lg [s]", _time[DEV][KERNEL]/LOOP); printf("\n\t\t Fetch data: %lg [s]\n\n", _time[DEV][DATA]/LOOP); return 0; }
cuda-omp/hybrid/hybrid_omp_cuda.cu 0 → 100644 +281 −0 Original line number Diff line number Diff line //////////////////////////////////////////////////////////////////////////////////////////////////// // // Passing OpenMP data to foreign runtime (cuBLAS library). // // Author: David Goz // mail : david.goz@inaf.it // date : 03.09.2024 // code tested using nvhpc // // - Compile the code: // - using nvc // $ nvc++ -O3 -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v // hybrid_omp_cuda.cu -o hybrid_omp_cuda -lm && ./hybrid_omp_cuda // // - using clang // $ clang -O3 -v -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda // hybrid_omp_cuda.c -o hybrid_omp_cuda -lm // // - Run the code: // $ export OMP_TARGET_OFFLOAD=mandatory // $ ./hybrid_omp_cuda //////////////////////////////////////////////////////////////////////////////////////////////////// #include <cuda.h> #include <math.h> #include <assert.h> #include <float.h> #include <stdio.h> #include <stdlib.h> #include <time.h> #include <omp.h> #define N 1024 #define SIZE ((N) * (N)) #define HOST 0 #define DEV 1 #define LOOP 10 #define INIT 0 #define KERNEL 1 #define DATA 2 #define BLOCKSIZE 1024 typedef double MyData; static double _time[2][3]; static int thr[2]; double process_time() { struct timespec ts; clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9; return ret; } double thread_time() { struct timespec ts; clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9; return ret; } void InitHost(MyData *const restrict A, MyData *const restrict B, MyData *const restrict C, int *const restrict thr) { double start; #pragma omp parallel { #pragma omp barrier #pragma omp master { *thr = omp_get_num_threads(); start = thread_time(); } #pragma omp barrier #pragma omp for collapse(2) for (int i=0 ; i<N ; i++) for (int j=0 ; j<N ; j++) { A[(i * N) + j] = 1.0; B[(i * N) + j] = (1.0 / M_PI); C[(i * N) + j] = 0.0; } #pragma omp master { _time[HOST][INIT] += (thread_time() - start); } } // omp parallel return; } void InitDev(MyData *const restrict A, MyData *const restrict B, MyData *const restrict C) { const double start = thread_time(); #pragma omp target teams loop collapse(2) is_device_ptr(A, B, C) for (int i=0 ; i<N ; i++) for (int j=0 ; j<N ; j++) { A[(i * N) + j] = 1.0; B[(i * N) + j] = (1.0 / M_PI); C[(i * N) + j] = 0.0; } _time[DEV][INIT] += (thread_time() - start); return; } void HostMM(MyData *const restrict A, MyData *const restrict B, MyData *const restrict C, int *const restrict thr) { // C = alpha * A * B + beta * C; double start; // naive calculation #pragma omp parallel { #pragma omp barrier #pragma omp master { *thr = omp_get_num_threads(); start = thread_time(); } #pragma omp barrier #pragma omp for collapse(2) for (int i=0 ; i<N ; i++) for (int j=0 ; j<N ; j++) { MyData sum = 0.0; for (int k=0 ; k<N ; k++) sum += A[(i * N) + k] * B[(k * N) + j]; C[(i * N) + j] = sum; } #pragma omp master { _time[HOST][KERNEL] += (thread_time() - start); } } // omp parallel return; } __global__ void DevMM(MyData *const restrict A, MyData *const restrict B, MyData *const restrict C, const int n) { const int size = (n * n); const int globalID = threadIdx.x + (blockIdx.x * blockDim.x); if (globalID >= size) return; const int i = (globalID / N); const int j = (globalID % N); MyData sum = 0.0; for (int k=0 ; k<N ; k++) sum += (A[(i * N) + k] * B[(k * N) + j]); C[(i * N) + j] = sum; return; } void check(MyData *const restrict host_array, MyData *const restrict dev_array) { int flag = 0; for (size_t i=0 ; i<SIZE ; i++) flag = ((fabs(host_array[i] - dev_array[i]) > FLT_EPSILON) ? 1 : flag); if (!flag) printf("\n\t Result OK \n"); else printf("\n\t Result wrong \n"); return; } int main() { // Host allocation MyData *h_buffer = (MyData *)malloc(2 * SIZE * sizeof(MyData)); assert(h_buffer != NULL); MyData *const restrict C_HOST = h_buffer; MyData *const restrict C_DEV = C_HOST + SIZE; // Spawning 2 host threads #pragma omp parallel num_threads(2) { // Evaluate the Dgemm on the host #pragma omp single nowait { // allowing nested parallelism omp_set_max_active_levels(2); MyData *tmp = (MyData *)malloc(2 * SIZE * sizeof(MyData)); MyData *const restrict A = tmp; MyData *const restrict B = A + SIZE; for (int loop=0 ; loop<LOOP ; loop++) { InitHost(A, B, C_HOST, &thr[0]); HostMM(A, B, C_HOST, &thr[1]); } free(tmp); } // omp single #pragma omp single nowait { // Device allocation const int dev = omp_get_default_device(); const int host = omp_get_initial_device(); MyData *d_buffer = (MyData *)omp_target_alloc((3 * SIZE * sizeof(MyData)), dev); assert(d_buffer != NULL); MyData *const restrict d_A = d_buffer; MyData *const restrict d_B = d_A + SIZE; MyData *const restrict d_C = d_B + SIZE; const dim3 nblock = {((SIZE + BLOCKSIZE - 1) / BLOCKSIZE), 1, 1}; const dim3 block = {BLOCKSIZE, 1, 1}; for (int loop=0 ; loop<LOOP ; loop++) { // Init device with blocking omp target directive InitDev(d_A, d_B, d_C); double start = thread_time(); DevMM<<< nblock, block >>>(d_A, d_B, d_C, N); // CUDA synchronization point cudaDeviceSynchronize(); _time[DEV][KERNEL] += (thread_time() - start); // Fetch data from the device and deallocate start = thread_time(); omp_target_memcpy(C_DEV, d_C, (SIZE * sizeof(MyData)), 0, 0, host, dev); _time[DEV][DATA] += (thread_time() - start); } // LOOP // deallocate device's memory omp_target_free(d_buffer, dev); } // omp single } // synchronization point check(C_HOST, C_DEV); free(h_buffer); printf("\n\t Matrix size: %d x %d\n", N, N); printf("\n\t Host execution time:"); printf("\n\t\t Init : %lg [s] - threads: %d", _time[HOST][INIT]/LOOP, thr[0]); printf("\n\t\t Dgemm : %lg [s] - threads: %d\n", _time[HOST][KERNEL]/LOOP, thr[1]); printf("\n\t Device execution time:"); printf("\n\t\t Init : %lg [s]", _time[DEV][INIT]/LOOP); printf("\n\t\t Dgemm : %lg [s]", _time[DEV][KERNEL]/LOOP); printf("\n\t\t Fetch data: %lg [s]\n\n", _time[DEV][DATA]/LOOP); return 0; }
cuda-omp/omp/miscellaneous/host_device_address_space.c 0 → 100644 +76 −0 Original line number Diff line number Diff line ////////////////////////////////////////////////////////////////////////////////////////////////// // Author: David Goz // mail : david.goz@inaf.it // date : 03.09.2024 // code tested using nvhpc // // - Compile the code: // - nvc complains about the '#pragma omp target has_device_addr' // - clang works ////////////////////////////////////////////////////////////////////////////////////////////////// #include <stdio.h> #include <stdlib.h> #include <string.h> #include <assert.h> static int answer = 42; #define SIZE 3 int main() { int *A = (int *)malloc(SIZE * sizeof(int)); assert(A != NULL); memset(A, 0, (SIZE * sizeof(int))); printf("\n\t Host address of 'answer' [value: %d] is: %p", answer, &answer); printf("\n\t Host address of 'A' is: %p", A); // Map 'answer' to the device #pragma omp target data map(alloc: answer, A[0:SIZE]) { #pragma omp target update to(answer, A[0:SIZE]) // Update 'answer' on the device #pragma omp target map(answer) { answer += 1; for (int i=0 ; i<SIZE ; i++) A[i] += (i + 1); } #pragma omp target update from(answer, A[0:SIZE]) printf("\n\t Address of 'answer' [value: %d] in target data region is: %p", answer, &answer); printf("\n\t Address of 'A' in target data region is: %p", A); printf("\n\t Value of A:"); for (int i=0 ; i<SIZE ; i++) printf("\n\t\t A[%d] = %d", i, A[i]); printf("\n"); #pragma omp target data use_device_addr(answer) use_device_ptr(A) { #pragma omp target has_device_addr(answer) { answer += 1; for (int i=0 ; i<SIZE ; i++) A[i] += (i + 1); } printf("\n\t Device address of 'answer' is: %p", &answer); printf("\n\t Device address of 'A' is: %p", A); } #pragma omp target update from(answer, A[0:SIZE]) printf("\n\t Last value of 'answer' is: %d", answer); printf("\n\t Last value if 'A' is:"); for (int i=0 ; i<SIZE ; i++) printf("\n\t\t A[%d] = %d", i, A[i]); printf("\n\n"); } // omp target data free(A); return 0; }