#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <assert.h>

typedef int MyData;

#define N_PER_DEV 1000000
#define BLOCKSIZE 256

#if (BLOCKSIZE < 32) || (BLOCKSIZE > 1024)
#error "32 <= BLOCKSIZE <= 1024"
#endif

#if (N_PER_DEV < BLOCKSIZE)
#error "N_PER_DEV < BLOCKSIZE"
#endif

/* Defined here (after <assert.h>) so the assert() calls below stay active
   while the verbose debug printfs guarded by !defined(NDEBUG) are compiled
   out. NOTE(review): NDEBUG is the standard assert-disabling macro — if the
   include order ever changes, every assert vanishes silently. A
   project-specific name (e.g. VERBOSE_DEBUG) would be safer. */
#define NDEBUG

/* Compare the CPU reference vector against the GPU result element-wise
   and print a one-line verdict.
   vector_cpu / vector_gpu: arrays of 'size' elements; must not alias. */
void check(const MyData *const restrict vector_cpu,
           const MyData *const restrict vector_gpu,
           const size_t size)
{
  int flag = 0;
  for (size_t i = 0; i < size; i++)
    {
#if !defined(NDEBUG)
      printf("\n\t vector_cpu[%zu] = %d - vector_gpu[%zu] = %d",
             i, vector_cpu[i], i, vector_gpu[i]);
#endif
      flag = ((vector_cpu[i] != vector_gpu[i]) ? 1 : flag);
    }

  if (flag)
    printf("\n\t Result wrong \n");
  else
    printf("\n\t Result OK \n");

  return;
}

/* Offload C[offset : offset+size) = A + B to device 'dev', launching
   'nblocks' teams of up to BLOCKSIZE threads; each team covers one
   BLOCKSIZE-wide slice of the segment. */
void VectorAdd(const MyData *const restrict A,
               const MyData *const restrict B,
               MyData       *const restrict C,
               const int offset,
               const int size,
               const int dev,
               const int nblocks)
{
#pragma omp target                                                  \
  teams num_teams(nblocks) thread_limit(BLOCKSIZE)                  \
  map(to: A[offset:size], B[offset:size]) map(from: C[offset:size]) \
  device(dev)
  {
    const int team             = omp_get_team_num();
    const int team_start_index = (team * BLOCKSIZE) + offset;
    /* BUG FIX: nblocks = ceil(size / BLOCKSIZE), so when size is not a
       multiple of BLOCKSIZE the last team's nominal end
       (team_start_index + BLOCKSIZE) runs past the mapped segment —
       an out-of-bounds write. Clamp to the segment end. */
    const int segment_end      = offset + size;
    const int nominal_end      = team_start_index + BLOCKSIZE;
    const int team_end_index   = (nominal_end < segment_end) ? nominal_end
                                                             : segment_end;

#pragma omp parallel num_threads(BLOCKSIZE)
    {
      const int localID  = omp_get_thread_num();
      const int block    = omp_get_num_threads(); /* actual team width */
      const int globalID = team_start_index + localID;

      /* Stride by the team's actual thread count in case fewer than
         BLOCKSIZE threads were granted. */
      for (int index = globalID; index < team_end_index; index += block)
        C[index] = A[index] + B[index];

#if !defined(NDEBUG)
      if ((localID == 0) && (team == 0))
        printf("\n\t Device: %d - Teams: %d [requested: %d] - Thread per team: %d [requested: %d]",
               dev, omp_get_num_teams(), nblocks, block, BLOCKSIZE);
#endif
    } // omp parallel
  } // omp target

  return;
}

int main()
{
  /* One host thread will drive each available offload device. */
  const int NumDev = omp_get_num_devices();

  /* Global vector size: N_PER_DEV elements per device. */
  const int size = (NumDev * N_PER_DEV);
  assert(size > 0);

  /* Single allocation partitioned into A, B, C_CPU, C_GPU. */
  MyData *buffer = malloc(4 * (size_t)size * sizeof *buffer);
  assert(buffer != NULL);

  MyData *const restrict A     = buffer;
  MyData *const restrict B     = A + size;
  MyData *const restrict C_CPU = B + size;
  MyData *const restrict C_GPU = C_CPU + size;

  /* BUG FIX: rand() keeps hidden global state and is not required to be
     thread-safe — calling it from '#pragma omp parallel for simd' is a
     data race (and non-deterministic). Initialize serially; the host
     reference result C_CPU is computed in the same pass. */
  for (int i = 0; i < size; i++)
    {
      A[i]     = rand() % N_PER_DEV;
      B[i]     = rand() % N_PER_DEV;
      C_CPU[i] = A[i] + B[i];
    }

#pragma omp parallel num_threads(NumDev)
  {
    /* Verify we actually obtained one host thread per device. */
#pragma omp single
    {
      if (NumDev != omp_get_num_threads())
        exit(EXIT_FAILURE);
      else
        {
          printf("\n\t Using %d GPUs \n", NumDev);
          fflush(stdout);
        }
    } // implicit barrier

    const int tid     = omp_get_thread_num();
    const int offset  = (tid * N_PER_DEV);       /* this device's segment */
    const int nblocks = ((N_PER_DEV + BLOCKSIZE - 1) / BLOCKSIZE);

    VectorAdd(A, B, C_GPU, offset, N_PER_DEV, tid, nblocks);
  } // omp parallel

  check(C_CPU, C_GPU, size);

  free(buffer);

  return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <assert.h>

typedef int MyData;

#define N_PER_DEV 1000000
#define BLOCKSIZE 256

#if (BLOCKSIZE < 32) || (BLOCKSIZE > 1024)
#error "32 <= BLOCKSIZE <= 1024"
#endif

#if (N_PER_DEV < BLOCKSIZE)
#error "N_PER_DEV < BLOCKSIZE"
#endif

/* Defined here (after <assert.h>) so the assert() calls below stay active
   while the verbose debug printfs guarded by !defined(NDEBUG) are compiled
   out. NOTE(review): NDEBUG is the standard assert-disabling macro — if the
   include order ever changes, every assert vanishes silently. A
   project-specific name (e.g. VERBOSE_DEBUG) would be safer. */
#define NDEBUG

/* Compare the CPU reference vector against the GPU result element-wise
   and print a one-line verdict.
   vector_cpu / vector_gpu: arrays of 'size' elements; must not alias. */
void check(const MyData *const restrict vector_cpu,
           const MyData *const restrict vector_gpu,
           const size_t size)
{
  int flag = 0;
  for (size_t i = 0; i < size; i++)
    {
#if !defined(NDEBUG)
      printf("\n\t vector_cpu[%zu] = %d - vector_gpu[%zu] = %d",
             i, vector_cpu[i], i, vector_gpu[i]);
#endif
      flag = ((vector_cpu[i] != vector_gpu[i]) ? 1 : flag);
    }

  if (flag)
    printf("\n\t Result wrong \n");
  else
    printf("\n\t Result OK \n");

  return;
}

/* Offload C[offset : offset+size) = A + B to device 'dev', launching
   'nblocks' teams of up to BLOCKSIZE threads; each team covers one
   BLOCKSIZE-wide slice of the segment. */
void VectorAdd(const MyData *const restrict A,
               const MyData *const restrict B,
               MyData       *const restrict C,
               const int offset,
               const int size,
               const int dev,
               const int nblocks)
{
#pragma omp target                                                  \
  teams num_teams(nblocks) thread_limit(BLOCKSIZE)                  \
  map(to: A[offset:size], B[offset:size]) map(from: C[offset:size]) \
  device(dev)
  {
    const int team             = omp_get_team_num();
    const int team_start_index = (team * BLOCKSIZE) + offset;
    /* BUG FIX: nblocks = ceil(size / BLOCKSIZE), so when size is not a
       multiple of BLOCKSIZE the last team's nominal end
       (team_start_index + BLOCKSIZE) runs past the mapped segment —
       an out-of-bounds write. Clamp to the segment end. */
    const int segment_end      = offset + size;
    const int nominal_end      = team_start_index + BLOCKSIZE;
    const int team_end_index   = (nominal_end < segment_end) ? nominal_end
                                                             : segment_end;

#pragma omp parallel num_threads(BLOCKSIZE)
    {
      const int localID  = omp_get_thread_num();
      const int block    = omp_get_num_threads(); /* actual team width */
      const int globalID = team_start_index + localID;

      /* Stride by the team's actual thread count in case fewer than
         BLOCKSIZE threads were granted. */
      for (int index = globalID; index < team_end_index; index += block)
        C[index] = A[index] + B[index];

#if !defined(NDEBUG)
      if ((localID == 0) && (team == 0))
        printf("\n\t Device: %d - Teams: %d [requested: %d] - Thread per team: %d [requested: %d]",
               dev, omp_get_num_teams(), nblocks, block, BLOCKSIZE);
#endif
    } // omp parallel
  } // omp target

  return;
}

int main()
{
  /* One host thread will drive each available offload device. */
  const int NumDev = omp_get_num_devices();

  /* Global vector size: N_PER_DEV elements per device. */
  const int size = (NumDev * N_PER_DEV);
  assert(size > 0);

  /* Single allocation partitioned into A, B, C_CPU, C_GPU. */
  MyData *buffer = malloc(4 * (size_t)size * sizeof *buffer);
  assert(buffer != NULL);

  MyData *const restrict A     = buffer;
  MyData *const restrict B     = A + size;
  MyData *const restrict C_CPU = B + size;
  MyData *const restrict C_GPU = C_CPU + size;

  /* BUG FIX: rand() keeps hidden global state and is not required to be
     thread-safe — calling it from '#pragma omp parallel for simd' is a
     data race (and non-deterministic). Initialize serially; the host
     reference result C_CPU is computed in the same pass. */
  for (int i = 0; i < size; i++)
    {
      A[i]     = rand() % N_PER_DEV;
      B[i]     = rand() % N_PER_DEV;
      C_CPU[i] = A[i] + B[i];
    }

#pragma omp parallel num_threads(NumDev)
  {
    /* Verify we actually obtained one host thread per device. */
#pragma omp single
    {
      if (NumDev != omp_get_num_threads())
        exit(EXIT_FAILURE);
      else
        {
          printf("\n\t Using %d GPUs \n", NumDev);
          fflush(stdout);
        }
    } // implicit barrier

    const int tid     = omp_get_thread_num();
    const int offset  = (tid * N_PER_DEV);       /* this device's segment */
    const int nblocks = ((N_PER_DEV + BLOCKSIZE - 1) / BLOCKSIZE);

    VectorAdd(A, B, C_GPU, offset, N_PER_DEV, tid, nblocks);
  } // omp parallel

  check(C_CPU, C_GPU, size);

  free(buffer);

  return 0;
}