Loading cuda-omp/omp/7/mat_mult.c 0 → 100644 +185 −0 Original line number Diff line number Diff line //////////////////////////////////////////////////////////////////////////////////////////////// // - Naive matrix multiplication algorithm // for (size_t i=0 ; i<N ; i++) // for (size_t j=0 ; j<N ; j++) // for (size_t k=0 ; k<_N ; k++) // C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j]; // //////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////// // Author: David Goz // mail : david.goz@inaf.it // date : 31.07.2024 // code tested using nvhpc // // - Compile the code: // $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork.c -o classwork_omp // - Run the code: // $ ./classwork_omp ////////////////////////////////////////////////////////////////////////////////////////////////// #include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <time.h> #include <assert.h> #include <omp.h> #include <string.h> #define N 512 #define SIZE (N * N) // matrix size typedef double MyData; // do not change #define LOOP 100 #define NDEBUG double wall_time() { struct timespec ts; clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9; return ret; } void CPU_mat_mult(const MyData *const restrict A, const MyData *const restrict B, MyData *const restrict C, const size_t size) { for (size_t i=0 ; i<size ; i++) for (size_t j=0 ; j<size ; j++) for (size_t k=0 ; k<size ; k++) C[(i * size) + j] += (A[(i * size) + k] * B[(k * size) + j]); return; } void GPU_mat_mult(const MyData *const restrict A, const MyData *const restrict B, MyData *const restrict C, const size_t size) { #pragma omp target { #pragma omp teams distribute num_teams(size) for (size_t i=0 ; i<size ; i++) { #pragma omp parallel for num_threads(size) for (size_t j=0 ; j<size ; j++) { MyData value = (MyData)0; for (size_t k=0 ; k<size ; k++) value += (A[(i * size) + k] * B[(k * size) + j]); C[(i * size) + j] = value; } // omp thread } // omp teams } // omp target return; } void GPU_mat_mult_no_loops(const MyData *const restrict A, const MyData *const restrict B, MyData *const restrict C, const size_t size) { #pragma omp target { #pragma omp teams num_teams(size) { const size_t team_size = (size * omp_get_team_num()); #pragma omp parallel firstprivate(team_size) num_threads(size) { const size_t tid = omp_get_thread_num(); MyData value = (MyData)0; for (size_t k=0 ; k<size ; k++) value += (A[team_size + k] * B[(k * size) + tid]); C[team_size + tid] = value; } // omp threads } // omp teams } // omp target return; } void check(const MyData *const __restrict__ cpu_matrix, const MyData *const __restrict__ gpu_matrix) { int flag; for (size_t i=0 ; i<SIZE ; i++) flag = ((cpu_matrix[i] != gpu_matrix[i]) ? 1 : 0); if (!flag) printf("\n\t Result OK"); else printf("\n\t Result wrong"); return; } int main() { double time; MyData *buffer = (MyData *)calloc(4 * SIZE, sizeof(MyData)); assert(buffer != NULL); // host reference matrix A MyData *const restrict A = buffer; MyData *const restrict B = A + SIZE; MyData *const restrict C_CPU = B + SIZE; MyData *const restrict C_GPU = C_CPU + SIZE; for (size_t i=0 ; i<SIZE ; i++) { A[i] = drand48(); B[i] = drand48(); } ////////////////////////// CPU naive algorithm ////////////////////////////////////////// CPU_mat_mult(A, B, C_CPU, N); ///////////////////////////////////////////////////////////////////////////////////////// // copy/alloc data to the GPU #pragma omp target enter data map(to: A[0:SIZE], B[0:SIZE]) map(alloc: C_GPU[0:SIZE]) /////////////////////////// GPU naive algorithm //////////////////////////////////////// time = 0.0; for (unsigned short int loop=0 ; loop<LOOP ; loop++) { const double start = wall_time(); GPU_mat_mult(A, B, C_GPU, N); time += (wall_time() - start); } #pragma omp target update from(C_GPU[0:SIZE]) check(C_CPU, C_GPU); printf("\n\t GPU naive time %lg [s]\n", (time / LOOP)); //////////////////////////////////////////////////////////////////////////////// /////////////////////////// GPU naive no loops algorithm //////////////////////////// time = 0.0; for (unsigned short int loop=0 ; loop<LOOP ; loop++) { const double start = wall_time(); GPU_mat_mult_no_loops(A, B, C_GPU, N); time += (wall_time() - start); } #pragma omp target update from(C_GPU[0:SIZE]) check(C_CPU, C_GPU); printf("\n\t GPU naive no loops time %lg [s]\n", (time / LOOP)); //////////////////////////////////////////////////////////////////////////////// // free CPU memory free(buffer); // free GPU memory #pragma omp target exit data map(delete: A[0:SIZE], B[0:SIZE], C_CPU[0:SIZE]) printf("\n"); return EXIT_SUCCESS; } cuda-omp/omp/7/mat_mult_block.c 0 → 100644 +208 −0 Original line number Diff line number Diff line //////////////////////////////////////////////////////////////////////////////////////////////// // - Block matrix multiplication algorithm // // const size_t Nblocks = (N / Bsize); // // // loop over blocks of matrix C // for (size_t ib=0 ; ib<Nblocks ; ib++) // { // for (size_t jb=0 ; jb<Nblocks ; jb++) // { // // // loop over blocks of rows of A // for (size_t kb=0 ; kb<Nblocks ; kb++) // // // // for (size_t i=0 ; i<N ; i++) // for (size_t j=0 ; j<N ; j++) // for (size_t k=0 ; k<_N ; k++) // C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j]; // // } // jb // } // ib // - Exploit shared-memory. //////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////// // Author: David Goz // mail : david.goz@inaf.it // date : 31.07.2024 // code tested using nvhpc // // - Compile the code: // $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork.c -o classwork_omp // - Run the code: // $ ./classwork_omp ////////////////////////////////////////////////////////////////////////////////////////////////// #include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <time.h> #include <assert.h> #include <omp.h> #include <string.h> #define N 512 #define SIZE (N * N) // matrix size typedef double MyData; // do not change #define BLOCKSIZE 32 // number of threads per block // sanity check #if BLOCKSIZE > 1024 #error BLOCKSIZE cannot be larger than 1024 #endif #define LOOP 100 #define NDEBUG double wall_time() { struct timespec ts; clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9; return ret; } void CPU_mat_mult(const MyData *const restrict A, const MyData *const restrict B, MyData *const restrict C, const size_t size) { for (size_t i=0 ; i<size ; i++) for (size_t j=0 ; j<size ; j++) for (size_t k=0 ; k<size ; k++) C[(i * size) + j] += (A[(i * size) + k] * B[(k * size) + j]); return; } void GPU_mat_mult(const MyData *const restrict A, const MyData *const restrict B, MyData *const restrict C, const size_t size) { #pragma omp target { #pragma omp teams distribute num_teams(size) for (size_t i=0 ; i<size ; i++) { #pragma omp parallel for num_threads(size) for (size_t j=0 ; j<size ; j++) { MyData value = (MyData)0; for (size_t k=0 ; k<size ; k++) value += (A[(i * size) + k] * B[(k * size) + j]); C[(i * size) + j] = value; } // omp thread } // omp teams } // omp target return; } void GPU_mat_mult_no_loops(const MyData *const restrict A, const MyData *const restrict B, MyData *const restrict C, const size_t size) { #pragma omp target { #pragma omp teams num_teams(size) { const size_t team_size = (size * omp_get_team_num()); #pragma omp parallel firstprivate(team_size) num_threads(size) { const size_t tid = omp_get_thread_num(); MyData value = (MyData)0; for (size_t k=0 ; k<size ; k++) value += (A[team_size + k] * B[(k * size) + tid]); C[team_size + tid] = value; } // omp threads } // omp teams } // omp target return; } void check(const MyData *const __restrict__ cpu_matrix, const MyData *const __restrict__ gpu_matrix) { int flag; for (size_t i=0 ; i<SIZE ; i++) flag = ((cpu_matrix[i] != gpu_matrix[i]) ? 1 : 0); if (!flag) printf("\n\t Result OK"); else printf("\n\t Result wrong"); return; } int main() { double time; MyData *buffer = (MyData *)calloc(4 * SIZE, sizeof(MyData)); assert(buffer != NULL); // host reference matrix A MyData *const restrict A = buffer; MyData *const restrict B = A + SIZE; MyData *const restrict C_CPU = B + SIZE; MyData *const restrict C_GPU = C_CPU + SIZE; for (size_t i=0 ; i<SIZE ; i++) { A[i] = drand48(); B[i] = drand48(); } ////////////////////////// CPU naive algorithm ////////////////////////////////////////// CPU_mat_mult(A, B, C_CPU, N); ///////////////////////////////////////////////////////////////////////////////////////// // copy/alloc data to the GPU #pragma omp target enter data map(to: A[0:SIZE], B[0:SIZE]) map(alloc: C_GPU[0:SIZE]) /////////////////////////// GPU naive algorithm //////////////////////////////////////// time = 0.0; for (unsigned short int loop=0 ; loop<LOOP ; loop++) { const double start = wall_time(); GPU_mat_mult(A, B, C_GPU, N); time += (wall_time() - start); } #pragma omp target update from(C_GPU[0:SIZE]) check(C_CPU, C_GPU); printf("\n\t GPU naive time %lg [s]\n", (time / LOOP)); //////////////////////////////////////////////////////////////////////////////// /////////////////////////// GPU naive no loops algorithm //////////////////////////// time = 0.0; for (unsigned short int loop=0 ; loop<LOOP ; loop++) { const double start = wall_time(); GPU_mat_mult_no_loops(A, B, C_GPU, N); time += (wall_time() - start); } #pragma omp target update from(C_GPU[0:SIZE]) check(C_CPU, C_GPU); printf("\n\t GPU naive no loops time %lg [s]\n", (time / LOOP)); //////////////////////////////////////////////////////////////////////////////// // free CPU memory free(buffer); // free GPU memory #pragma omp target exit data map(delete: A[0:SIZE], B[0:SIZE], C_CPU[0:SIZE]) printf("\n"); return EXIT_SUCCESS; } cuda-omp/omp/miscellaneous/structure.c +34 −25 Original line number Diff line number Diff line Loading @@ -17,7 +17,8 @@ #include <assert.h> #include <omp.h> #define N 8 #define SIZE 8 #define SIZE_2 (SIZE / 2) typedef double MyData; typedef struct my_span Loading @@ -29,26 +30,20 @@ typedef struct my_span void allocate( span *my_struct, const size_t size) { span tmp; /* allocate the buffer on the host memory */ tmp.ptr = (MyData *)malloc(size * sizeof(MyData)); assert(tmp.ptr != NULL); tmp.N = size; /* declare how the object 'span' has to be mapped on the device */ #pragma omp declare mapper(span tmp) map(from: tmp, tmp.ptr[0: tmp.N]) my_struct->ptr = tmp.ptr; my_struct->N = tmp.N; my_struct->ptr = (MyData *)calloc(size, sizeof(MyData)); assert(my_struct->ptr != NULL); my_struct->N = size; return; } void print(const span *const A) void print(const span *const A, const char *const string) { printf("\n"); for (size_t i=0 ; i<A->N ; i++) printf("\n\t array[%i] = %lg", i, A->ptr[i]); for (int i=0 ; i<A->N ; i++) printf("\n\t %s[%d] = %lg", string, i, A->ptr[i]); printf("\n\n"); return; Loading @@ -56,27 +51,41 @@ void print(const span *const A) int main() { span A, B; span A, B, C; allocate(&A, N); allocate(&B, N); allocate(&A, SIZE); allocate(&B, SIZE); allocate(&C, SIZE); /* declare how the object 'span' has to be mapped on the device */ #pragma omp declare mapper(all : span S) map(to: S) map(from: S.ptr[0 : S.N]) #pragma omp declare mapper(left : span S) map(to: S) map(from: S.ptr[0 : S.N/2]) #pragma omp declare mapper(right: span S) map(to: S) map(from: S.ptr[S.N/2: S.N/2]) /* init on the GPU */ #pragma omp target { for (size_t i=0 ; i<N ; i++) #pragma omp target map(mapper(all): A) map(mapper(left): B) map(mapper(right): C) { #pragma omp loop for (size_t i=0 ; i<SIZE ; i++) A.ptr[i] = (MyData)(i); B.ptr[i] = (MyData)(2 * i); } #pragma omp loop for (size_t ii=0 ; ii<SIZE_2 ; ii++) B.ptr[ii] = (MyData)(ii); #pragma omp loop for (size_t iii=SIZE_2 ; iii<SIZE ; iii++) C.ptr[iii] = (MyData)(iii); } print(&A); print(&B); print(&A, "A"); print(&B, "B"); print(&C, "C"); /* free the host's memory */ free(A.ptr); free(B.ptr); free(C.ptr); return 0; } jacobi/serial/not_opt/jacobi_serial_not_opt_len14 0 → 100755 +17 KiB File added.No diff preview for this file type. View file Loading
cuda-omp/omp/7/mat_mult.c 0 → 100644 +185 −0 Original line number Diff line number Diff line //////////////////////////////////////////////////////////////////////////////////////////////// // - Naive matrix multiplication algorithm // for (size_t i=0 ; i<N ; i++) // for (size_t j=0 ; j<N ; j++) // for (size_t k=0 ; k<_N ; k++) // C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j]; // //////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////// // Author: David Goz // mail : david.goz@inaf.it // date : 31.07.2024 // code tested using nvhpc // // - Compile the code: // $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork.c -o classwork_omp // - Run the code: // $ ./classwork_omp ////////////////////////////////////////////////////////////////////////////////////////////////// #include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <time.h> #include <assert.h> #include <omp.h> #include <string.h> #define N 512 #define SIZE (N * N) // matrix size typedef double MyData; // do not change #define LOOP 100 #define NDEBUG double wall_time() { struct timespec ts; clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9; return ret; } void CPU_mat_mult(const MyData *const restrict A, const MyData *const restrict B, MyData *const restrict C, const size_t size) { for (size_t i=0 ; i<size ; i++) for (size_t j=0 ; j<size ; j++) for (size_t k=0 ; k<size ; k++) C[(i * size) + j] += (A[(i * size) + k] * B[(k * size) + j]); return; } void GPU_mat_mult(const MyData *const restrict A, const MyData *const restrict B, MyData *const restrict C, const size_t size) { #pragma omp target { #pragma omp teams distribute num_teams(size) for (size_t i=0 ; i<size ; i++) { #pragma omp parallel for num_threads(size) for (size_t j=0 ; j<size ; j++) { MyData value = (MyData)0; for (size_t k=0 ; k<size ; k++) value += (A[(i * size) + k] * B[(k * size) + j]); C[(i * size) + j] = value; } // omp thread } // omp teams } // omp target return; } void GPU_mat_mult_no_loops(const MyData *const restrict A, const MyData *const restrict B, MyData *const restrict C, const size_t size) { #pragma omp target { #pragma omp teams num_teams(size) { const size_t team_size = (size * omp_get_team_num()); #pragma omp parallel firstprivate(team_size) num_threads(size) { const size_t tid = omp_get_thread_num(); MyData value = (MyData)0; for (size_t k=0 ; k<size ; k++) value += (A[team_size + k] * B[(k * size) + tid]); C[team_size + tid] = value; } // omp threads } // omp teams } // omp target return; } void check(const MyData *const __restrict__ cpu_matrix, const MyData *const __restrict__ gpu_matrix) { int flag; for (size_t i=0 ; i<SIZE ; i++) flag = ((cpu_matrix[i] != gpu_matrix[i]) ? 1 : 0); if (!flag) printf("\n\t Result OK"); else printf("\n\t Result wrong"); return; } int main() { double time; MyData *buffer = (MyData *)calloc(4 * SIZE, sizeof(MyData)); assert(buffer != NULL); // host reference matrix A MyData *const restrict A = buffer; MyData *const restrict B = A + SIZE; MyData *const restrict C_CPU = B + SIZE; MyData *const restrict C_GPU = C_CPU + SIZE; for (size_t i=0 ; i<SIZE ; i++) { A[i] = drand48(); B[i] = drand48(); } ////////////////////////// CPU naive algorithm ////////////////////////////////////////// CPU_mat_mult(A, B, C_CPU, N); ///////////////////////////////////////////////////////////////////////////////////////// // copy/alloc data to the GPU #pragma omp target enter data map(to: A[0:SIZE], B[0:SIZE]) map(alloc: C_GPU[0:SIZE]) /////////////////////////// GPU naive algorithm //////////////////////////////////////// time = 0.0; for (unsigned short int loop=0 ; loop<LOOP ; loop++) { const double start = wall_time(); GPU_mat_mult(A, B, C_GPU, N); time += (wall_time() - start); } #pragma omp target update from(C_GPU[0:SIZE]) check(C_CPU, C_GPU); printf("\n\t GPU naive time %lg [s]\n", (time / LOOP)); //////////////////////////////////////////////////////////////////////////////// /////////////////////////// GPU naive no loops algorithm //////////////////////////// time = 0.0; for (unsigned short int loop=0 ; loop<LOOP ; loop++) { const double start = wall_time(); GPU_mat_mult_no_loops(A, B, C_GPU, N); time += (wall_time() - start); } #pragma omp target update from(C_GPU[0:SIZE]) check(C_CPU, C_GPU); printf("\n\t GPU naive no loops time %lg [s]\n", (time / LOOP)); //////////////////////////////////////////////////////////////////////////////// // free CPU memory free(buffer); // free GPU memory #pragma omp target exit data map(delete: A[0:SIZE], B[0:SIZE], C_CPU[0:SIZE]) printf("\n"); return EXIT_SUCCESS; }
cuda-omp/omp/7/mat_mult_block.c 0 → 100644 +208 −0 Original line number Diff line number Diff line //////////////////////////////////////////////////////////////////////////////////////////////// // - Block matrix multiplication algorithm // // const size_t Nblocks = (N / Bsize); // // // loop over blocks of matrix C // for (size_t ib=0 ; ib<Nblocks ; ib++) // { // for (size_t jb=0 ; jb<Nblocks ; jb++) // { // // // loop over blocks of rows of A // for (size_t kb=0 ; kb<Nblocks ; kb++) // // // // for (size_t i=0 ; i<N ; i++) // for (size_t j=0 ; j<N ; j++) // for (size_t k=0 ; k<_N ; k++) // C[(i * N) + j] += A[(i * N) + k] * B[(k * N) + j]; // // } // jb // } // ib // - Exploit shared-memory. //////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////// // Author: David Goz // mail : david.goz@inaf.it // date : 31.07.2024 // code tested using nvhpc // // - Compile the code: // $ nvc -mp=gpu -gpu=ccnative,debug,lineinfo -target=gpu -Minfo=all -v classwork.c -o classwork_omp // - Run the code: // $ ./classwork_omp ////////////////////////////////////////////////////////////////////////////////////////////////// #include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <time.h> #include <assert.h> #include <omp.h> #include <string.h> #define N 512 #define SIZE (N * N) // matrix size typedef double MyData; // do not change #define BLOCKSIZE 32 // number of threads per block // sanity check #if BLOCKSIZE > 1024 #error BLOCKSIZE cannot be larger than 1024 #endif #define LOOP 100 #define NDEBUG double wall_time() { struct timespec ts; clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); const double ret = (double) (ts.tv_sec) + (double) ts.tv_nsec * 1.0e-9; return ret; } void CPU_mat_mult(const MyData *const restrict A, const MyData *const restrict B, MyData *const restrict C, const size_t size) { for (size_t i=0 ; i<size ; i++) for (size_t j=0 ; j<size ; j++) for (size_t k=0 ; k<size ; k++) C[(i * size) + j] += (A[(i * size) + k] * B[(k * size) + j]); return; } void GPU_mat_mult(const MyData *const restrict A, const MyData *const restrict B, MyData *const restrict C, const size_t size) { #pragma omp target { #pragma omp teams distribute num_teams(size) for (size_t i=0 ; i<size ; i++) { #pragma omp parallel for num_threads(size) for (size_t j=0 ; j<size ; j++) { MyData value = (MyData)0; for (size_t k=0 ; k<size ; k++) value += (A[(i * size) + k] * B[(k * size) + j]); C[(i * size) + j] = value; } // omp thread } // omp teams } // omp target return; } void GPU_mat_mult_no_loops(const MyData *const restrict A, const MyData *const restrict B, MyData *const restrict C, const size_t size) { #pragma omp target { #pragma omp teams num_teams(size) { const size_t team_size = (size * omp_get_team_num()); #pragma omp parallel firstprivate(team_size) num_threads(size) { const size_t tid = omp_get_thread_num(); MyData value = (MyData)0; for (size_t k=0 ; k<size ; k++) value += (A[team_size + k] * B[(k * size) + tid]); C[team_size + tid] = value; } // omp threads } // omp teams } // omp target return; } void check(const MyData *const __restrict__ cpu_matrix, const MyData *const __restrict__ gpu_matrix) { int flag; for (size_t i=0 ; i<SIZE ; i++) flag = ((cpu_matrix[i] != gpu_matrix[i]) ? 1 : 0); if (!flag) printf("\n\t Result OK"); else printf("\n\t Result wrong"); return; } int main() { double time; MyData *buffer = (MyData *)calloc(4 * SIZE, sizeof(MyData)); assert(buffer != NULL); // host reference matrix A MyData *const restrict A = buffer; MyData *const restrict B = A + SIZE; MyData *const restrict C_CPU = B + SIZE; MyData *const restrict C_GPU = C_CPU + SIZE; for (size_t i=0 ; i<SIZE ; i++) { A[i] = drand48(); B[i] = drand48(); } ////////////////////////// CPU naive algorithm ////////////////////////////////////////// CPU_mat_mult(A, B, C_CPU, N); ///////////////////////////////////////////////////////////////////////////////////////// // copy/alloc data to the GPU #pragma omp target enter data map(to: A[0:SIZE], B[0:SIZE]) map(alloc: C_GPU[0:SIZE]) /////////////////////////// GPU naive algorithm //////////////////////////////////////// time = 0.0; for (unsigned short int loop=0 ; loop<LOOP ; loop++) { const double start = wall_time(); GPU_mat_mult(A, B, C_GPU, N); time += (wall_time() - start); } #pragma omp target update from(C_GPU[0:SIZE]) check(C_CPU, C_GPU); printf("\n\t GPU naive time %lg [s]\n", (time / LOOP)); //////////////////////////////////////////////////////////////////////////////// /////////////////////////// GPU naive no loops algorithm //////////////////////////// time = 0.0; for (unsigned short int loop=0 ; loop<LOOP ; loop++) { const double start = wall_time(); GPU_mat_mult_no_loops(A, B, C_GPU, N); time += (wall_time() - start); } #pragma omp target update from(C_GPU[0:SIZE]) check(C_CPU, C_GPU); printf("\n\t GPU naive no loops time %lg [s]\n", (time / LOOP)); //////////////////////////////////////////////////////////////////////////////// // free CPU memory free(buffer); // free GPU memory #pragma omp target exit data map(delete: A[0:SIZE], B[0:SIZE], C_CPU[0:SIZE]) printf("\n"); return EXIT_SUCCESS; }
cuda-omp/omp/miscellaneous/structure.c +34 −25 Original line number Diff line number Diff line Loading @@ -17,7 +17,8 @@ #include <assert.h> #include <omp.h> #define N 8 #define SIZE 8 #define SIZE_2 (SIZE / 2) typedef double MyData; typedef struct my_span Loading @@ -29,26 +30,20 @@ typedef struct my_span void allocate( span *my_struct, const size_t size) { span tmp; /* allocate the buffer on the host memory */ tmp.ptr = (MyData *)malloc(size * sizeof(MyData)); assert(tmp.ptr != NULL); tmp.N = size; /* declare how the object 'span' has to be mapped on the device */ #pragma omp declare mapper(span tmp) map(from: tmp, tmp.ptr[0: tmp.N]) my_struct->ptr = tmp.ptr; my_struct->N = tmp.N; my_struct->ptr = (MyData *)calloc(size, sizeof(MyData)); assert(my_struct->ptr != NULL); my_struct->N = size; return; } void print(const span *const A) void print(const span *const A, const char *const string) { printf("\n"); for (size_t i=0 ; i<A->N ; i++) printf("\n\t array[%i] = %lg", i, A->ptr[i]); for (int i=0 ; i<A->N ; i++) printf("\n\t %s[%d] = %lg", string, i, A->ptr[i]); printf("\n\n"); return; Loading @@ -56,27 +51,41 @@ void print(const span *const A) int main() { span A, B; span A, B, C; allocate(&A, N); allocate(&B, N); allocate(&A, SIZE); allocate(&B, SIZE); allocate(&C, SIZE); /* declare how the object 'span' has to be mapped on the device */ #pragma omp declare mapper(all : span S) map(to: S) map(from: S.ptr[0 : S.N]) #pragma omp declare mapper(left : span S) map(to: S) map(from: S.ptr[0 : S.N/2]) #pragma omp declare mapper(right: span S) map(to: S) map(from: S.ptr[S.N/2: S.N/2]) /* init on the GPU */ #pragma omp target { for (size_t i=0 ; i<N ; i++) #pragma omp target map(mapper(all): A) map(mapper(left): B) map(mapper(right): C) { #pragma omp loop for (size_t i=0 ; i<SIZE ; i++) A.ptr[i] = (MyData)(i); B.ptr[i] = (MyData)(2 * i); } #pragma omp loop for (size_t ii=0 ; ii<SIZE_2 ; ii++) B.ptr[ii] = (MyData)(ii); #pragma omp loop for (size_t iii=SIZE_2 ; iii<SIZE ; iii++) C.ptr[iii] = (MyData)(iii); } print(&A); print(&B); print(&A, "A"); print(&B, "B"); print(&C, "C"); /* free the host's memory */ free(A.ptr); free(B.ptr); free(C.ptr); return 0; }
jacobi/serial/not_opt/jacobi_serial_not_opt_len14 0 → 100755 +17 KiB File added.No diff preview for this file type. View file