//////////////////////////////////////////////////////////////////////////////////////////////////
// File: cuda-omp/cuda/1/classwork_1.cu
//
// Assignment: write a CUDA code corresponding to the following sequential C code
//
// #include <stdio.h>
// #define N 100
// int main()
// {
//   for (int i=0 ; i<N ; i++)
//     printf("%d\n", (i * i));
//   return 0;
// }
//////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 06.07.2024
//
// - Compile the code:
//   $ nvcc classwork_1.cu -o classwork_1_cuda
// - Run the code:
//   $ ./classwork_1_cuda
// - Check the result:
//   $ ./classwork_1_cuda | tail -n 100 | sort -nk 5
//////////////////////////////////////////////////////////////////////////////////////////////////

#include <stdio.h>
#include <cuda.h>

#define N        100
#define NThreads 1024

// Kernel: each thread prints its global index and its square.
// Launch layout: 1D grid of 1D blocks; threads past 'size' exit immediately.
__global__ void GPUkernel(const int size)
{
  const int myID = threadIdx.x + (blockIdx.x * blockDim.x);

  // Guard: the grid may contain more threads than work items.
  if (myID >= size)
    return;

  // C printf is supported in CUDA device code (serialized - debugging only);
  // the C++ cout class is not supported in device code.
  printf("Hello from CUDA thread: %d - result %d\n", myID, (myID * myID));

  return;
}

int main()
{
  printf("\n\t The host issues the kernel on the GPU \n");

  // Kernel launch: a single block of NThreads threads suffices since N <= NThreads.
  GPUkernel<<<1, NThreads>>>(N);

  // Kernel launches return no status; catch launch-configuration errors here.
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
    fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(err));

  printf("\n\t cudaDeviceSynchronize \n");

  // Block until the kernel finishes; surfaces asynchronous execution errors.
  err = cudaDeviceSynchronize();
  if (err != cudaSuccess)
    fprintf(stderr, "cudaDeviceSynchronize failed: %s\n", cudaGetErrorString(err));

  return 0;
}

//////////////////////////////////////////////////////////////////////////////////////////////////
// File: cuda-omp/cuda/1/classwork_2.cu
//
// Assignment: write a CUDA code corresponding to the following sequential C code
//
// #include <stdio.h>
// #define N 100
// int main()
// {
//   int A[N];
//
//   for (int i=0 ; i<N ; i++)
//     A[i] = (i * i);
//
//   return 0;
// }
//////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 17.11.2022
//
// - Compile the code:
//   $ nvcc classwork_2.cu -o classwork_2
// - Run the code:
//   $ ./classwork_2
// - Check the result:
//   $ ./classwork_2 | tail -n 100 | sort -nk 5
//////////////////////////////////////////////////////////////////////////////////////////////////

#include <iostream>
#include <stdlib.h>
#include <cuda.h>

#define N        100
#define NThreads 1024

// Kernel: each thread stores the square of its global index into A.
__global__ void GPUkernel(int *A, const int size)
{
  const int myID = threadIdx.x + (blockIdx.x * blockDim.x);

  // Guard: the grid may contain more threads than array elements.
  if (myID < size)
    A[myID] = (myID * myID);

  return;
}

int main()
{
  // Allocate unified (managed) memory directly accessible from both
  // host and device; the CUDA runtime handles the migration.
  int *A = NULL;
  const size_t size = (N * sizeof(int));

  cudaError_t error = cudaMallocManaged(&A, size);
  if (error == cudaSuccess)
    std::cout << "Memory allocated for the host/device" << std::endl;
  else
    {
      std::cout << "Cannot allocate memory for the host/device. CUDA error : "
                << cudaGetErrorString(error) << " ... aborting" << std::endl;
      exit(EXIT_FAILURE);
    }

  // Kernel launch: one block of NThreads threads suffices since N <= NThreads.
  GPUkernel<<<1, NThreads>>>(A, N);

  // Kernel launches return no status; catch launch-configuration errors here.
  error = cudaGetLastError();
  if (error != cudaSuccess)
    std::cout << "Kernel launch failed: " << cudaGetErrorString(error) << std::endl;

  // The host must not read the managed array before the kernel has finished.
  error = cudaDeviceSynchronize();
  if (error != cudaSuccess)
    std::cout << "cudaDeviceSynchronize failed: " << cudaGetErrorString(error) << std::endl;

  // Check the result on the host.
  for (size_t i = 0; i < N; i++)
    std::cout << "A[" << i << "] - Result: " << A[i] << std::endl;

  // Free the managed memory.
  cudaFree(A);

  return 0;
}

//////////////////////////////////////////////////////////////////////////////////////////////////
// File: cuda-omp/omp/1/classwork_1.c
//
// OpenMP GPU Offload is available only on systems with NVIDIA GPUs with
// compute capability '>= cc70'
//
// Assignment: write an OMP-GPU code corresponding to the following sequential C code
//
// #include <stdio.h>
// #define N 100
// int main()
// {
//   for (int i=0 ; i<N ; i++)
//     printf("%d\n", (i * i));
//   return 0;
// }
//////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 06.07.2024
//
// - Compile the code with an OpenMP compiler that supports NVIDIA offload, e.g.:
//   $ nvc -mp=gpu classwork_1.c -o classwork_1_omp
//   $ clang -fopenmp -fopenmp-targets=nvptx64 classwork_1.c -o classwork_1_omp
// - Run the code:
//   $ ./classwork_1_omp
// - Check the result:
//   $ ./classwork_1_omp | tail -n 100 | sort -nk 5
//////////////////////////////////////////////////////////////////////////////////////////////////

#include <stdio.h>
#include <omp.h>

#define N        100
#define NThreads 1024

// Offload the whole loop to the device and run it serially there.
void GPUkernelSerial(const int size)
{
#pragma omp target
  {
    // omp_is_initial_device() is true when the region runs on the host,
    // i.e. when offload fell back to the CPU.
    if (!omp_is_initial_device())
      printf("\n\t GPU is executing GPUkernelSerial\n" );
    else
      printf("\n\t CPU is executing GPUkernelSerial\n" );

    for (int i = 0; i < size; i++)
      printf("Hello from OMP-GPU thread: %d - result %d\n", i, (i * i));
  }

  return;
}

int main()
{
  printf("\n\t The host issues the kernel on the GPU \n");

  /* kernel launch using one GPU thread */
  GPUkernelSerial(N);

  return 0;
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// File: cuda-omp/cuda/1/classwork_1.cu
//
// Assignment: write a CUDA code corresponding to the following sequential C code
//
// #include <stdio.h>
// #define N 100
// int main()
// {
//   for (int i=0 ; i<N ; i++)
//     printf("%d\n", (i * i));
//   return 0;
// }
//////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 06.07.2024
//
// - Compile the code:
//   $ nvcc classwork_1.cu -o classwork_1_cuda
// - Run the code:
//   $ ./classwork_1_cuda
// - Check the result:
//   $ ./classwork_1_cuda | tail -n 100 | sort -nk 5
//////////////////////////////////////////////////////////////////////////////////////////////////

#include <stdio.h>
#include <cuda.h>

#define N        100
#define NThreads 1024

// Kernel: each thread prints its global index and its square.
// Launch layout: 1D grid of 1D blocks; threads past 'size' exit immediately.
__global__ void GPUkernel(const int size)
{
  const int myID = threadIdx.x + (blockIdx.x * blockDim.x);

  // Guard: the grid may contain more threads than work items.
  if (myID >= size)
    return;

  // C printf is supported in CUDA device code (serialized - debugging only);
  // the C++ cout class is not supported in device code.
  printf("Hello from CUDA thread: %d - result %d\n", myID, (myID * myID));

  return;
}

int main()
{
  printf("\n\t The host issues the kernel on the GPU \n");

  // Kernel launch: a single block of NThreads threads suffices since N <= NThreads.
  GPUkernel<<<1, NThreads>>>(N);

  // Kernel launches return no status; catch launch-configuration errors here.
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
    fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(err));

  printf("\n\t cudaDeviceSynchronize \n");

  // Block until the kernel finishes; surfaces asynchronous execution errors.
  err = cudaDeviceSynchronize();
  if (err != cudaSuccess)
    fprintf(stderr, "cudaDeviceSynchronize failed: %s\n", cudaGetErrorString(err));

  return 0;
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// File: cuda-omp/cuda/1/classwork_2.cu
//
// Assignment: write a CUDA code corresponding to the following sequential C code
//
// #include <stdio.h>
// #define N 100
// int main()
// {
//   int A[N];
//
//   for (int i=0 ; i<N ; i++)
//     A[i] = (i * i);
//
//   return 0;
// }
//////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 17.11.2022
//
// - Compile the code:
//   $ nvcc classwork_2.cu -o classwork_2
// - Run the code:
//   $ ./classwork_2
// - Check the result:
//   $ ./classwork_2 | tail -n 100 | sort -nk 5
//////////////////////////////////////////////////////////////////////////////////////////////////

#include <iostream>
#include <stdlib.h>
#include <cuda.h>

#define N        100
#define NThreads 1024

// Kernel: each thread stores the square of its global index into A.
__global__ void GPUkernel(int *A, const int size)
{
  const int myID = threadIdx.x + (blockIdx.x * blockDim.x);

  // Guard: the grid may contain more threads than array elements.
  if (myID < size)
    A[myID] = (myID * myID);

  return;
}

int main()
{
  // Allocate unified (managed) memory directly accessible from both
  // host and device; the CUDA runtime handles the migration.
  int *A = NULL;
  const size_t size = (N * sizeof(int));

  cudaError_t error = cudaMallocManaged(&A, size);
  if (error == cudaSuccess)
    std::cout << "Memory allocated for the host/device" << std::endl;
  else
    {
      std::cout << "Cannot allocate memory for the host/device. CUDA error : "
                << cudaGetErrorString(error) << " ... aborting" << std::endl;
      exit(EXIT_FAILURE);
    }

  // Kernel launch: one block of NThreads threads suffices since N <= NThreads.
  GPUkernel<<<1, NThreads>>>(A, N);

  // Kernel launches return no status; catch launch-configuration errors here.
  error = cudaGetLastError();
  if (error != cudaSuccess)
    std::cout << "Kernel launch failed: " << cudaGetErrorString(error) << std::endl;

  // The host must not read the managed array before the kernel has finished.
  error = cudaDeviceSynchronize();
  if (error != cudaSuccess)
    std::cout << "cudaDeviceSynchronize failed: " << cudaGetErrorString(error) << std::endl;

  // Check the result on the host.
  for (size_t i = 0; i < N; i++)
    std::cout << "A[" << i << "] - Result: " << A[i] << std::endl;

  // Free the managed memory.
  cudaFree(A);

  return 0;
}
//////////////////////////////////////////////////////////////////////////////////////////////////
// File: cuda-omp/omp/1/classwork_1.c
//
// OpenMP GPU Offload is available only on systems with NVIDIA GPUs with
// compute capability '>= cc70'
//
// Assignment: write an OMP-GPU code corresponding to the following sequential C code
//
// #include <stdio.h>
// #define N 100
// int main()
// {
//   for (int i=0 ; i<N ; i++)
//     printf("%d\n", (i * i));
//   return 0;
// }
//////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////
// Author: David Goz
// mail  : david.goz@inaf.it
// date  : 06.07.2024
//
// - Compile the code with an OpenMP compiler that supports NVIDIA offload
//   (plain nvcc does not process OpenMP target regions), e.g.:
//   $ nvc -mp=gpu classwork_1.c -o classwork_1_omp
//   $ clang -fopenmp -fopenmp-targets=nvptx64 classwork_1.c -o classwork_1_omp
// - Run the code:
//   $ ./classwork_1_omp
// - Check the result:
//   $ ./classwork_1_omp | tail -n 100 | sort -nk 5
//////////////////////////////////////////////////////////////////////////////////////////////////

#include <stdio.h>
#include <omp.h>

#define N        100
#define NThreads 1024

// Offload the whole loop to the device and run it serially there.
void GPUkernelSerial(const int size)
{
#pragma omp target
  {
    // omp_is_initial_device() is true when the region runs on the host,
    // i.e. when offload fell back to the CPU.
    if (!omp_is_initial_device())
      printf("\n\t GPU is executing GPUkernelSerial\n" );
    else
      printf("\n\t CPU is executing GPUkernelSerial\n" );

    for (int i = 0; i < size; i++)
      printf("Hello from OMP-GPU thread: %d - result %d\n", i, (i * i));
  }

  return;
}

int main()
{
  printf("\n\t The host issues the kernel on the GPU \n");

  /* kernel launch using one GPU thread */
  GPUkernelSerial(N);

  return 0;
}