#include <stdio.h>
#include <stdlib.h>

// Abort with file/line context on any CUDA runtime failure. Kernel launches
// themselves return nothing, so launch errors are picked up through
// cudaGetLastError() and async execution errors at the next sync point.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Demonstrates a worst-case shared-memory bank conflict.
//
// Shared memory has 32 banks with a 4-byte word stride, so word indices that
// are 32 apart map to the same bank. Each lane of a warp reads
// shMem[lane * 32]: lane 0 -> word 0, lane 1 -> word 32, ... — all 32
// addresses land in bank 0, so the warp's accesses serialize (32-way
// conflict).
//
// Expected launch: 1-D grid and block; blockDim.x a multiple of 32.
// Each in-range thread folds one shared-memory word into output[0].
__global__ void bankConflictKernel(int* input, int* output, int numElements)
{
    __shared__ int shMem[1024];

    // Cooperatively stage up to the first 1024 input elements so that EVERY
    // block sees a fully initialized shared array. (Indexing shMem with the
    // global tid would leave most of the block-local array uninitialized for
    // blocks other than block 0.)
    int limit = numElements < 1024 ? numElements : 1024;
    for (int j = threadIdx.x; j < limit; j += blockDim.x) {
        shMem[j] = input[j];
    }
    // Barrier must be reached by ALL threads of the block, so it sits outside
    // any divergent branch.
    __syncthreads();

    int tid  = threadIdx.x + blockIdx.x * blockDim.x;
    int lane = threadIdx.x % 32;  // lane index 0..31 within the warp
    // Using the lane keeps the strided index in bounds (31 * 32 = 992 < 1024);
    // the unclamped threadIdx.x * 32 would read far past the end of shMem for
    // threadIdx.x >= 32.
    if (tid < numElements && lane * 32 < limit) {
        int value = shMem[lane * 32];  // intentional 32-way bank conflict
        atomicAdd(&output[0], value);  // accumulate for the host-side check
    }
}

// Same stride-32 access pattern, with one padding word inserted per 32 data
// words to remove the conflicts.
//
// Logical index i is stored at physical index i + i/32. The stride-32 read of
// logical index 32*k then touches physical index 33*k, and 33*k mod 32 == k,
// so each lane of the warp hits a distinct bank — conflict-free.
//
// Expected launch: identical to bankConflictKernel; accumulates into
// output[1] so the host can compare the two sums.
__global__ void bankConflictMitigationKernel(int* input, int* output, int numElements)
{
    const int SHMEM_SIZE = 1024;
    // One extra word per 32 data words shifts every stride-32 access into a
    // new bank.
    __shared__ int shMemPadded[SHMEM_SIZE + SHMEM_SIZE / 32];

    int limit = numElements < SHMEM_SIZE ? numElements : SHMEM_SIZE;
    for (int j = threadIdx.x; j < limit; j += blockDim.x) {
        shMemPadded[j + j / 32] = input[j];  // logical j -> physical j + j/32
    }
    __syncthreads();  // outside any divergent branch

    int tid  = threadIdx.x + blockIdx.x * blockDim.x;
    int lane = threadIdx.x % 32;
    if (tid < numElements && lane * 32 < limit) {
        int logical = lane * 32;                          // same stride as the conflict kernel
        int value = shMemPadded[logical + logical / 32];  // physical 33*lane -> bank `lane`
        atomicAdd(&output[1], value);
    }
}

// Runs both kernels on the same input and prints the two accumulated sums.
// They must be equal; the bank-conflict cost shows up only as execution time
// under a profiler (Nsight Compute / nvprof), not in the results.
int main(void)
{
    const int numElements = 1024;

    int* h_input  = (int*)malloc(numElements * sizeof(int));
    int* h_output = (int*)malloc(2 * sizeof(int));  // [0] conflict, [1] mitigated
    if (h_input == NULL || h_output == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }
    h_output[0] = 0;
    h_output[1] = 0;
    for (int i = 0; i < numElements; ++i) {
        h_input[i] = i + 1;  // 1..numElements so a zero sum is visibly wrong
    }

    int* d_input;
    int* d_output;
    CUDA_CHECK(cudaMalloc((void**)&d_input, numElements * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**)&d_output, 2 * sizeof(int)));
    CUDA_CHECK(cudaMemcpy(d_input, h_input, numElements * sizeof(int),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_output, h_output, 2 * sizeof(int),
                          cudaMemcpyHostToDevice));

    int blockSize = 256;
    int gridSize  = (numElements + blockSize - 1) / blockSize;  // ceil-div

    // Kernel with the intentional bank conflict.
    bankConflictKernel<<<gridSize, blockSize>>>(d_input, d_output, numElements);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // Kernel with the padding-based mitigation.
    bankConflictMitigationKernel<<<gridSize, blockSize>>>(d_input, d_output, numElements);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    CUDA_CHECK(cudaMemcpy(h_output, d_output, 2 * sizeof(int),
                          cudaMemcpyDeviceToHost));

    printf("Sum with bank conflict: %d\n", h_output[0]);
    printf("Sum with bank conflict mitigation: %d\n", h_output[1]);

    CUDA_CHECK(cudaFree(d_input));
    CUDA_CHECK(cudaFree(d_output));
    free(h_input);
    free(h_output);
    return 0;
}

// ==== file: cuda-omp/omp/miscellaneous/omp_version.cpp ====

#include <iostream>  // For C++
#include <map>
#include <string>

// Maps the value of the _OPENMP macro (YYYYMM release date) to the
// corresponding OpenMP specification name.
static const std::map<std::string, std::string> OPENMP_VERSION{
    {"200505", "OpenMP 2.5"},
    {"200805", "OpenMP 3.0"},
    {"201107", "OpenMP 3.1"},
    {"201307", "OpenMP 4.0"},
    {"201511", "OpenMP 4.5"},
    {"201811", "OpenMP 5.0"},
    {"202011", "OpenMP 5.1"},
    {"202111", "OpenMP 5.2"},
    {"202411", "OpenMP 6.0"}
};

// Prints the OpenMP version supported by the compiler, if any.
int main()
{
#ifdef _OPENMP
    const auto item = OPENMP_VERSION.find(std::to_string(_OPENMP));
    if (item != OPENMP_VERSION.end())
        std::cout << "\n\t OpenMP version: " << item->second << "\n" << std::endl;
    else
        std::cout << "\n\t Unknown OpenMP version: " << _OPENMP << "\n" << std::endl;
#else
    std::cout << "\n\t OpenMP is not supported by this compiler.\n" << std::endl;
#endif
    return 0;
}
#include <stdio.h>
#include <stdlib.h>

// Abort with file/line context on any CUDA runtime failure. Kernel launches
// themselves return nothing, so launch errors are picked up through
// cudaGetLastError() and async execution errors at the next sync point.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Demonstrates a worst-case shared-memory bank conflict.
//
// Shared memory has 32 banks with a 4-byte word stride, so word indices that
// are 32 apart map to the same bank. Each lane of a warp reads
// shMem[lane * 32]: lane 0 -> word 0, lane 1 -> word 32, ... — all 32
// addresses land in bank 0, so the warp's accesses serialize (32-way
// conflict).
//
// Expected launch: 1-D grid and block; blockDim.x a multiple of 32.
// Each in-range thread folds one shared-memory word into output[0].
__global__ void bankConflictKernel(int* input, int* output, int numElements)
{
    __shared__ int shMem[1024];

    // Cooperatively stage up to the first 1024 input elements so that EVERY
    // block sees a fully initialized shared array. (Indexing shMem with the
    // global tid would leave most of the block-local array uninitialized for
    // blocks other than block 0.)
    int limit = numElements < 1024 ? numElements : 1024;
    for (int j = threadIdx.x; j < limit; j += blockDim.x) {
        shMem[j] = input[j];
    }
    // Barrier must be reached by ALL threads of the block, so it sits outside
    // any divergent branch.
    __syncthreads();

    int tid  = threadIdx.x + blockIdx.x * blockDim.x;
    int lane = threadIdx.x % 32;  // lane index 0..31 within the warp
    // Using the lane keeps the strided index in bounds (31 * 32 = 992 < 1024);
    // the unclamped threadIdx.x * 32 would read far past the end of shMem for
    // threadIdx.x >= 32.
    if (tid < numElements && lane * 32 < limit) {
        int value = shMem[lane * 32];  // intentional 32-way bank conflict
        atomicAdd(&output[0], value);  // accumulate for the host-side check
    }
}

// Same stride-32 access pattern, with one padding word inserted per 32 data
// words to remove the conflicts.
//
// Logical index i is stored at physical index i + i/32. The stride-32 read of
// logical index 32*k then touches physical index 33*k, and 33*k mod 32 == k,
// so each lane of the warp hits a distinct bank — conflict-free.
//
// Expected launch: identical to bankConflictKernel; accumulates into
// output[1] so the host can compare the two sums.
__global__ void bankConflictMitigationKernel(int* input, int* output, int numElements)
{
    const int SHMEM_SIZE = 1024;
    // One extra word per 32 data words shifts every stride-32 access into a
    // new bank.
    __shared__ int shMemPadded[SHMEM_SIZE + SHMEM_SIZE / 32];

    int limit = numElements < SHMEM_SIZE ? numElements : SHMEM_SIZE;
    for (int j = threadIdx.x; j < limit; j += blockDim.x) {
        shMemPadded[j + j / 32] = input[j];  // logical j -> physical j + j/32
    }
    __syncthreads();  // outside any divergent branch

    int tid  = threadIdx.x + blockIdx.x * blockDim.x;
    int lane = threadIdx.x % 32;
    if (tid < numElements && lane * 32 < limit) {
        int logical = lane * 32;                          // same stride as the conflict kernel
        int value = shMemPadded[logical + logical / 32];  // physical 33*lane -> bank `lane`
        atomicAdd(&output[1], value);
    }
}

// Runs both kernels on the same input and prints the two accumulated sums.
// They must be equal; the bank-conflict cost shows up only as execution time
// under a profiler (Nsight Compute / nvprof), not in the results.
int main(void)
{
    const int numElements = 1024;

    int* h_input  = (int*)malloc(numElements * sizeof(int));
    int* h_output = (int*)malloc(2 * sizeof(int));  // [0] conflict, [1] mitigated
    if (h_input == NULL || h_output == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }
    h_output[0] = 0;
    h_output[1] = 0;
    for (int i = 0; i < numElements; ++i) {
        h_input[i] = i + 1;  // 1..numElements so a zero sum is visibly wrong
    }

    int* d_input;
    int* d_output;
    CUDA_CHECK(cudaMalloc((void**)&d_input, numElements * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**)&d_output, 2 * sizeof(int)));
    CUDA_CHECK(cudaMemcpy(d_input, h_input, numElements * sizeof(int),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_output, h_output, 2 * sizeof(int),
                          cudaMemcpyHostToDevice));

    int blockSize = 256;
    int gridSize  = (numElements + blockSize - 1) / blockSize;  // ceil-div

    // Kernel with the intentional bank conflict.
    bankConflictKernel<<<gridSize, blockSize>>>(d_input, d_output, numElements);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // Kernel with the padding-based mitigation.
    bankConflictMitigationKernel<<<gridSize, blockSize>>>(d_input, d_output, numElements);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    CUDA_CHECK(cudaMemcpy(h_output, d_output, 2 * sizeof(int),
                          cudaMemcpyDeviceToHost));

    printf("Sum with bank conflict: %d\n", h_output[0]);
    printf("Sum with bank conflict mitigation: %d\n", h_output[1]);

    CUDA_CHECK(cudaFree(d_input));
    CUDA_CHECK(cudaFree(d_output));
    free(h_input);
    free(h_output);
    return 0;
}
#include <iostream>  // For C++
#include <map>
#include <string>

// Maps the value of the _OPENMP macro (YYYYMM release date) to the
// corresponding OpenMP specification name.
static const std::map<std::string, std::string> OPENMP_VERSION{
    {"200505", "OpenMP 2.5"},
    {"200805", "OpenMP 3.0"},
    {"201107", "OpenMP 3.1"},
    {"201307", "OpenMP 4.0"},
    {"201511", "OpenMP 4.5"},
    {"201811", "OpenMP 5.0"},
    {"202011", "OpenMP 5.1"},
    {"202111", "OpenMP 5.2"},
    {"202411", "OpenMP 6.0"}
};

// Reports which OpenMP specification, if any, the compiler implements.
int main()
{
#ifdef _OPENMP
    // _OPENMP expands to an integer date stamp; the table is keyed by its
    // decimal string form.
    const std::string stamp = std::to_string(_OPENMP);
    const auto entry = OPENMP_VERSION.find(stamp);
    if (entry == OPENMP_VERSION.end())
        std::cout << "\n\t Unknown OpenMP version: " << _OPENMP << "\n" << std::endl;
    else
        std::cout << "\n\t OpenMP version: " << entry->second << "\n" << std::endl;
#else
    std::cout << "\n\t OpenMP is not supported by this compiler.\n" << std::endl;
#endif
    return 0;
}