Commit 4d71ee9b authored by David Goz's avatar David Goz 😴
Browse files

CUDA bank conflict added

parent 5a864f5c
Loading
Loading
Loading
Loading
+154 −0
Original line number Diff line number Diff line
#include <stdio.h>
#include <stdlib.h>

// Demonstrates a shared-memory bank conflict: each warp reads elements that
// are 32 ints apart, so every lane of the warp hits bank 0 (32 banks,
// 4-byte words) and the accesses serialize.
//
// Expected launch: 1D grid, 1D block. Requires numElements >= blockDim.x.
// Static shared memory: 1024 ints (4 KB).
__global__ void bankConflictKernel(int* input, int* output, int numElements) {
    const int SHMEM_SIZE = 1024; // fixed demo size; shared array capacity

    __shared__ int shMem[SHMEM_SIZE];

    // Cooperative load of the first SHMEM_SIZE input elements into this
    // block's shared memory. Shared memory is per-block, so indexing must
    // be based on threadIdx.x (the original code indexed by the *global*
    // tid, which left shMem[0..] uninitialized in every block after the
    // first).
    for (int i = threadIdx.x; i < SHMEM_SIZE && i < numElements; i += blockDim.x) {
        shMem[i] = input[i];
    }

    // Barrier is placed outside any divergent branch: __syncthreads() must
    // be reached by every thread in the block (the original put it inside
    // an `if (tid < numElements)` guard).
    __syncthreads();

    // --- Bank conflict scenario ---
    // Thread t reads shMem[t * 32]. For 4-byte words and 32 banks,
    // addresses 0, 32, 64, ... all map to bank 0, so the 32 lanes of a
    // warp collide on the same bank and the reads are serialized.
    //
    // Bounds fix: threadIdx.x * 32 must stay inside the 1024-element
    // array, i.e. only threadIdx.x < 32 participates (the original read
    // shMem[8160] for threadIdx.x == 255 — out of bounds).
    int conflictIdx = threadIdx.x * 32; // intentional stride-32 access
    if (conflictIdx < SHMEM_SIZE && conflictIdx < numElements) {
        int value = shMem[conflictIdx];
        atomicAdd(&output[0], value); // accumulate for demonstration
    }
}

// Example: Mitigating Bank Conflict (Padding)
__global__ void bankConflictMitigationKernel(int* input, int* output, int numElements) {
    // Declare shared memory with padding
    // For 32 banks and 4-byte words, each bank holds words with addresses:
    // Bank 0: 0, 32, 64, ...
    // Bank 1: 1, 33, 65, ...
    // ...
    // Bank 31: 31, 63, 95, ...
    // To avoid conflicts when accessing elements with stride 32, we add padding.
    // We add 1 to the size for each 32 elements to shift the bank alignment.
    // A common strategy is to pad each "row" or "stride" by 1 element.
    // If we have rows of size BLOCK_WIDTH and we're accessing columns,
    // we declare shared memory as [BLOCK_HEIGHT][BLOCK_WIDTH + PADDING].
    // Here, for a 1D array where we want to access shMem[threadIdx.x * stride],
    // we can conceptually pad by adding a dummy element every 32 elements.
    // For simplicity and demonstration, let's use a fixed size and show the concept.

    const int SHMEM_SIZE = 1024;
    // Calculate effective size with padding. For every 32 elements, add 1 extra element.
    // This creates "holes" that shift the bank alignment.
    __shared__ int shMemPadded[SHMEM_SIZE + SHMEM_SIZE / 32]; // Simple padding for demonstration

    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < numElements) {
        if (tid < SHMEM_SIZE) {
            // Load data, accounting for padding
            // This mapping ensures that consecutive logical elements are not in consecutive physical banks
            // if accessed with a stride that would normally cause conflicts.
            // For this specific conflict (shMem[threadIdx.x * 32]), we need to
            // ensure shMemPadded[k*32] and shMemPadded[(k+1)*32] are not in the same bank.
            // By adding a padding element for every 32 elements, we essentially shift
            // the bank index.
            // Example:
            // Logical index `i` maps to `i + i / 32` in padded array.
            // shMemPadded[0] for logical 0 (bank 0)
            // shMemPadded[32] for logical 32 -> maps to shMemPadded[32 + 1] = shMemPadded[33] (bank 1)
            // shMemPadded[64] for logical 64 -> maps to shMemPadded[64 + 2] = shMemPadded[66] (bank 2)
            // So, consecutive logical strides of 32 elements now map to different banks.
            shMemPadded[tid + tid / 32] = input[tid];
        }
        __syncthreads();

        if (tid < SHMEM_SIZE) {
            int value = shMemPadded[threadIdx.x * 32 + (threadIdx.x * 32) / 32]; // Access with padding
            atomicAdd(&output[1], value); // Accumulate for demonstration
        }
        __syncthreads();
    }
}

// Abort with a diagnostic when a CUDA runtime call fails. Kernel launches
// return no status directly, so cudaGetLastError() must be checked after
// each launch and execution errors surface at the next synchronizing call.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

// Driver: runs the conflicting and the padded kernel on the same input and
// prints the two accumulated sums. The sums should match; the performance
// difference is visible only under a profiler (Nsight Compute / nvprof).
int main() {
    int numElements = 1024; // example size (matches the kernels' SHMEM_SIZE)
    int* h_input;
    int* d_input;
    int* h_output;
    int* d_output;

    h_input = (int*)malloc(numElements * sizeof(int));
    h_output = (int*)malloc(2 * sizeof(int)); // output[0]: conflict, output[1]: mitigated
    if (h_input == NULL || h_output == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        return EXIT_FAILURE;
    }
    h_output[0] = 0;
    h_output[1] = 0;

    for (int i = 0; i < numElements; ++i) {
        h_input[i] = i + 1; // initialize input with 1..numElements
    }

    CUDA_CHECK(cudaMalloc((void**)&d_input, numElements * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**)&d_output, 2 * sizeof(int)));

    CUDA_CHECK(cudaMemcpy(d_input, h_input, numElements * sizeof(int), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_output, h_output, 2 * sizeof(int), cudaMemcpyHostToDevice));

    int blockSize = 256;
    int gridSize = (numElements + blockSize - 1) / blockSize; // ceil-div

    // Launch kernel with bank conflict; check launch config and execution.
    bankConflictKernel<<<gridSize, blockSize>>>(d_input, d_output, numElements);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // Launch kernel with bank conflict mitigation.
    bankConflictMitigationKernel<<<gridSize, blockSize>>>(d_input, d_output, numElements);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    CUDA_CHECK(cudaMemcpy(h_output, d_output, 2 * sizeof(int), cudaMemcpyDeviceToHost));

    printf("Sum with bank conflict: %d\n", h_output[0]);
    printf("Sum with bank conflict mitigation: %d\n", h_output[1]);

    // Note: The actual performance difference due to bank conflicts
    // is best observed with profiling tools like nvprof or Nsight Compute.
    // The accumulated sums here will be the same if the logic is correct,
    // but the execution time will differ.

    CUDA_CHECK(cudaFree(d_input));
    CUDA_CHECK(cudaFree(d_output));
    free(h_input);
    free(h_output);

    return 0;
}
+31 −0
Original line number Diff line number Diff line
#include <iostream> // For C++
#include <map>
#include <string>

// Lookup table from the value of the _OPENMP version macro (a yyyymm date
// string, per the OpenMP specification) to the human-readable release name.
// Keys are strings because the macro's integer value is stringified via
// std::to_string before lookup.
static const std::map<std::string, std::string> OPENMP_VERSION{
  {"200505", "OpenMP 2.5"},
  {"200805", "OpenMP 3.0"},
  {"201107", "OpenMP 3.1"},
  {"201307", "OpenMP 4.0"},
  {"201511", "OpenMP 4.5"},
  {"201811", "OpenMP 5.0"},
  {"202011", "OpenMP 5.1"},
  {"202111", "OpenMP 5.2"},
  {"202411", "OpenMP 6.0"}
};

// Report the OpenMP level supported by the compiler, using the _OPENMP
// date macro (yyyymm) as a key into the OPENMP_VERSION lookup table.
int main()
{
#ifdef _OPENMP
  // Stringify the macro value and translate it to a release name.
  const std::string versionKey = std::to_string(_OPENMP);
  const auto found = OPENMP_VERSION.find(versionKey);

  if (found == OPENMP_VERSION.end())
    std::cout << "\n\t Unknown OpenMP version: " << _OPENMP << "\n" << std::endl;
  else
    std::cout << "\n\t OpenMP version: " << found->second << "\n" << std::endl;
#else
  std::cout << "\n\t OpenMP is not supported by this compiler.\n" << std::endl;
#endif

  return 0;
}