#include <stdio.h>
#include <stdlib.h>

// Abort with file/line context on any CUDA runtime failure. Kernel launches
// themselves return nothing, so launch errors are picked up through
// cudaGetLastError() and async execution errors at the next sync point.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Demonstrates a worst-case shared-memory bank conflict.
//
// Shared memory has 32 banks with a 4-byte word stride, so word indices that
// are 32 apart map to the same bank. Each lane of a warp reads
// shMem[lane * 32]: lane 0 -> word 0, lane 1 -> word 32, ... — all 32
// addresses land in bank 0, so the warp's accesses serialize (32-way
// conflict).
//
// Expected launch: 1-D grid and block; blockDim.x a multiple of 32.
// Each in-range thread folds one shared-memory word into output[0].
__global__ void bankConflictKernel(int* input, int* output, int numElements)
{
    __shared__ int shMem[1024];

    // Cooperatively stage up to the first 1024 input elements so that EVERY
    // block sees a fully initialized shared array. (Indexing shMem with the
    // global tid would leave most of the block-local array uninitialized for
    // blocks other than block 0.)
    int limit = numElements < 1024 ? numElements : 1024;
    for (int j = threadIdx.x; j < limit; j += blockDim.x) {
        shMem[j] = input[j];
    }
    // Barrier must be reached by ALL threads of the block, so it sits outside
    // any divergent branch.
    __syncthreads();

    int tid  = threadIdx.x + blockIdx.x * blockDim.x;
    int lane = threadIdx.x % 32;  // lane index 0..31 within the warp
    // Using the lane keeps the strided index in bounds (31 * 32 = 992 < 1024);
    // the unclamped threadIdx.x * 32 would read far past the end of shMem for
    // threadIdx.x >= 32.
    if (tid < numElements && lane * 32 < limit) {
        int value = shMem[lane * 32];  // intentional 32-way bank conflict
        atomicAdd(&output[0], value);  // accumulate for the host-side check
    }
}

// Same stride-32 access pattern, with one padding word inserted per 32 data
// words to remove the conflicts.
//
// Logical index i is stored at physical index i + i/32. The stride-32 read of
// logical index 32*k then touches physical index 33*k, and 33*k mod 32 == k,
// so each lane of the warp hits a distinct bank — conflict-free.
//
// Expected launch: identical to bankConflictKernel; accumulates into
// output[1] so the host can compare the two sums.
__global__ void bankConflictMitigationKernel(int* input, int* output, int numElements)
{
    const int SHMEM_SIZE = 1024;
    // One extra word per 32 data words shifts every stride-32 access into a
    // new bank.
    __shared__ int shMemPadded[SHMEM_SIZE + SHMEM_SIZE / 32];

    int limit = numElements < SHMEM_SIZE ? numElements : SHMEM_SIZE;
    for (int j = threadIdx.x; j < limit; j += blockDim.x) {
        shMemPadded[j + j / 32] = input[j];  // logical j -> physical j + j/32
    }
    __syncthreads();  // outside any divergent branch

    int tid  = threadIdx.x + blockIdx.x * blockDim.x;
    int lane = threadIdx.x % 32;
    if (tid < numElements && lane * 32 < limit) {
        int logical = lane * 32;                          // same stride as the conflict kernel
        int value = shMemPadded[logical + logical / 32];  // physical 33*lane -> bank `lane`
        atomicAdd(&output[1], value);
    }
}

// Runs both kernels on the same input and prints the two accumulated sums.
// They must be equal; the bank-conflict cost shows up only as execution time
// under a profiler (Nsight Compute / nvprof), not in the results.
int main(void)
{
    const int numElements = 1024;

    int* h_input  = (int*)malloc(numElements * sizeof(int));
    int* h_output = (int*)malloc(2 * sizeof(int));  // [0] conflict, [1] mitigated
    if (h_input == NULL || h_output == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }
    h_output[0] = 0;
    h_output[1] = 0;
    for (int i = 0; i < numElements; ++i) {
        h_input[i] = i + 1;  // 1..numElements so a zero sum is visibly wrong
    }

    int* d_input;
    int* d_output;
    CUDA_CHECK(cudaMalloc((void**)&d_input, numElements * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**)&d_output, 2 * sizeof(int)));
    CUDA_CHECK(cudaMemcpy(d_input, h_input, numElements * sizeof(int),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_output, h_output, 2 * sizeof(int),
                          cudaMemcpyHostToDevice));

    int blockSize = 256;
    int gridSize  = (numElements + blockSize - 1) / blockSize;  // ceil-div

    // Kernel with the intentional bank conflict.
    bankConflictKernel<<<gridSize, blockSize>>>(d_input, d_output, numElements);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // Kernel with the padding-based mitigation.
    bankConflictMitigationKernel<<<gridSize, blockSize>>>(d_input, d_output, numElements);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    CUDA_CHECK(cudaMemcpy(h_output, d_output, 2 * sizeof(int),
                          cudaMemcpyDeviceToHost));

    printf("Sum with bank conflict: %d\n", h_output[0]);
    printf("Sum with bank conflict mitigation: %d\n", h_output[1]);

    CUDA_CHECK(cudaFree(d_input));
    CUDA_CHECK(cudaFree(d_output));
    free(h_input);
    free(h_output);
    return 0;
}

// ==== file: cuda-omp/omp/miscellaneous/omp_version.cpp ====

#include <iostream>  // For C++
#include <map>
#include <string>

// Maps the value of the _OPENMP macro (YYYYMM release date) to the
// corresponding OpenMP specification name.
static const std::map<std::string, std::string> OPENMP_VERSION{
    {"200505", "OpenMP 2.5"},
    {"200805", "OpenMP 3.0"},
    {"201107", "OpenMP 3.1"},
    {"201307", "OpenMP 4.0"},
    {"201511", "OpenMP 4.5"},
    {"201811", "OpenMP 5.0"},
    {"202011", "OpenMP 5.1"},
    {"202111", "OpenMP 5.2"},
    {"202411", "OpenMP 6.0"}
};

// Prints the OpenMP version supported by the compiler, if any.
int main()
{
#ifdef _OPENMP
    const auto item = OPENMP_VERSION.find(std::to_string(_OPENMP));
    if (item != OPENMP_VERSION.end())
        std::cout << "\n\t OpenMP version: " << item->second << "\n" << std::endl;
    else
        std::cout << "\n\t Unknown OpenMP version: " << _OPENMP << "\n" << std::endl;
#else
    std::cout << "\n\t OpenMP is not supported by this compiler.\n" << std::endl;
#endif
    return 0;
}
#include <stdio.h>
#include <stdlib.h>

// Abort with file/line context on any CUDA runtime failure. Kernel launches
// themselves return nothing, so launch errors are picked up through
// cudaGetLastError() and async execution errors at the next sync point.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Demonstrates a worst-case shared-memory bank conflict.
//
// Shared memory has 32 banks with a 4-byte word stride, so word indices that
// are 32 apart map to the same bank. Each lane of a warp reads
// shMem[lane * 32]: lane 0 -> word 0, lane 1 -> word 32, ... — all 32
// addresses land in bank 0, so the warp's accesses serialize (32-way
// conflict).
//
// Expected launch: 1-D grid and block; blockDim.x a multiple of 32.
// Each in-range thread folds one shared-memory word into output[0].
__global__ void bankConflictKernel(int* input, int* output, int numElements)
{
    __shared__ int shMem[1024];

    // Cooperatively stage up to the first 1024 input elements so that EVERY
    // block sees a fully initialized shared array. (Indexing shMem with the
    // global tid would leave most of the block-local array uninitialized for
    // blocks other than block 0.)
    int limit = numElements < 1024 ? numElements : 1024;
    for (int j = threadIdx.x; j < limit; j += blockDim.x) {
        shMem[j] = input[j];
    }
    // Barrier must be reached by ALL threads of the block, so it sits outside
    // any divergent branch.
    __syncthreads();

    int tid  = threadIdx.x + blockIdx.x * blockDim.x;
    int lane = threadIdx.x % 32;  // lane index 0..31 within the warp
    // Using the lane keeps the strided index in bounds (31 * 32 = 992 < 1024);
    // the unclamped threadIdx.x * 32 would read far past the end of shMem for
    // threadIdx.x >= 32.
    if (tid < numElements && lane * 32 < limit) {
        int value = shMem[lane * 32];  // intentional 32-way bank conflict
        atomicAdd(&output[0], value);  // accumulate for the host-side check
    }
}

// Same stride-32 access pattern, with one padding word inserted per 32 data
// words to remove the conflicts.
//
// Logical index i is stored at physical index i + i/32. The stride-32 read of
// logical index 32*k then touches physical index 33*k, and 33*k mod 32 == k,
// so each lane of the warp hits a distinct bank — conflict-free.
//
// Expected launch: identical to bankConflictKernel; accumulates into
// output[1] so the host can compare the two sums.
__global__ void bankConflictMitigationKernel(int* input, int* output, int numElements)
{
    const int SHMEM_SIZE = 1024;
    // One extra word per 32 data words shifts every stride-32 access into a
    // new bank.
    __shared__ int shMemPadded[SHMEM_SIZE + SHMEM_SIZE / 32];

    int limit = numElements < SHMEM_SIZE ? numElements : SHMEM_SIZE;
    for (int j = threadIdx.x; j < limit; j += blockDim.x) {
        shMemPadded[j + j / 32] = input[j];  // logical j -> physical j + j/32
    }
    __syncthreads();  // outside any divergent branch

    int tid  = threadIdx.x + blockIdx.x * blockDim.x;
    int lane = threadIdx.x % 32;
    if (tid < numElements && lane * 32 < limit) {
        int logical = lane * 32;                          // same stride as the conflict kernel
        int value = shMemPadded[logical + logical / 32];  // physical 33*lane -> bank `lane`
        atomicAdd(&output[1], value);
    }
}

// Runs both kernels on the same input and prints the two accumulated sums.
// They must be equal; the bank-conflict cost shows up only as execution time
// under a profiler (Nsight Compute / nvprof), not in the results.
int main(void)
{
    const int numElements = 1024;

    int* h_input  = (int*)malloc(numElements * sizeof(int));
    int* h_output = (int*)malloc(2 * sizeof(int));  // [0] conflict, [1] mitigated
    if (h_input == NULL || h_output == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }
    h_output[0] = 0;
    h_output[1] = 0;
    for (int i = 0; i < numElements; ++i) {
        h_input[i] = i + 1;  // 1..numElements so a zero sum is visibly wrong
    }

    int* d_input;
    int* d_output;
    CUDA_CHECK(cudaMalloc((void**)&d_input, numElements * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**)&d_output, 2 * sizeof(int)));
    CUDA_CHECK(cudaMemcpy(d_input, h_input, numElements * sizeof(int),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_output, h_output, 2 * sizeof(int),
                          cudaMemcpyHostToDevice));

    int blockSize = 256;
    int gridSize  = (numElements + blockSize - 1) / blockSize;  // ceil-div

    // Kernel with the intentional bank conflict.
    bankConflictKernel<<<gridSize, blockSize>>>(d_input, d_output, numElements);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // Kernel with the padding-based mitigation.
    bankConflictMitigationKernel<<<gridSize, blockSize>>>(d_input, d_output, numElements);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    CUDA_CHECK(cudaMemcpy(h_output, d_output, 2 * sizeof(int),
                          cudaMemcpyDeviceToHost));

    printf("Sum with bank conflict: %d\n", h_output[0]);
    printf("Sum with bank conflict mitigation: %d\n", h_output[1]);

    CUDA_CHECK(cudaFree(d_input));
    CUDA_CHECK(cudaFree(d_output));
    free(h_input);
    free(h_output);
    return 0;
}
#include <iostream>  // For C++
#include <map>
#include <string>

// Maps the value of the _OPENMP macro (YYYYMM release date) to the
// corresponding OpenMP specification name.
static const std::map<std::string, std::string> OPENMP_VERSION{
    {"200505", "OpenMP 2.5"},
    {"200805", "OpenMP 3.0"},
    {"201107", "OpenMP 3.1"},
    {"201307", "OpenMP 4.0"},
    {"201511", "OpenMP 4.5"},
    {"201811", "OpenMP 5.0"},
    {"202011", "OpenMP 5.1"},
    {"202111", "OpenMP 5.2"},
    {"202411", "OpenMP 6.0"}
};

// Reports which OpenMP specification, if any, the compiler implements.
int main()
{
#ifdef _OPENMP
    // _OPENMP expands to an integer date stamp; the table is keyed by its
    // decimal string form.
    const std::string stamp = std::to_string(_OPENMP);
    const auto entry = OPENMP_VERSION.find(stamp);
    if (entry == OPENMP_VERSION.end())
        std::cout << "\n\t Unknown OpenMP version: " << _OPENMP << "\n" << std::endl;
    else
        std::cout << "\n\t OpenMP version: " << entry->second << "\n" << std::endl;
#else
    std::cout << "\n\t OpenMP is not supported by this compiler.\n" << std::endl;
#endif
    return 0;
}