Presentation is loading. Please wait.

Presentation is loading. Please wait.

CUDA and OpenCL Kernels

Similar presentations


Presentation on theme: "CUDA and OpenCL Kernels"— Presentation transcript:

1 CUDA and OpenCL Kernels
Workshop on Reduction: Optimizing Simple CUDA and OpenCL Kernels — Chris Szalwinski, December

2 Overview Reduction Algorithm Terminology – CUDA and OpenCL
Test Problem – Dot Product Test Parameters Execution Configuration Divergence Register Storage

3 Reduction Algorithm Select the Largest Value

4 Minimum - Naive Reduction

5 Terminology - Configuration
CUDA Grid Blocks Threads OpenCL Workspace Workgroups Work Items

6 Terminology - Memory CUDA Global Shared Local OpenCL Global Local
Private

7 Dot Product Vector a – n elements – a[0] a[1] … a[n-1]
Vector b – n elements – b[0] b[1] … b[n-1] Dot Product = a[0] * b[0] + a[1] * b[1] + … + a[n-1] * b[n-1] 2 stages Multiply matching elements Sum their products

8 CUDA – Naive Dot Product
const int ntpb = 128; // number of threads per block // Shared Memory // __global__ void dot_D(const float* a, const float* b, float* c, int n) { int gid = blockIdx.x * blockDim.x + threadIdx.x; int tid = threadIdx.x; __shared__ float s[ntpb]; // store product in shared memory if (gid < n) s[tid] = a[gid] * b[gid]; else s[tid] = 0; __syncthreads(); // reduce shared memory entries for (int stride = 1; stride < blockDim.x; stride *= 2) { if (tid % (2 * stride) == 0) s[tid] += s[tid + stride]; } if (tid == 0) c[blockIdx.x] = s[0];

9 OpenCL – Naive Dot Product
#define NWUPWG 128 // number of work-items per workgroup // Local Memory // __kernel void dot_D(__global const float* a, __global const float* b, __global float* c, int n) { int gid = get_global_id(0); int i = get_local_id(0); int size = get_local_size(0); int wgrp = get_group_id(0); __local float s[NWUPWG]; // store product in local memory if (gid < n) s[i] = a[gid] * b[gid]; else s[i] = 0; barrier(CLK_LOCAL_MEM_FENCE); // reduce local memory entries for (int stride = 1; stride < size; stride *= 2) { if (i % (2 * stride) == 0) s[i] += s[i + stride]; } if (i == 0) c[wgrp] = s[0];

10 Minimize Divergence

11 CUDA – Minimize Divergence
const int ntpb = 128; // number of threads per block // Shared Memory // __global__ void dot_DM(const float* a, const float* b, float* c, int n) { int gid = blockIdx.x * blockDim.x + threadIdx.x; int tid = threadIdx.x; __shared__ float s[ntpb]; // store product in shared memory if (gid < n) s[tid] = a[gid] * b[gid]; else s[tid] = 0; __syncthreads(); // reduce shared memory entries for (int stride = blockDim.x >> 1; stride > 0; stride >>= 1) { if (tid < stride) s[tid] += s[tid + stride]; } if (tid == 0) c[blockIdx.x] = s[0];

12 OpenCL – Minimize Divergence
#define NWUPWG 128 // number of work-items per workgroup // Local Memory // __kernel void dot_DM(__global const float* a, __global const float* b, __global float* c, int n) { int gid = get_global_id(0); int i = get_local_id(0); int size = get_local_size(0); int wgrp = get_group_id(0); __local float s[NWUPWG]; // store product in local memory if (gid < n) s[i] = a[gid] * b[gid]; else s[i] = 0; barrier(CLK_LOCAL_MEM_FENCE); // reduce local memory entries for (int stride = size >> 1; stride > 0; stride >>= 1) { if (i < stride) s[i] += s[i + stride]; } if (i == 0) c[wgrp] = s[0];

13 CUDA – Register Accumulator
const int ntpb = 128; // number of threads per block // Shared Memory // __global__ void dot_DMR(const float* a, const float* b, float* c, int n) { int gid = blockIdx.x * blockDim.x + threadIdx.x; int tid = threadIdx.x; __shared__ float s[ntpb]; float x = 0; // store product in shared memory if (gid < n) x = s[tid] = a[gid] * b[gid]; __syncthreads(); // reduce shared memory entries for (int stride = blockDim.x >> 1; stride > 0; stride >>= 1) { if (tid < stride) { x += s[tid + stride]; s[tid] = x; } if (tid == 0) c[blockIdx.x] = x;

14 OpenCL – Register Accumulator
#define NWUPWG 128 // number of work-items per workgroup // Local Memory // __kernel void dot_DMR(__global const float* a, __global const float* b, __global float* c, int n) { int gid = get_global_id(0); int i = get_local_id(0); int size = get_local_size(0); int wgrp = get_group_id(0); __local float s[NWUPWG]; // store product in local memory float x = 0; if (gid < n) x = s[i] = a[gid] * b[gid]; barrier(CLK_LOCAL_MEM_FENCE); // reduce local memory entries for (int stride = size >> 1; stride > 0; stride >>= 1) { if (i < stride) { x += s[i + stride]; s[i] = x; } if (i == 0) c[wgrp] = x;

15 CUDA Results

16 OpenCL Results


Download ppt "CUDA and OpenCL Kernels"

Similar presentations


Ads by Google