Programming of multiple GPUs with CUDA and Qt library


1 Programming of multiple GPUs with CUDA and Qt library
Lecture by Alexey Abramov (abramov _at_ physik3.gwdg.de), Georg-August University, Bernstein Center for Computational Neuroscience, III. Physikalisches Institut, Göttingen, Germany

2 Multi-GPU programming
A host system can have multiple devices. Several host threads can execute device code on the same device, but by design a host thread can execute device code on only one device at any given time. As a consequence, multiple host threads are required to execute device code on multiple devices.
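A minimal sketch of this one-host-thread-per-device pattern, using plain POSIX threads instead of the SDK and Qt helpers shown later in this lecture; the empty worker body and the cap of 16 devices are assumptions for illustration:

#include <cuda_runtime_api.h>
#include <pthread.h>

#define MAX_DEVICES 16

// each worker binds its host thread to one device and issues all work there
static void *worker(void *arg){
  int device = *(int *)arg;
  cudaSetDevice(device);  // this host thread now talks to 'device' only
  // ... cudaMalloc / kernel launches / cudaMemcpy for this device ...
  return NULL;
}

int main(void){
  int deviceCount = 0;
  cudaGetDeviceCount(&deviceCount);
  if(deviceCount > MAX_DEVICES) deviceCount = MAX_DEVICES;

  pthread_t threads[MAX_DEVICES];
  int ids[MAX_DEVICES];
  for(int i = 0; i < deviceCount; i++){
    ids[i] = i;
    pthread_create(&threads[i], NULL, worker, &ids[i]);
  }
  for(int i = 0; i < deviceCount; i++)
    pthread_join(threads[i], NULL);
  return 0;
}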

3 Multi-GPU programming
In order to issue work to a GPU, a context is established between a CPU thread and the GPU. Only one context can be active on a GPU at a time.
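With the runtime API used in these slides, the context is created implicitly by the first CUDA call a thread makes after cudaSetDevice. A minimal sketch; the cudaFree(0) call is a common idiom for forcing context creation up front and is not taken from the slides:

#include <cuda_runtime_api.h>

int main(void){
  cudaSetDevice(0);  // choose the GPU for this host thread
  cudaFree(0);       // first runtime call: establishes the context now,
                     // so later calls do not pay the creation cost
  // ... all subsequent CUDA calls from this thread go to device 0 ...
  return 0;
}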

4 Multi-GPU programming
Even though a GPU can execute calls from only one context at a time, it can belong to multiple contexts. For example, several CPU threads can establish contexts with the same GPU.
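The driver API makes this explicit: several contexts can be created on one and the same device, and each CPU thread would then make its own context current before issuing work. A hedged sketch (error checking omitted):

#include <cuda.h>

int main(void){
  CUdevice dev;
  CUcontext ctxA, ctxB;

  cuInit(0);
  cuDeviceGet(&dev, 0);
  cuCtxCreate(&ctxA, 0, dev);  // first context on device 0
  cuCtxCreate(&ctxB, 0, dev);  // second context on the same device
  // the GPU belongs to both contexts, but executes calls
  // from only one of them at a time

  cuCtxDestroy(ctxB);
  cuCtxDestroy(ctxA);
  return 0;
}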

5 Multi-GPU programming
A host thread can execute device code on only one device at any given time (driving multiple devices from a single host thread becomes possible in CUDA 4.0).
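For comparison, a hedged sketch of what this looks like once a single host thread may switch devices (CUDA 4.0 style; the trivial kernel and the buffer size of 256 are assumptions):

#include <cuda_runtime_api.h>

__global__ void incKernel(float *p){ p[threadIdx.x] += 1.0f; }

int main(void){
  float *d_buf0, *d_buf1;

  cudaSetDevice(0);                                   // switch to device 0
  cudaMalloc((void **)&d_buf0, 256 * sizeof(float));
  incKernel<<<1, 256>>>(d_buf0);                      // runs on device 0

  cudaSetDevice(1);                                   // same host thread, device 1
  cudaMalloc((void **)&d_buf1, 256 * sizeof(float));
  incKernel<<<1, 256>>>(d_buf1);                      // runs on device 1

  cudaThreadSynchronize();                            // wait for device 1's work
  return 0;
}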

6 Multi-GPU example: data structures

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <multithreading.h>
#include <cutil_inline.h>
#include <cuda_runtime_api.h>
#include "simpleMultiGPU.h"

typedef struct {
  int device;     // device ID
  int dataN;      // size of this GPU's slice of the input
  float *h_Data;  // host-side input data for this GPU
  float *h_Sum;   // partial sum computed by this GPU
} TGPUplan;

7 Alexey Abramov (BCCN, Göttingen)
// Data configuration const int MAX_GPU_COUNT = 32; const int DATA_N        = * 32; int main(int argc, char **argv){ // Solver config TGPUplan plan[MAX_GPU_COUNT]; // GPU reduction results float h_SumGPU[MAX_GPU_COUNT]; bzero(h_SumGPU, MAX_GPU_COUNT * sizeof(float)); // OS thread ID CUTThread threadID[MAX_GPU_COUNT]; // create a timer to measure runtime unsigned int hTimer; cutCreateTimer(&hTimer); Alexey Abramov (BCCN, Göttingen) /21

8 Multi-GPU example: main (cont.)

  // get the number of available CUDA-capable devices
  int deviceCount = 0;
  cudaGetDeviceCount(&deviceCount);
  if(deviceCount > MAX_GPU_COUNT)
    deviceCount = MAX_GPU_COUNT;
  printf("CUDA-capable device count: %i\n", deviceCount);

  printf("Generating input data...\n\n");
  float *h_Data = (float *)malloc(DATA_N * sizeof(float));
  for(int i = 0; i < DATA_N; i++)
    h_Data[i] = (float)rand() / (float)RAND_MAX;

  // subdivide input data across GPUs:
  // base data size for each GPU
  for(int i = 0; i < deviceCount; i++)
    plan[i].dataN = DATA_N / deviceCount;

9 Multi-GPU example: main (cont.)

  // take into account "odd" data sizes
  for(int i = 0; i < DATA_N % deviceCount; i++)
    plan[i].dataN++;

  // assign data ranges to GPUs
  int gpuBase = 0;
  for(int i = 0; i < deviceCount; i++){
    plan[i].device = i;
    plan[i].h_Data = h_Data + gpuBase;
    plan[i].h_Sum = h_SumGPU + i;
    gpuBase += plan[i].dataN;
  }

  // start timing and compute on GPU(s)
  printf("Computing with %d GPUs...\n", deviceCount);
  cutResetTimer(hTimer);
  cutStartTimer(hTimer);

10 Multi-GPU example: main (cont.)

  // launch one worker thread per GPU
  for(int i = 0; i < deviceCount; i++)
    threadID[i] = cutStartThread((CUT_THREADROUTINE)solverThread, (void *)(plan + i));
  cutWaitForThreads(threadID, deviceCount);

  // accumulate the per-GPU partial sums
  float sumGPU = 0;
  for(int i = 0; i < deviceCount; i++)
    sumGPU += h_SumGPU[i];

  cutStopTimer(hTimer);
  printf("GPU Processing time: %f (ms)\n\n", cutGetTimerValue(hTimer));

11 Multi-GPU example: main (cont.)

  // compute the same sum on the host CPU
  printf("Computing with Host CPU...\n\n");
  double sumCPU = 0;
  for(int i = 0; i < DATA_N; i++)
    sumCPU += h_Data[i];

  // compare GPU and CPU results
  printf("Comparing GPU and Host CPU results...\n");
  double diff = fabs(sumCPU - sumGPU) / fabs(sumCPU);
  printf("  GPU sum: %f\n  CPU sum: %f\n", sumGPU, sumCPU);
  printf("  Relative difference: %E \n\n", diff);
  printf((diff < 1e-5) ? "PASSED\n\n" : "FAILED\n\n");

  // cleanup and shutdown
  printf("Shutting down...\n");
  cutDeleteTimer(hTimer);
  free(h_Data);
  cudaThreadExit();
}

12 Multi-GPU example: solverThread

static CUT_THREADPROC solverThread(TGPUplan *plan){
  const int BLOCK_N  = 32;
  const int THREAD_N = 256;
  const int ACCUM_N  = BLOCK_N * THREAD_N;
  float *d_Data, *d_Sum;
  float *h_Sum;
  float sum;
  int i;

  // bind this host thread to its GPU
  cudaSetDevice(plan->device);

  // allocate device and host memory
  cudaMalloc((void **)&d_Data, plan->dataN * sizeof(float));
  cudaMalloc((void **)&d_Sum, ACCUM_N * sizeof(float));
  h_Sum = (float *)malloc(ACCUM_N * sizeof(float));

13 Multi-GPU example: solverThread (cont.)

  // copy this GPU's slice of the input data from the CPU
  cudaMemcpy(d_Data, plan->h_Data, plan->dataN * sizeof(float), cudaMemcpyHostToDevice);

  // perform GPU computations
  launch_reduceKernel(d_Sum, d_Data, plan->dataN, BLOCK_N, THREAD_N);

  // read back GPU results
  cudaMemcpy(h_Sum, d_Sum, ACCUM_N * sizeof(float), cudaMemcpyDeviceToHost);

  // finalize the reduction for this subvector
  sum = 0;
  for(i = 0; i < ACCUM_N; i++)
    sum += h_Sum[i];
  *(plan->h_Sum) = (float)sum;

  // shut down this GPU
  free(h_Sum);
  cudaFree(d_Sum);
  cudaFree(d_Data);
  CUT_THREADEND;
}

14 Multi-GPU example: reduction kernel

static __global__ void reduceKernel(float *d_Result, float *d_Input, int N);  // forward declaration

void launch_reduceKernel(float *d_Result, float *d_Input, int N, int BLOCK_N, int THREAD_N){
  reduceKernel<<<BLOCK_N, THREAD_N>>>(d_Result, d_Input, N);
  cudaThreadSynchronize();
}

static __global__ void reduceKernel(float *d_Result, float *d_Input, int N){
  const int tid     = blockIdx.x * blockDim.x + threadIdx.x;
  const int threadN = gridDim.x * blockDim.x;
  float sum = 0;

  // grid-stride loop: each thread accumulates every threadN-th element
  for(int pos = tid; pos < N; pos += threadN)
    sum += d_Input[pos];

  d_Result[tid] = sum;
}

15 QThread class for multi-GPU programming

The QThread class provides platform-independent threads.

#include <QThread>

// Qt thread that owns one GPU context
class CDeviceThread : public QThread{
private:
  TGPUplan *m_pPlan;

protected:
  void run();  // executed in the new thread after start() is called

public:
  CDeviceThread(){}
  ~CDeviceThread(){}
  void Init(TGPUplan *plan){ m_pPlan = plan; }
};

16 QThread example: main

int main(int argc, char **argv){
  CDeviceThread *pThreads[MAX_GPU_COUNT];

  // create deviceCount threads
  for(int i = 0; i < deviceCount; i++){
    CDeviceThread *pDevice = new CDeviceThread;
    pDevice->Init(plan + i);
    pThreads[i] = pDevice;
  }

  // start the threads (each invokes run() in its own thread)
  for(int i = 0; i < deviceCount; i++)
    pThreads[i]->start();

  // wait for all threads to finish
  for(int i = 0; i < deviceCount; i++)
    pThreads[i]->wait();

17 QThread example: cleanup and run()

  // cleanup
  for(int i = 0; i < deviceCount; i++)
    delete pThreads[i];
}

void CDeviceThread::run(){
  std::cout << "CDeviceThread thread ID = " << QThread::currentThreadId() << std::endl;
  std::cout << "Device = " << m_pPlan->device << std::endl;
  std::cout << "DataN = " << m_pPlan->dataN << std::endl;

  const int BLOCK_N  = 32;
  const int THREAD_N = 256;
  const int ACCUM_N  = BLOCK_N * THREAD_N;
  float *d_Data, *d_Sum;
  float *h_Sum;
  float sum;

18 QThread example: run() (cont.)

  int i;

  // bind this Qt thread to its GPU
  cudaSetDevice(m_pPlan->device);

  // allocate device and host memory
  cudaMalloc((void **)&d_Data, m_pPlan->dataN * sizeof(float));
  cudaMalloc((void **)&d_Sum, ACCUM_N * sizeof(float));
  h_Sum = (float *)malloc(ACCUM_N * sizeof(float));

  // copy input data from the CPU
  cudaMemcpy(d_Data, m_pPlan->h_Data, m_pPlan->dataN * sizeof(float), cudaMemcpyHostToDevice);

  // perform GPU computations
  launch_reduceKernel(d_Sum, d_Data, m_pPlan->dataN, BLOCK_N, THREAD_N);

19 QThread example: run() (cont.)

  // read back GPU results
  cudaMemcpy(h_Sum, d_Sum, ACCUM_N * sizeof(float), cudaMemcpyDeviceToHost);

  // finalize the GPU reduction for the current subvector
  sum = 0;
  for(i = 0; i < ACCUM_N; i++)
    sum += h_Sum[i];
  *(m_pPlan->h_Sum) = (float)sum;

  // shut down this GPU
  free(h_Sum);
  cudaFree(d_Sum);
  cudaFree(d_Data);
}

20 Bibliography

NVIDIA CUDA Programming Guide
CUDA C Best Practices Guide
Qt documentation

21 Thank you for your attention!
QUESTIONS? Göttingen,

