Presentation is loading. Please wait.

Presentation is loading. Please wait.

GMAC Global Memory for Accelerators Isaac Gelado, John E. Stone, Javier Cabezas, Nacho Navarro and Wen-mei W. Hwu GTC 2010.

Similar presentations


Presentation on theme: "GMAC Global Memory for Accelerators Isaac Gelado, John E. Stone, Javier Cabezas, Nacho Navarro and Wen-mei W. Hwu GTC 2010."— Presentation transcript:

1 GMAC Global Memory for Accelerators Isaac Gelado, John E. Stone, Javier Cabezas, Nacho Navarro and Wen-mei W. Hwu GTC 2010

2 GMAC in a nutshell GMAC: Unified Virtual Address Space for CUDA – Simplifies the CPU code – Exploits advanced CUDA features for free Vector addition example – Really simple kernel code – But, what about the CPU code? __global__ void vector(float *c, float *a, float *b, size_t size) { int idx = threadIdx.x + blockIdx.x * blockDim.x; if(idx < size) c[idx] = a[idx] + b[idx]; } 9/23/102 GTC 2010

3 CPU CUDA code (I) Read from disk, transfer to GPU and compute int main(int argc, char *argv[]) { float *h_a, *h_b, *h_c, *d_a, *d_b, *d_c; size_t size = LENGTH * sizeof(float); assert((h_a = malloc(size) != NULL); assert((h_b = malloc(size) != NULL); assert((h_c = malloc(size) != NULL); assert(cudaMalloc((void **)&d_a, size) == cudaSuccess)); assert(cudaMalloc((void **)&d_b, size) == cudaSuccess)); assert(cudaMalloc((void **)&d_c, size) == cudaSuccess)); read_file(argv[A], h_a); read_file(argv[B], h_b); assert(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice) == cudaSuccess); assert(cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice) == cudaSuccess); 9/23/103 GTC 2010

4 CPU CUDA code (and II) Read from disk, transfer to GPU and compute Db(BLOCK_SIZE); Dg(LENGTH / BLOCK_SIZE); if(LENGTH % BLOCK_SIZE) Dg.x++; vector >>(d_c, d_a, d_b, LENGTH); assert(cudaThreadSynchronize() == cudaSuccess); assert(cudaMemcpy(d_c, h_c, LENGTH * sizeof(float), cudaMemcpyDeviceToHost) == cudaSuccess); save_file(argv[C], h_c); free(h_a); cudaFree(d_a); free(h_b); cudaFree(d_b); free(h_c); cudaFree(d_c); return 0; } 9/23/104 GTC 2010

5 CPU GMAC code 9/23/105 GTC 2010 int main(int argc, char *argv[]) { float *a, *b, *c; size_t size = LENGTH * sizeof(float); assert(gmacMalloc((void **)&a, size) == gmacSuccess)); assert(gmacMalloc((void **)&b, size) == gmacSuccess)); assert(gmacMalloc((void **)&c, size) == gmacSuccess)); read_file(argv[A], a); read_file(argv[B], b); Db(BLOCK_SIZE); Dg(LENGTH / BLOCK_SIZE); if(LENGTH % BLOCK_SIZE) Dg.x++; vector >>(c, a, b, LENGTH); assert(gmacThreadSynchronize() == gmacSuccess); save_file(argv[C], c); gmacFree(a); gmacFree(b); gmacFree(c); return 0; } There is no memory copy

6 Getting GMAC GMAC is at http://adsm.googlecode.com/http://adsm.googlecode.com/ Debian / Ubuntu binary and development.deb files UNIX (also MacOS X) source code package Experimental versions from mercurial repository 9/23/106 GTC 2010

7 Outline Introduction GMAC Memory Model – Asymmetric Memory – Global Memory GMAC Execution Model – Multi-threading – Inter-thread communication Conclusions 9/23/107 GTC 2010

8 GMAC Memory Model Unified CPU / GPU virtual address space Asymmetric address space accessibility CPU Memory GPU Shared Data CPU Data 9/23/108 GTC 2010

9 GMAC Consistency Model Implicit acquire / release primitives at accelerator call / return boundaries 9/23/10 GTC 2010 9 CPU ACC CPU ACC

10 GMAC Memory API Allocate shared memory gmacError_t gmacMalloc(void **ptr, size_t size) – Allocated memory address (returned by reference) – Gets the size of the data to be allocated – Error code, gmacSuccess if no error Example usage #include int main(int argc, char *argv[]) { float *foo = NULL; gmacError_t error; if((error = gmacMalloc((void **)&foo, FOO_SIZE)) != gmacSuccess) FATAL(“Error allocating memory %s”, gmacErrorString(error));... } 9/23/1010 GTC 2010

11 GMAC Memory API Release shared memory gmacError_t gmacFree(void *ptr) – Memory address to be released – Error code, gmacSuccess if no error Example usage #include int main(int argc, char *argv[]) { float *foo = NULL; gmacError_t error; if((error = gmacMalloc((void **)&foo, FOO_SIZE)) != gmacSuccess) FATAL(“Error allocating memory %s”, gmacErrorString(error));... gmacFree(foo); } 9/23/1011 GTC 2010

12 GMAC Unified Address Space Use fixed-size segments to map accelerator memory Implement and export Accelerator Virtual Memory Accelerator System Memory CPU Accelerator Memory 0x00100000 9/23/1012 GTC 2010

13 GMAC Memory API Translate shared memory (multi-GPU) void *gmacPtr(void *ptr) template T *gmacPtr(T *ptr) – Receives CPU memory address – Returns GPU memory address Example usage #include int main(int argc, char *argv[]) {... kernel >>(gmacPtr(buffer), size);... } 9/23/1013 GTC 2010

14 GMAC Example Code (I) int fdtd(FILE *fpMat, FILE *fpMed, int N) { /* Read and create data structures */ MaterialList materials if(readMaterials(fpMat, materials) == 0) return -1; Media media; if(readMedia(fpMed, media) == 0) return -1; Field field; if(createField(media.dim, field) == 0) return -1; for(int n = 0; n < N; n++) {... updateElectic >>(materials, media, field);... n++; updateMagnetic >>(materials, media, field);... } 9/23/10 GTC 2010 14

15 GMAC Example Code (II) typedef struct { float Ke[3][3], km[3][3]; } Material; typedef struct { size_t n; Material *data; } MaterialList; /* Read materials from disk */ size_t readMaterials(FILE *fp, MaterialList &list) { uint16_t n = 0; fread(&n, sizeof(n), 1, fp); ret = gmacMalloc((void **)&list.data, n * sizeof(Material)); if(ret != gmacSuccess) return 0; fread(list.data, sizeof(Material), n, fp); return n; } /* Read media description from file */ typedef struct { dim3 dim; uint16_t *data } Media; void readMedia(FILE *fp, Media &media); /* Allocate a electromagnetic field */ typedef struct { dim3 dim; float3 *e; float3 *h; float3 *p; float3 *m } Field; void allocateField(Field &f, dim3 dim); 9/23/10 GTC 2010 15

16 GMAC I/O Handling Functions overridden (interposition) by GMAC: – Memory: memset(), memcpy() – I/O: fread(), fwrite(), read(), write() – MPI: MPI_Send(), MPI_Receive Get advanced CUDA features for free – Asynchronous data transfers – Pinned memory 9/23/10 GTC 2010 16 Asynchronous Copies to device memory Pinned memory for I/O transfers

17 GMAC Example Code (III) __global__ void updateElectric(Materials mats, Media media, Field f) { int Idx = threadIdx.x + blockDim.x * blockIdx.x; int Idy = threadIdx.y + blockDim.y * blockIdx.y; for(int Idz = 0; Idz < f.dim.z; Idz++) { int pos = Idx + Idy * f.dim.x + Idz * f.dim.x * f.dim.y; float3 E = f.e[pos]; Material m = mats[media[pos]]; float3 P; P.x = E.x * m.ke[0][0] + E.y * m.ke[0][1] + E.z * m.ke[0][2]; P.y = E.x * m.ke[1][0] + E.y * m.ke[1][1] + E.z * m.ke[1][2]; P.z = E.x * m.ke[2][0] + E.y * m.ke[2][1] + E.z * m.ke[2][2]; f.p[pos] = P; } 9/23/10 GTC 2010 17

18 Outline Introduction GMAC Memory Model – Asymmetric Memory – Global Memory GMAC Execution Model – Multi-threading – Inter-thread communication Conclusions 9/23/1018 GTC 2010

19 GMAC Global Memory For multi-GPU systems Data accessible by all accelerators, but owned by the CPU CPU Memory GPU 9/23/1019 GTC 2010

20 GMAC Global memory API Allocate global shared Memory gmacError_t gmacGlobalMalloc(void **ptr, size_t size) – Allocated memory address (returned by reference) – Gets the size of the data to be allocated – Error code, gmacSuccess if no error Example usage #include int main(int argc, char *argv[]) { float *foo = NULL; gmacError_t error; if((error = gmacGlobalMalloc((void **)&foo, FOO_SIZE)) != gmacSuccess) FATAL(“Error allocating memory %s”, gmacErrorString(error));... } 9/23/1020 GTC 2010

21 GMAC Example Code (I) typedef struct { float Ke[3][3], km[3][3]; } Material; typedef struct { size_t n; Material *data; } MaterialList; /* Read materials from disk */ size_t readMaterials(FILE *fp, MaterialList &list) { uint16_t n = 0; fread(&n, sizeof(n), 1, fp); ret = gmacGlobalMalloc((void **)&list.data, n * sizeof(Material)); if(ret != gmacSuccess) return 0; fread(list.data, sizeof(Material), n, fp); return n; } /* Read media description from file */ typedef struct { dim3 dim; uint16_t *data } Media; void readMedia(FILE *fp, Media &media); /* Allocate a electromagnetic field */ typedef struct { dim3 dim; float3 *e; float3 *h; float3 *p; float3 *m } Field; void allocateField(Field &f, dim3 dim); 9/23/10 GTC 2010 21

22 Outline Introduction GMAC Memory Model – Asymmetric Memory – Global Memory GMAC Execution Model – Multi-threading – Inter-thread communication Conclusions 9/23/1022 GTC 2010

23 GMAC and Multi-threading In the past, one host thread had one CPU In GMAC, each host thread has: – One CPU – One GPU A GMAC thread is running at GPU or at the CPU, but not in both at the same time Create threads using what you already know – pthread_create(...) 9/23/1023 GTC 2010

24 GMAC and Multi-threading Virtual memory accessibility: – Complete address space in CPU mode – Partial address space in GPU mode CPU GPU Memory 9/23/1024 GTC 2010

25 Getting Full-duplex PCIe Use multi-threading to fully utilize the PCIe – One CPU thread launch kernels – One CPU thread writes to shared memory – Once CPU thread reads from shared memory 9/23/10 GTC 2010 25 GPU CPU GPU Memory System Memory PCIe

26 Outline Introduction GMAC Memory Model – Asymmetric Memory – Global Memory GMAC Execution Model – Multi-threading – Inter-thread communication Conclusions 9/23/1026 GTC 2010

27 GPU Handoff and Copying GPU handoff: – Send the thread’s virtual GPU to another thread – Do not move data, move computation API Calls – Virtual GPU sending gmacError_t gmacSend(thread_id dest) – Virtual GPU receiving gmacError_t gmacReceive() – Virtual GPU copying gmacError_t gmacCopy(thread_id dest) 9/23/1027 GTC 2010

28 GPU virtual GPUs use Case Exploit data locality in the CPU and GPU Example: MPEG-4 Encoder: – Each GMAC thread executes one stage – Then, moves to the GPU where the input data is 9/23/10 GTC 2010 28 GPU DCT and Quantization Motion Compensation Dequantization and IDCT Motion Estimation

29 Outline Introduction GMAC Memory Model – Asymmetric Memory – Global Memory GMAC Execution Model – Multi-threading – Inter-thread communication Conclusions 9/23/1029 GTC 2010

30 GMAC Performance 9/23/10 GTC 2010 30

31 GMAC on Actual Applications (I) Reverse Time Migration (BSC / Repsol) – Six months – one programmer – Currently in use by Repsol Single-GPU using CUDA Run-time – Can live with it: double-allocations, memory consistency – Nightmare: overlap GPU computation and data transfers (CUDA streams and double-buffering with pinned memory) Multi-GPU using CUDA Run-time – Can live with it: lack of IDE for Linux – Nightmare: everything else 9/23/10 GTC 2010 31

32 GMAC on Actual Applications (II) Multi-GPU using GMAC: – Double-buffering and pinned memory for free Disk transfers GPU to GPU (inter-domain) communication MPI communication – Clean threading model One task per CPU thread Well-know synchronization primitives It took shorter than the single-GPU version 9/23/10 GTC 2010 32

33 Conclusions Single virtual address space for CPUs and GPUs Use CUDA advanced features – Automatic overlap data communication and computation – Get access to any GPU from any CPU thread Get more performance from your application more easily Go: http://adsm.googlecode.comhttp://adsm.googlecode.com 9/23/1033 GTC 2010

34 Future Features OpenCL and Windows 7 support coming soon Data-dependence tracking: – Avoid transferring data to the GPU when not used by kernels – Avoid transferring data to the CPU when not modified kernels Global shared memory partitioning between multiple GPUs 9/23/10 GTC 2010 34

35 GMAC Global Memory for Accelerators http://adsm.googlecode.com

36 Backup Slides

37 GMAC Advanced Free Features Get advanced CUDA features for free – Asynchronous data transfers – Pinned memory 9/23/1037 GTC 2010 Asynchronous Copies to device memory Pinned memory for I/O transfers

38 GMAC Unified Address Space When allocating memory 1.Allocate accelerator memory 2.Allocate CPU memory at the same virtual address CPU Accelerator System Memory Accelerator Memory 9/23/1038 GTC 2010

39 Lazy Update Data Transfers Avoid unnecessary data copies Lazy-update: – Call: transfer modified data – Return: transfer when needed 9/23/10 GTC 2010 39 Accelerator System Memory Accelerator Memory CPU

40 Rolling Update Data Transfers Overlap CPU execution and data transfers Minimal transfer on-demand Rolling-update: – Memory-block size granularity 9/23/10 GTC 2010 40 Accelerator System Memory Accelerator Memory CPU

41 GMAC Global Memory for Accelerators http://adsm.googlecode.com


Download ppt "GMAC Global Memory for Accelerators Isaac Gelado, John E. Stone, Javier Cabezas, Nacho Navarro and Wen-mei W. Hwu GTC 2010."

Similar presentations


Ads by Google