C++ PPL AMP
When NO branches between a micro-op and retiring to the visible architectural state – its no longer speculative foo; 190: r4 = 0
… FAST VECTOR/PARALLEL LOO P … ELSE … SEQUENTIAL LOOP …
B[0] B[1] B[2] B[3] A[0] A[1] A[2] A[3] A[0] + B[0] A[1] + B[1] A[2] + B[2] A[3] + B[3] xmm0 “addps xmm1, xmm0 “ xmm1 +
+ r1 r2 r3 add r3, r1, r2 SCALAR (1 operation) v1 v2 v3 + vector length vadd v3, v1, v2 VECTOR (N operations)
… float x = input[threadID]; float y = func(x); output[threadID] = y; … threadID Arrays of Parallel Threads - SPMD All threads run the same code (SPMD) Each thread has an ID that it uses to compute memory addresses and make control decisions
for (i = 0; i < 1000/4; i++){ movps xmm0, [ecx] movps xmm1, [eax] addps xmm0, xmm1 movps [edx], xmm0 } for (i = 0; i < 1000; i++) A[i] = B[i] + C[i]; Compiler look across loop iterations !
C++ or Klingon
B[0] B[1] B[2] B[3] A[0] A[1] A[2] A[3] A[0] + B[0] A[1] + B[1] A[2] + B[2] A[3] + B[3] xmm0 “addps xmm1, xmm0 “ xmm1 +
A(3) = ?
−ALL loads before ALL stores A (2:4) = A (1:4) + A (3:7) VR1 = LOAD(A(1:4)) VR2 = LOAD(A(3:7)) VR3 = VR1 + VR2 // A(3) = F (A(2) A(4)) STORE(A(2:4)) = VR3
Instead - load store load store... Instead - load store load store... FOR ( j = 2; j <= 257; j++) A( j ) = A( j-1 ) + A( j+1 ) A(2) = A(1) + A(3) A(3) = A(2) + A(4) // A(3) = F ( A(1)A(2)A(3)A(4) ) A(4) = A(3) + A(5) A(5) = A(4) + A(6) … …
A ( a1 * I + c1 ) ?= A ( a2 * I’ + c2)
Complex C++ Not just arrays!
void foo() { #pragma loop(hint_parallel(4)) for (int i=0; i<1000; i++) A[i] = B[i] + C[i]; } void foo() { CompilerParForLib(0, 1000, 4, &foo$par1, A, B, C); } foo$par1(int T1, int T2, int *A, int *B, int *C) { for (int i=T1; i<T2; i+=4) movps xmm0, [ecx] movps xmm1, [eax] addps xmm0, xmm1 movps [edx], xmm0 } Parallelism + vector foo$par1(0, 249, A, B, C);core 1 instr foo$par1(250, 499, A, B, C);core 2 instr foo$par1(500, 749, A, B, C);core 3 instr foo$par1(750, 999, A, B, C);core 4 instr Runtime Vectorized + and parallel
dc[k] = dc[k-1] + tpdd[k-1]; if ((sc = mc[k-1] + tpmd[k-1]) > dc[k]) dc[k] = sc; if (dc[k] < -INFTY) dc[k] = -INFTY; if (k < M) { ic[k] = mpp[k] + tpmi[k]; if ((sc = ip[k] + tpii[k]) > ic[k]) ic[k] = sc; ic[k] += is[k]; if (ic[k] < -INFTY) ic[k] = -INFTY; } for (k = 1; k <= M; k++) { dc[k] = dc[k-1] + tpdd[k-1]; if ((sc = mc[k-1] + tpmd[k-1]) > dc[k]) dc[k] = sc; if (dc[k] < -INFTY) dc[k] = -INFTY; for (k = 1; k <= M; k++) { if (k < M) { ic[k] = mpp[k] + tpmi[k]; if ((sc = ip[k] + tpii[k]) > ic[k]) ic[k] = sc; ic[k] += is[k]; if (ic[k] < -INFTY) ic[k] = -INFTY; } for (k = 1; k < M; k++) { ic[k] = mpp[k] + tpmi[k]; if ((sc = ip[k] + tpii[k]) > ic[k]) ic[k] = sc; ic[k] += is[k]; if (ic[k] < -INFTY) ic[k] = -INFTY; }
“pmax xmm1, xmm0 “ pmax A[0] > B[0] ? A[0] : B[0] A[1]> B[1] ? A[1] : B[1] A[2] > B[2] ? A[2] : B[2] A[3] > B[3] ? A[3] : B:[3] B[0] B[1] B[2] B[3] A[0] A[1] A[2] A[3] xmm1 xmm0
if ( __isa_availablility > SSE2 && NO_ALIASISIN ) { Vector loop Scalar loop Vector loop
15x faster
for (i=0; i<n; i++) { a[i] = a[i] + b[i]; a[i] = sin(a[i]); } for(i=0; i<n; i=i+VL) { a(i : i+VL-1) = a(i : i+VL-1) + b(i : i+VL-1); a(i : i+VL-1) = _svml_Sin(a(i : i+VL-1)); } NEW Run-Time Library HW SIMD instruction
Foo (float *a, float *b, float *c) { #pragma loop(hint_parallel(N)) for (auto i=0; i<N; i++) { *c++ = (*a++) * bar(b++); }; Pointers and procedure calls with escaped pointers prevent analysis for auto- parallelization Use simple directives Pragma
for (int l = top; l < bottom; l++){ for (int m = left; m < right; m++ ){ int y = *(blurredImage + (l*dimX) +m); ySourceRed += (unsigned int) (y & 0x00FF0000) >> 16; ySourceGreen += (unsigned int) (y & 0x0000ff00) >> 8; ySourceBlue += (unsigned int) (y & 0x000000FF); averageCount++; }
{ if (iter >= maxIter) *a=0xFF000000; //black else //a gradient from red to yellow { unsigned short redYellowComponent = ~(iter * 32000/maxIter) ; unsigned int xx = 0x00FFF000; //0 Alpha + RED xx = xx|redYellowComponent; xx <<= 8; *a = xx; }
1.#include int main() 5.{ 6. int v[11] = {'G', 'd', 'k', 'k', 'n', 31, 'v', 'n', 'q', 'k', 'c'}; for (int idx = 0; idx < 11; idx++) 9. { 10. v[idx] += 1; 11. } 12. for(unsigned int i = 0; i < 11; i++) 13. std::cout ( v[i]); 14. }
1.#include 2.#include 3.using namespace concurrency; 4.int main() 5.{ 6. int v[11] = {'G', 'd', 'k', 'k', 'n', 31, 'v', 'n', 'q', 'k', 'c'}; for (int idx = 0; idx < 11; idx++) 9. { 10. v[idx] += 1; 11. } 12. for(unsigned int i = 0; i < 11; i++) 13. std::cout ( v[i]); 14. } amp.h: header for C++ AMP library concurrency: namespace for library amp.h: header for C++ AMP library concurrency: namespace for library
1.#include 2.#include 3.using namespace concurrency; 4.int main() 5.{ 6. int v[11] = {'G', 'd', 'k', 'k', 'n', 31, 'v', 'n', 'q', 'k', 'c'}; 7. array_view av(11, v); 8. for (int idx = 0; idx < 11; idx++) 9. { 10. v[idx] += 1; 11. } 12. for(unsigned int i = 0; i < 11; i++) 13. std::cout ( v[i]); 14. } array_view: wraps the data to operate on the accelerator. array_view variables captured and associated data copied to accelerator (on demand)
1.#include 2.#include 3.using namespace concurrency; 4.int main() 5.{ 6. int v[11] = {'G', 'd', 'k', 'k', 'n', 31, 'v', 'n', 'q', 'k', 'c'}; 7. array_view av(11, v); 8. for (int idx = 0; idx < 11; idx++) 9. { 10. av[idx] += 1; 11. } 12. for(unsigned int i = 0; i < 11; i++) 13. std::cout ( av[i]); 14. } array_view: wraps the data to operate on the accelerator. array_view variables captured and associated data copied to accelerator (on demand)
C++ AMP “Hello World” File -> New -> Project Empty Project Project -> Add New Item Empty C++ file 1.#include 2.#include 3.using namespace concurrency; 4.int main() 5.{ 6. int v[11] = {'G', 'd', 'k', 'k', 'n', 31, 'v', 'n', 'q', 'k', 'c'}; 7. array_view av(11, v); 8. parallel_for_each(av.extent, [=](index idx) restrict(amp) 9. { 10. av[idx] += 1; 11. }); 12. for(unsigned int i = 0; i < 11; i++) 13. std::cout (av[i]); 14. } parallel_for_each: execute the lambda on the accelerator once per thread extent: the parallel loop bounds or “shape” index: the thread ID that is running the lambda, used to index into data
C++ AMP “Hello World” File -> New -> Project Empty Project Project -> Add New Item Empty C++ file 1.#include 2.#include 3.using namespace concurrency; 4.int main() 5.{ 6. int v[11] = {'G', 'd', 'k', 'k', 'n', 31, 'v', 'n', 'q', 'k', 'c'}; 7. array_view av(11, v); 8. parallel_for_each(av.extent, [=](index idx) restrict(amp) 9. { 10. av[idx] += 1; 11. }); 12. for(unsigned int i = 0; i < 11; i++) 13. std::cout (av[i]); 14. } restrict(amp): tells the compiler to check that code conforms to C++ subset, and tells compiler to target GPU
C++ AMP “Hello World” File -> New -> Project Empty Project Project -> Add New Item Empty C++ file 1.#include 2.#include 3.using namespace concurrency; 4.int main() 5.{ 6. int v[11] = {'G', 'd', 'k', 'k', 'n', 31, 'v', 'n', 'q', 'k', 'c'}; 7. array_view av(11, v); 8. parallel_for_each(av.extent, [=](index idx) restrict(amp) 9. { 10. av[idx] += 1; 11. }); 12. for(unsigned int i = 0; i < 11; i++) 13. std::cout (av[i]); 14. } array_view: automatically copied to accelerator if required array_view: automatically copied back to host when and if required
32nm 22nm 22nm 14nm 10nm 256 bit AVX(2)256 bit AVX 128 bit SSE You are here (3D tri-state transistors)