Presentation is loading. Please wait.

Presentation is loading. Please wait.

EECS Electrical Engineering and Computer Sciences B ERKELEY P AR L AB.py OS/HW f() h() Specializer.c PLL Interp Productivity app.so cc/ld $ $ SEJITS logical.

Similar presentations


Presentation on theme: "EECS Electrical Engineering and Computer Sciences B ERKELEY P AR L AB.py OS/HW f() h() Specializer.c PLL Interp Productivity app.so cc/ld $ $ SEJITS logical."— Presentation transcript:

1 EECS Electrical Engineering and Computer Sciences B ERKELEY P AR L AB.py OS/HW f() h() Specializer.c PLL Interp Productivity app.so cc/ld $ $ SEJITS logical flow Selective Embedded JIT ASP.py Specialization Human-expert-authored templates & strategy engine

2 EECS Electrical Engineering and Computer Sciences B ERKELEY P AR L AB.py OS/HW f() h() Specializer.c PLL Interp Productivity app.so cc/ld $ $ SEJITS logical flow ASP.py Asp DB

3 EECS Electrical Engineering and Computer Sciences B ERKELEY P AR L AB Stencil example: input — Sum the distance-1 neighbors in a 2D stencil 3 import stencil_grid import numpy class ExampleKernel(object): def kernel(self, in_grid, out_grid): for x in out_grid.interior_points(): for y in in_grid.neighbors(x, 1): out_grid[x] = out_grid[x] + in_grid[y] in_grid = StencilGrid([5,5]) in_grid.data = numpy.ones([5,5]) out_grid = StencilGrid([5,5]) ExampleKernel().kernel(in_grid, out_grid) from stencil_kernel import *

4 EECS Electrical Engineering and Computer Sciences B ERKELEY P AR L AB Neighbor iterator expanded Kernel function lowered from Python to C++ Specialized to dimensions of input data Stencil kernel example: unoptimized sequential C++ #include <Python.h> #include "arrayobject.h" namespace private_namespace_000 { void kernel(PyObject *in_grid, PyObject *out_grid) { for (int i = 1; i < 4; i++) { for (int j = 1; j < 4; j++) { #define _out_grid_array_macro(_d0,_d1) ((_d1) + 5 * (_d0)) #define _in_grid_array_macro(_d0,_d1) ((_d1) + 5 * (_d0)) double* _my_out_grid = (double *) PyArray_DATA(out_grid); double* _my_in_grid = (double *) PyArray_DATA(in_grid); int idxout = _out_grid_array_macro(i,j); int idxin = _in_grid_array_macro(i+1,j); _my_out_grid[idxout] = (_my_out_grid[idxout] + _my_in_grid[idxin]); idxin = _in_grid_array_macro(i-1,j); _my_out_grid[idxout] = (_my_out_grid[idxout] + _my_in_grid[idxin]); idxin = _in_grid_array_macro(i,j+1); _my_out_grid[idxout] = (_my_out_grid[idxout] + _my_in_grid[idxin]); idxin = _in_grid_array_macro(i,j-1); _my_out_grid[idxout] = (_my_out_grid[idxout] + _my_in_grid[idxin]); }

5 EECS Electrical Engineering and Computer Sciences B ERKELEY P AR L AB C++ with loop unrolling by 4 void kernel_unroll_4(PyObject *in_grid, PyObject *out_grid) { for (int _EwzZcfet = 1; (_EwzZcfet <= 256); _EwzZcfet = (_EwzZcfet + 1)) cilk_for (int _HmEYIyVk = 1; (_HmEYIyVk <= 256); _HmEYIyVk += 1) for (int _QXalOpkr = 1; (_QXalOpkr <= 256); _QXalOpkr = (_QXalOpkr + 4)) { #define _out_grid_array_macro(_d0,_d1,_d2) (((_d2) * (_d1)) * (_d0)) #define _in_grid_array_macro(_d0,_d1,_d2) (((_d2) * (_d1)) * (_d0)) double* _my_out_grid = (double *) PyArray_DATA(out_grid); double* _my_in_grid = (double *) PyArray_DATA(in_grid);; int x; x = _in_grid_array_macro(_EwzZcfet, _HmEYIyVk, _QXalOpkr); { int y; y = _in_grid_array_macro((_EwzZcfet + 1), (_HmEYIyVk + 0), (_QXalOpkr + 0)); _my_out_grid[x] = (_my_out_grid[x] + ((1.0 / 6.0) * _my_in_grid[y])); y = _in_grid_array_macro((_EwzZcfet + -1), (_HmEYIyVk + 0), (_QXalOpkr + 0)); _my_out_grid[x] = (_my_out_grid[x] + ((1.0 / 6.0) * _my_in_grid[y])); y = _in_grid_array_macro((_EwzZcfet + 0), (_HmEYIyVk + 1), (_QXalOpkr + 0)); _my_out_grid[x] = (_my_out_grid[x] + ((1.0 / 6.0) * _my_in_grid[y])); y = _in_grid_array_macro((_EwzZcfet + 0), (_HmEYIyVk + -1), (_QXalOpkr + 0)); _my_out_grid[x] = (_my_out_grid[x] + ((1.0 / 6.0) * _my_in_grid[y])); y = _in_grid_array_macro((_EwzZcfet + 0), (_HmEYIyVk + 0), (_QXalOpkr + 1)); _my_out_grid[x] = (_my_out_grid[x] + ((1.0 / 6.0) * _my_in_grid[y])); y = _in_grid_array_macro((_EwzZcfet + 0), (_HmEYIyVk + 0), (_QXalOpkr + -1)); _my_out_grid[x] = (_my_out_grid[x] + ((1.0 / 6.0) * _my_in_grid[y])); } x = _in_grid_array_macro(_EwzZcfet, _HmEYIyVk, (_QXalOpkr+1)); { int y; y = _in_grid_array_macro((_EwzZcfet + 1), (_HmEYIyVk + 0), ((_QXalOpkr+1) + 0)); _my_out_grid[x] = (_my_out_grid[x] + ((1.0 / 6.0) * _my_in_grid[y])); y = _in_grid_array_macro((_EwzZcfet + -1), (_HmEYIyVk + 0), ((_QXalOpkr+1) + 0)); _my_out_grid[x] = (_my_out_grid[x] + ((1.0 / 6.0) * _my_in_grid[y])); y = 
_in_grid_array_macro((_EwzZcfet + 0), (_HmEYIyVk + 1), ((_QXalOpkr+1) + 0)); _my_out_grid[x] = (_my_out_grid[x] + ((1.0 / 6.0) * _my_in_grid[y])); y = _in_grid_array_macro((_EwzZcfet + 0), (_HmEYIyVk + -1), ((_QXalOpkr+1) + 0)); _my_out_grid[x] = (_my_out_grid[x] + ((1.0 / 6.0) * _my_in_grid[y])); y = _in_grid_array_macro((_EwzZcfet + 0), (_HmEYIyVk + 0), ((_QXalOpkr+1) + 1)); _my_out_grid[x] = (_my_out_grid[x] + ((1.0 / 6.0) * _my_in_grid[y])); y = _in_grid_array_macro((_EwzZcfet + 0), (_HmEYIyVk + 0), ((_QXalOpkr+1) + -1)); _my_out_grid[x] = (_my_out_grid[x] + ((1.0 / 6.0) * _my_in_grid[y])); } x = _in_grid_array_macro(_EwzZcfet, _HmEYIyVk, (_QXalOpkr+2)); { int y; y = _in_grid_array_macro((_EwzZcfet + 1), (_HmEYIyVk + 0), ((_QXalOpkr+2) + 0)); _my_out_grid[x] = (_my_out_grid[x] + ((1.0 / 6.0) * _my_in_grid[y])); y = _in_grid_array_macro((_EwzZcfet + -1), (_HmEYIyVk + 0), ((_QXalOpkr+2) + 0)); _my_out_grid[x] = (_my_out_grid[x] + ((1.0 / 6.0) * _my_in_grid[y])); y = _in_grid_array_macro((_EwzZcfet + 0), (_HmEYIyVk + 1), ((_QXalOpkr+2) + 0)); _my_out_grid[x] = (_my_out_grid[x] + ((1.0 / 6.0) * _my_in_grid[y])); y = _in_grid_array_macro((_EwzZcfet + 0), (_HmEYIyVk + -1), ((_QXalOpkr+2) + 0)); _my_out_grid[x] = (_my_out_grid[x] + ((1.0 / 6.0) * _my_in_grid[y])); y = _in_grid_array_macro((_EwzZcfet + 0), (_HmEYIyVk + 0), ((_QXalOpkr+2) + 1)); _my_out_grid[x] = (_my_out_grid[x] + ((1.0 / 6.0) * _my_in_grid[y])); y = _in_grid_array_macro((_EwzZcfet + 0), (_HmEYIyVk + 0), ((_QXalOpkr+2) + -1)); _my_out_grid[x] = (_my_out_grid[x] + ((1.0 / 6.0) * _my_in_grid[y])); } x = _in_grid_array_macro(_EwzZcfet, _HmEYIyVk, (_QXalOpkr+3)); { int y; y = _in_grid_array_macro((_EwzZcfet + 1), (_HmEYIyVk + 0), ((_QXalOpkr+3) + 0)); _my_out_grid[x] = (_my_out_grid[x] + ((1.0 / 6.0) * _my_in_grid[y])); y = _in_grid_array_macro((_EwzZcfet + -1), (_HmEYIyVk + 0), ((_QXalOpkr+3) + 0)); _my_out_grid[x] = (_my_out_grid[x] + ((1.0 / 6.0) * _my_in_grid[y])); y = _in_grid_array_macro((_EwzZcfet + 
0), (_HmEYIyVk + 1), ((_QXalOpkr+3) + 0)); _my_out_grid[x] = (_my_out_grid[x] + ((1.0 / 6.0) * _my_in_grid[y])); y = _in_grid_array_macro((_EwzZcfet + 0), (_HmEYIyVk + -1), ((_QXalOpkr+3) + 0)); _my_out_grid[x] = (_my_out_grid[x] + ((1.0 / 6.0) * _my_in_grid[y])); y = _in_grid_array_macro((_EwzZcfet + 0), (_HmEYIyVk + 0), ((_QXalOpkr+3) + 1)); _my_out_grid[x] = (_my_out_grid[x] + ((1.0 / 6.0) * _my_in_grid[y])); y = _in_grid_array_macro((_EwzZcfet + 0), (_HmEYIyVk + 0), ((_QXalOpkr+3) + -1)); _my_out_grid[x] = (_my_out_grid[x] + ((1.0 / 6.0) * _my_in_grid[y])); } 5

6 EECS Electrical Engineering and Computer Sciences B ERKELEY P AR L AB Fully optimized, parallel C++ int c2; for (c2=chunkOffset_2;c2<=255;c2+=256) { int c1; for (c1=chunkOffset_1;c1<=255;c1+=8) { int c0; for (c0=chunkOffset_0;c0<=255;c0+=256) { int b2; for (b2=c2 + threadOffset_2;b2<=c2 + 255;b2+=256) { int b1; for (b1=c1 + threadOffset_1;b1<=c1 + 7;b1+=8) { int b0; for (b0=c0 + threadOffset_0;b0<=c0 + 255;b0+=256) { int k; for (k=b2 + 1;k<=b2 + 256;k+=2) { int j; for (j=b1 + 1;j<=b1 + 8;j+=2) { int i; for (i=b0 + 1;i<=b0 + 256;i+=8) { Anext[_Anext_Index(i - 1,j - 1,k - 1)] = fac * A0[_A0_Index(i - 1,j - 1,k - 1)] + fac2 * (A0[_A0_Index(i,j - 1,k - 1)] + A0[_A0_Index(i - 2,j - 1,k - 1)] + A0[_A0_Index(i - 1,j,k - 1)] + A0[_A0_Index(i - 1,j - 2,k - 1)] + A0[_A0_Index(i - 1,j - 1,k)] + A0[_A0_Index(i - 1,j - 1,k - 2)]); Anext[_Anext_Index(i,j - 1,k - 1)] = fac * A0[_A0_Index(i,j - 1,k - 1)] + fac2 * (A0[_A0_Index(i + 1,j - 1,k - 1)] + A0[_A0_Index(i - 1,j - 1,k - 1)] + A0[_A0_Index(i,j,k - 1)] + A0[_A0_Index(i,j - 2,k - 1)] + A0[_A0_Index(i,j - 1,k)] + A0[_A0_Index(i,j - 1,k - 2)]); Anext[_Anext_Index(i + 1,j - 1,k - 1)] = fac * A0[_A0_Index(i + 1,j - 1,k - 1)] + fac2 * (A0[_A0_Index(i + 2,j - 1,k - 1)] + A0[_A0_Index(i,j - 1,k - 1)] + A0[_A0_Index(i + 1,j,k - 1)] + A0[_A0_Index(i + 1,j - 2,k - 1)] + A0[_A0_Index(i + 1,j - 1,k)] + A0[_A0_Index(i + 1,j - 1,k - 2)]); Anext[_Anext_Index(i + 2,j - 1,k - 1)] = fac * A0[_A0_Index(i + 2,j - 1,k - 1)] + fac2 * (A0[_A0_Index(i + 3,j - 1,k - 1)] + A0[_A0_Index(i + 1,j - 1,k - 1)] + A0[_A0_Index(i + 2,j,k - 1)] + A0[_A0_Index(i + 2,j - 2,k - 1)] + A0[_A0_Index(i + 2,j - 1,k)] + A0[_A0_Index(i + 2,j - 1,k - 2)]); ~70 similar lines } 6

7 EECS Electrical Engineering and Computer Sciences B ERKELEY P AR L AB Phase 1: Python AST => Domain AST def kernel(self, in_grid, out_grid): for x in out_grid.interior_points(): for y in in_grid.neighbors(x, 1): out_grid[x] = out_grid[x] + in_grid[y] def visit_for(self, node): if (node.isFunctionCall("interior_points")): grid = self.visit(node.iter.func.value).id body = map(self.visit, node.body) target = self.visit(node.target) newnode = StencilKernel.StencilInterior(grid, body, target) return newnode … Visit each AST node of Python call to specializer (available via Python’s introspection) Convert calls to interior_points and neighbors to domain-specific, target-independent AST nodes for later processing Note use of Python in manipulating/traversing AST

8 EECS Electrical Engineering and Computer Sciences B ERKELEY P AR L AB AST phase 1 8 Call interior_points For x iter Call neighbors(x,1) For y iter := out_grid[x] + in_grid[y] StencilInterior out_grid, x StencilNeighbor in_grid, y, 1 := out_grid[x] + in_grid[y] Generic=>Domain-Specific for x in out_grid.interior_points for y in in_grid.neighbors out_grid[x] = out_grid[x] + in_grid[y]

9 EECS Electrical Engineering and Computer Sciences B ERKELEY P AR L AB out_grid[x] = out_grid[x] + in_grid[y] AST phase 2: Optimize DAST 9 For ( i over GridSize) := + Initialize variables … …… := +… …… +… …… +… …… Unrolled Neighbor Loop For ( j over GridSize) StencilInterior out_grid, x StencilNeighbor in_grid, y, 1 := out_grid[x] + in_grid[y] LoopUnroller().unroll(node, factor)

10 EECS Electrical Engineering and Computer Sciences B ERKELEY P AR L AB AST phase 3 & 4: bind platform, generate/compile/cache code — Transform to platform-specific AST (PAST) — optimizations can be reused across "similar" platforms (e.g., SIMDize generically, let compiler map onto ISA) 10 For ( i over GridSize) := Initialize variables := Unrolled Neighbor Loop For ( j over GridSize) CilkFor(...) := Initialize variables := block scope {... } CilkFor(...)

11 EECS Electrical Engineering and Computer Sciences B ERKELEY P AR L AB Who writes what Application Specializer ASP core CodePy Kernel Python AST Target AST ASP Module Utilities Compiled module Kernel call Input data Kernel call Cache hit App author (PLL) Specializer author (ELL) SEJITS team 3 rd party library

12 EECS Electrical Engineering and Computer Sciences B ERKELEY P AR L AB Results summary — columns: CA matrix powers | Stencil | GMM. Performance: 3x–9x faster than optimized C++ | 53% of achievable peak mem BW | Variant selection outperforms hand-coded CUDA on large problems. Portability: OpenMP; CILK+ soon | OpenMP, CILK+ | CUDA, CILK+. Specializer LOC (logic + templates): (values not recovered). App LOC reduction: N/A | ~20x | ~15x. SEJITS benefit: Delivery vehicle for first autotuned CA-Akx | Inlining functions | Strategy selection hides heterogeneous HW. App integration: Linear solvers | CAT scan analysis | Speech recognition. 12


Download ppt "EECS Electrical Engineering and Computer Sciences B ERKELEY P AR L AB.py OS/HW f() h() Specializer.c PLL Interp Productivity app.so cc/ld $ $ SEJITS logical."

Similar presentations


Ads by Google