Presentation is loading. Please wait.

Presentation is loading. Please wait.

Lockstep execution.

Similar presentations


Presentation on theme: "Lockstep execution."— Presentation transcript:

1 Lockstep execution

2 anotherCheapFunction(); } else aMoreExpensiveFunction();
exec mask: …. 1 Thread 0 (fastpath is true) Thread 1 (fastPath is false) if ( fastPath ) { aCheapFunction(); anotherCheapFunction(); } else aMoreExpensiveFunction(); anotherExpensiveFunction(); if ( fastPath ) { aCheapFunction(); anotherCheapFunction(); } else aMoreExpensiveFunction(); anotherExpensiveFunction();

3 anotherCheapFunction(); } else aMoreExpensiveFunction();
exec mask: …. 1 Thread 0 (fastpath is true) Thread 1 (fastPath is false) if ( fastPath ) { aCheapFunction(); anotherCheapFunction(); } else aMoreExpensiveFunction(); anotherExpensiveFunction(); if ( fastPath ) { aCheapFunction(); anotherCheapFunction(); } else aMoreExpensiveFunction(); anotherExpensiveFunction();

4 anotherCheapFunction(); } else aMoreExpensiveFunction();
exec mask: …. 1 Thread 0 (fastpath is true) Thread 1 (fastPath is false) if ( fastPath ) { aCheapFunction(); anotherCheapFunction(); } else aMoreExpensiveFunction(); anotherExpensiveFunction(); if ( fastPath ) { aCheapFunction(); anotherCheapFunction(); } else aMoreExpensiveFunction(); anotherExpensiveFunction();

5 anotherCheapFunction(); } else aMoreExpensiveFunction();
exec mask: …. 1 Thread 0 (fastpath is true) Thread 1 (fastPath is false) if ( fastPath ) { aCheapFunction(); anotherCheapFunction(); } else aMoreExpensiveFunction(); anotherExpensiveFunction(); if ( fastPath ) { aCheapFunction(); anotherCheapFunction(); } else aMoreExpensiveFunction(); anotherExpensiveFunction();

6 anotherCheapFunction(); } else aMoreExpensiveFunction();
exec mask: …. 1 Thread 0 (fastpath is true) Thread 1 (fastPath is false) if ( fastPath ) { aCheapFunction(); anotherCheapFunction(); } else aMoreExpensiveFunction(); anotherExpensiveFunction(); if ( fastPath ) { aCheapFunction(); anotherCheapFunction(); } else aMoreExpensiveFunction(); anotherExpensiveFunction();

7 anotherCheapFunction(); } else aMoreExpensiveFunction();
exec mask: …. 1 Thread 0 (fastpath is true) Thread 1 (fastPath is false) if ( fastPath ) { aCheapFunction(); anotherCheapFunction(); } else aMoreExpensiveFunction(); anotherExpensiveFunction(); if ( fastPath ) { aCheapFunction(); anotherCheapFunction(); } else aMoreExpensiveFunction(); anotherExpensiveFunction(); …rest of the code… …rest of the code…

8 Scalarized execution

9 bool s_fastPath = WaveAllTrue(fastPath);
exec mask: …. 1 Thread 0 (fastpath is true) Thread 1 (fastPath is false) bool s_fastPath = WaveAllTrue(fastPath); if ( s_fastPath ) // false across the whole wave { aCheapFunction(); anotherCheapFunction(); } else aMoreExpensiveFunction(); anotherExpensiveFunction(); bool s_fastPath = WaveAllTrue(fastPath); if ( s_fastPath ) // false across the whole wave { aCheapFunction(); anotherCheapFunction(); } else aMoreExpensiveFunction(); anotherExpensiveFunction();

10 bool s_fastPath = WaveAllTrue(fastPath);
exec mask: …. 1 Thread 0 (fastpath is true) Thread 1 (fastPath is false) bool s_fastPath = WaveAllTrue(fastPath); if ( s_fastPath ) // false across the whole wave { aCheapFunction(); anotherCheapFunction(); } else aMoreExpensiveFunction(); anotherExpensiveFunction(); bool s_fastPath = WaveAllTrue(fastPath); if ( s_fastPath ) // false across the whole wave { aCheapFunction(); anotherCheapFunction(); } else aMoreExpensiveFunction(); anotherExpensiveFunction();

11 bool s_fastPath = WaveAllTrue(fastPath);
exec mask: …. 1 Thread 0 (fastpath is true) Thread 1 (fastPath is false) bool s_fastPath = WaveAllTrue(fastPath); if ( s_fastPath ) // false across the whole wave { aCheapFunction(); anotherCheapFunction(); } else aMoreExpensiveFunction(); anotherExpensiveFunction(); bool s_fastPath = WaveAllTrue(fastPath); if ( s_fastPath ) // false across the whole wave { aCheapFunction(); anotherCheapFunction(); } else aMoreExpensiveFunction(); anotherExpensiveFunction();

12 bool s_fastPath = WaveAllTrue(fastPath);
exec mask: …. 1 Thread 0 (fastpath is true) Thread 1 (fastPath is false) bool s_fastPath = WaveAllTrue(fastPath); if ( s_fastPath ) // false across the whole wave { aCheapFunction(); anotherCheapFunction(); } else aMoreExpensiveFunction(); anotherExpensiveFunction(); bool s_fastPath = WaveAllTrue(fastPath); if ( s_fastPath ) // false across the whole wave { aCheapFunction(); anotherCheapFunction(); } else aMoreExpensiveFunction(); anotherExpensiveFunction();

13 bool s_fastPath = WaveAllTrue(fastPath);
exec mask: …. 1 Thread 0 (fastpath is true) Thread 1 (fastPath is false) bool s_fastPath = WaveAllTrue(fastPath); if ( s_fastPath ) // false across the whole wave { aCheapFunction(); anotherCheapFunction(); } else aMoreExpensiveFunction(); anotherExpensiveFunction(); bool s_fastPath = WaveAllTrue(fastPath); if ( s_fastPath ) // false across the whole wave { aCheapFunction(); anotherCheapFunction(); } else aMoreExpensiveFunction(); anotherExpensiveFunction(); …rest of the code… …rest of the code…

14 Wave vis

15 Wave Wave thread thread thread thread thread thread thread thread

16 Some intrinsics

17 Intrinsic Description uint WaveGetLaneIndex() Returns the index of the lane within the current wave (in a VGPR of course) uint4 WaveActiveBallot(bool) Returns a 64-bit mask containing the result of the passed predicate for all the active lanes. This mask will be in a SGPR. bool WaveActiveAnyTrue(bool) Probably using ballot, it returns whether the predicate passed is true for any active lane. Result is in SGPR. bool WaveActiveAllTrue(bool) Probably using ballot, it returns whether the predicate passed is true for all active lanes. Result is in SGPR. <type> WaveReadLaneFirst(<type>) Returns the value of the passed expression for the first active lane in the wave. Result is in SGPR. <type> WaveActiveMin(<type>) Returns the minimum value of the passed expression across all active lanes in the wave. Result is in SGPR.

18 Latency hiding example

19 float4 b = b0 + b1; float4 c = a + b; float4 d = c * e; float4 f = load_data(); float4 g = f + h; … rest of code … float4 b = b0 + b1; float4 c = a + b; float4 d = c * e; float4 f = load_data(); float4 g = f + h; … rest of code …

20 float4 b = b0 + b1; float4 c = a + b; float4 d = c * e; float4 f = load_data(); float4 g = f + h; … rest of code … float4 b = b0 + b1; float4 c = a + b; float4 d = c * e; float4 f = load_data(); float4 g = f + h; … rest of code …

21 float4 b = b0 + b1; float4 c = a + b; float4 d = c * e; float4 f = load_data(); float4 g = f + h; … rest of code … float4 b = b0 + b1; float4 c = a + b; float4 d = c * e; float4 f = load_data(); float4 g = f + h; … rest of code …

22 float4 b = b0 + b1; float4 c = a + b; float4 d = c * e; float4 f = load_data(); float4 g = f + h; … rest of code … float4 b = b0 + b1; float4 c = a + b; float4 d = c * e; float4 f = load_data(); float4 g = f + h; … rest of code …

23 float4 b = b0 + b1; float4 c = a + b; float4 d = c * e; float4 f = load_data(); float4 g = f + h; … rest of code … float4 b = b0 + b1; float4 c = a + b; float4 d = c * e; float4 f = load_data(); float4 g = f + h; … rest of code …

24 float4 b = b0 + b1; float4 c = a + b; float4 d = c * e; float4 f = load_data(); float4 g = f + h; … rest of code … float4 b = b0 + b1; float4 c = a + b; float4 d = c * e; float4 f = load_data(); float4 g = f + h; … rest of code …

25 float4 b = b0 + b1; float4 c = a + b; float4 d = c * e; float4 f = load_data(); float4 g = f + h; … rest of code … float4 b = b0 + b1; float4 c = a + b; float4 d = c * e; float4 f = load_data(); float4 g = f + h; … rest of code …

26 float4 b = b0 + b1; float4 c = a + b; float4 d = c * e; float4 f = load_data(); ….wait for result… float4 g = f + h; … rest of code … float4 b = b0 + b1; float4 c = a + b; float4 d = c * e; float4 f = load_data(); float4 g = f + h; … rest of code …

27 float4 b = b0 + b1; float4 c = a + b; float4 d = c * e; float4 f = load_data(); ….wait for result… float4 g = f + h; … rest of code … float4 b = b0 + b1; float4 c = a + b; float4 d = c * e; float4 f = load_data(); float4 g = f + h; … rest of code …

28 float4 b = b0 + b1; float4 c = a + b; float4 d = c * e;
float4 f = load_data(); ….wait for result… float4 g = f + h; … rest of code … float4 b = b0 + b1; float4 c = a + b; float4 d = c * e; float4 f = load_data(); float4 g = f + h; … rest of code … ready

29 Single wave overlap multiple tiles

30 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15

31 Non-scalarized light loop

32 2 3 4 1 2 3 6 3 5 v_lightIdx = 0 v_lightIdx = 1 v_lightIdx = 0
2 3 4 1 2 3 6 3 5 StructuredBuffer<LightData> Lights; ByteAddressBuffer Indices; uint v_cellIdx = GetCellIdx(); {v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); for( i = 0; i < v_lightCount; ++i) { uint v_lightIdx = GetLightIdx(v_lightStart, i); LightData v_light = Lights[v_lightIdx]; ProcessLight(v_light); } StructuredBuffer<LightData> Lights; ByteAddressBuffer Indices; uint v_cellIdx = GetCellIdx(); {v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); for( i = 0; i < v_lightCount; ++i) { uint v_lightIdx = GetLightIdx(v_lightStart, i); LightData v_light = Lights[v_lightIdx]; ProcessLight(v_light); } StructuredBuffer<LightData> Lights; ByteAddressBuffer Indices; uint v_cellIdx = GetCellIdx(); {v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); for( i = 0; i < v_lightCount; ++i) { uint v_lightIdx = GetLightIdx(v_lightStart, i); LightData v_light = Lights[v_lightIdx]; ProcessLight(v_light); } v_lightIdx = 0 v_lightIdx = 1 v_lightIdx = 0

33 2 3 4 1 2 3 6 3 5 v_lightIdx = 0 v_lightIdx = 1 v_lightIdx = 0
2 3 4 1 2 3 6 3 5 StructuredBuffer<LightData> Lights; ByteAddressBuffer Indices; uint v_cellIdx = GetCellIdx(); {v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); for( i = 0; i < v_lightCount; ++i) { uint v_lightIdx = GetLightIdx(v_lightStart, i); LightData v_light = Lights[v_lightIdx]; ProcessLight(v_light); } StructuredBuffer<LightData> Lights; ByteAddressBuffer Indices; uint v_cellIdx = GetCellIdx(); {v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); for( i = 0; i < v_lightCount; ++i) { uint v_lightIdx = GetLightIdx(v_lightStart, i); LightData v_light = Lights[v_lightIdx]; ProcessLight(v_light); } StructuredBuffer<LightData> Lights; ByteAddressBuffer Indices; uint v_cellIdx = GetCellIdx(); {v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); for( i = 0; i < v_lightCount; ++i) { uint v_lightIdx = GetLightIdx(v_lightStart, i); LightData v_light = Lights[v_lightIdx]; ProcessLight(v_light); } v_lightIdx = 0 v_lightIdx = 1 v_lightIdx = 0

34 2 3 4 1 2 3 6 3 5 v_lightIdx = 0 v_lightIdx = 1 v_lightIdx = 0
2 3 4 1 2 3 6 3 5 StructuredBuffer<LightData> Lights; ByteAddressBuffer Indices; uint v_cellIdx = GetCellIdx(); {v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); for( i = 0; i < v_lightCount; ++i) { uint v_lightIdx = GetLightIdx(v_lightStart, i); LightData v_light = Lights[v_lightIdx]; ProcessLight(v_light); } StructuredBuffer<LightData> Lights; ByteAddressBuffer Indices; uint v_cellIdx = GetCellIdx(); {v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); for( i = 0; i < v_lightCount; ++i) { uint v_lightIdx = GetLightIdx(v_lightStart, i); LightData v_light = Lights[v_lightIdx]; ProcessLight(v_light); } StructuredBuffer<LightData> Lights; ByteAddressBuffer Indices; uint v_cellIdx = GetCellIdx(); {v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); for( i = 0; i < v_lightCount; ++i) { uint v_lightIdx = GetLightIdx(v_lightStart, i); LightData v_light = Lights[v_lightIdx]; ProcessLight(v_light); } v_lightIdx = 0 v_lightIdx = 1 v_lightIdx = 0

35 2 3 4 1 2 3 6 3 5 v_lightIdx = 2 v_lightIdx = 2 v_lightIdx = 3
2 3 4 1 2 3 6 3 5 StructuredBuffer<LightData> Lights; ByteAddressBuffer Indices; uint v_cellIdx = GetCellIdx(); {v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); for( i = 0; i < v_lightCount; ++i) { uint v_lightIdx = GetLightIdx(v_lightStart, i); LightData v_light = Lights[v_lightIdx]; ProcessLight(v_light); } StructuredBuffer<LightData> Lights; ByteAddressBuffer Indices; uint v_cellIdx = GetCellIdx(); {v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); for( i = 0; i < v_lightCount; ++i) { uint v_lightIdx = GetLightIdx(v_lightStart, i); LightData v_light = Lights[v_lightIdx]; ProcessLight(v_light); } StructuredBuffer<LightData> Lights; ByteAddressBuffer Indices; uint v_cellIdx = GetCellIdx(); {v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); for( i = 0; i < v_lightCount; ++i) { uint v_lightIdx = GetLightIdx(v_lightStart, i); LightData v_light = Lights[v_lightIdx]; ProcessLight(v_light); } v_lightIdx = 2 v_lightIdx = 2 v_lightIdx = 3

36 2 3 4 1 2 3 6 3 5 v_lightIdx = 2 v_lightIdx = 2 v_lightIdx = 3
2 3 4 1 2 3 6 3 5 StructuredBuffer<LightData> Lights; ByteAddressBuffer Indices; uint v_cellIdx = GetCellIdx(); {v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); for( i = 0; i < v_lightCount; ++i) { uint v_lightIdx = GetLightIdx(v_lightStart, i); LightData v_light = Lights[v_lightIdx]; ProcessLight(v_light); } StructuredBuffer<LightData> Lights; ByteAddressBuffer Indices; uint v_cellIdx = GetCellIdx(); {v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); for( i = 0; i < v_lightCount; ++i) { uint v_lightIdx = GetLightIdx(v_lightStart, i); LightData v_light = Lights[v_lightIdx]; ProcessLight(v_light); } StructuredBuffer<LightData> Lights; ByteAddressBuffer Indices; uint v_cellIdx = GetCellIdx(); {v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); for( i = 0; i < v_lightCount; ++i) { uint v_lightIdx = GetLightIdx(v_lightStart, i); LightData v_light = Lights[v_lightIdx]; ProcessLight(v_light); } v_lightIdx = 2 v_lightIdx = 2 v_lightIdx = 3

37 2 3 4 1 2 3 6 3 5 v_lightIdx = 2 v_lightIdx = 2 v_lightIdx = 3
2 3 4 1 2 3 6 3 5 StructuredBuffer<LightData> Lights; ByteAddressBuffer Indices; uint v_cellIdx = GetCellIdx(); {v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); for( i = 0; i < v_lightCount; ++i) { uint v_lightIdx = GetLightIdx(v_lightStart, i); LightData v_light = Lights[v_lightIdx]; ProcessLight(v_light); } StructuredBuffer<LightData> Lights; ByteAddressBuffer Indices; uint v_cellIdx = GetCellIdx(); {v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); for( i = 0; i < v_lightCount; ++i) { uint v_lightIdx = GetLightIdx(v_lightStart, i); LightData v_light = Lights[v_lightIdx]; ProcessLight(v_light); } StructuredBuffer<LightData> Lights; ByteAddressBuffer Indices; uint v_cellIdx = GetCellIdx(); {v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); for( i = 0; i < v_lightCount; ++i) { uint v_lightIdx = GetLightIdx(v_lightStart, i); LightData v_light = Lights[v_lightIdx]; ProcessLight(v_light); } v_lightIdx = 2 v_lightIdx = 2 v_lightIdx = 3

38 2 3 4 1 2 3 6 3 5 v_lightIdx = 3 v_lightIdx = 3 v_lightIdx = 5
2 3 4 1 2 3 6 3 5 StructuredBuffer<LightData> Lights; ByteAddressBuffer Indices; uint v_cellIdx = GetCellIdx(); {v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); for( i = 0; i < v_lightCount; ++i) { uint v_lightIdx = GetLightIdx(v_lightStart, i); LightData v_light = Lights[v_lightIdx]; ProcessLight(v_light); } StructuredBuffer<LightData> Lights; ByteAddressBuffer Indices; uint v_cellIdx = GetCellIdx(); {v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); for( i = 0; i < v_lightCount; ++i) { uint v_lightIdx = GetLightIdx(v_lightStart, i); LightData v_light = Lights[v_lightIdx]; ProcessLight(v_light); } StructuredBuffer<LightData> Lights; ByteAddressBuffer Indices; uint v_cellIdx = GetCellIdx(); {v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); for( i = 0; i < v_lightCount; ++i) { uint v_lightIdx = GetLightIdx(v_lightStart, i); LightData v_light = Lights[v_lightIdx]; ProcessLight(v_light); } v_lightIdx = 3 v_lightIdx = 3 v_lightIdx = 5

39 2 3 4 1 2 3 6 3 5 v_lightIdx = 3 v_lightIdx = 3 v_lightIdx = 5
2 3 4 1 2 3 6 3 5 StructuredBuffer<LightData> Lights; ByteAddressBuffer Indices; uint v_cellIdx = GetCellIdx(); {v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); for( i = 0; i < v_lightCount; ++i) { uint v_lightIdx = GetLightIdx(v_lightStart, i); LightData v_light = Lights[v_lightIdx]; ProcessLight(v_light); } StructuredBuffer<LightData> Lights; ByteAddressBuffer Indices; uint v_cellIdx = GetCellIdx(); {v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); for( i = 0; i < v_lightCount; ++i) { uint v_lightIdx = GetLightIdx(v_lightStart, i); LightData v_light = Lights[v_lightIdx]; ProcessLight(v_light); } StructuredBuffer<LightData> Lights; ByteAddressBuffer Indices; uint v_cellIdx = GetCellIdx(); {v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); for( i = 0; i < v_lightCount; ++i) { uint v_lightIdx = GetLightIdx(v_lightStart, i); LightData v_light = Lights[v_lightIdx]; ProcessLight(v_light); } v_lightIdx = 3 v_lightIdx = 3 v_lightIdx = 5

40 2 3 4 1 2 3 6 3 5 v_lightIdx = 3 v_lightIdx = 3 v_lightIdx = 5
2 3 4 1 2 3 6 3 5 StructuredBuffer<LightData> Lights; ByteAddressBuffer Indices; uint v_cellIdx = GetCellIdx(); {v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); for( i = 0; i < v_lightCount; ++i) { uint v_lightIdx = GetLightIdx(v_lightStart, i); LightData v_light = Lights[v_lightIdx]; ProcessLight(v_light); } StructuredBuffer<LightData> Lights; ByteAddressBuffer Indices; uint v_cellIdx = GetCellIdx(); {v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); for( i = 0; i < v_lightCount; ++i) { uint v_lightIdx = GetLightIdx(v_lightStart, i); LightData v_light = Lights[v_lightIdx]; ProcessLight(v_light); } StructuredBuffer<LightData> Lights; ByteAddressBuffer Indices; uint v_cellIdx = GetCellIdx(); {v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); for( i = 0; i < v_lightCount; ++i) { uint v_lightIdx = GetLightIdx(v_lightStart, i); LightData v_light = Lights[v_lightIdx]; ProcessLight(v_light); } v_lightIdx = 3 v_lightIdx = 3 v_lightIdx = 5

41 2 3 4 1 2 3 6 3 5 v_lightIdx = 4 v_lightIdx = 6 v_lightIdx = 5
2 3 4 1 2 3 6 3 5 StructuredBuffer<LightData> Lights; ByteAddressBuffer Indices; uint v_cellIdx = GetCellIdx(); {v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); for( i = 0; i < v_lightCount; ++i) { uint v_lightIdx = GetLightIdx(v_lightStart, i); LightData v_light = Lights[v_lightIdx]; ProcessLight(v_light); } StructuredBuffer<LightData> Lights; ByteAddressBuffer Indices; uint v_cellIdx = GetCellIdx(); {v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); for( i = 0; i < v_lightCount; ++i) { uint v_lightIdx = GetLightIdx(v_lightStart, i); LightData v_light = Lights[v_lightIdx]; ProcessLight(v_light); } StructuredBuffer<LightData> Lights; ByteAddressBuffer Indices; uint v_cellIdx = GetCellIdx(); {v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); for( i = 0; i < v_lightCount; ++i) { uint v_lightIdx = GetLightIdx(v_lightStart, i); LightData v_light = Lights[v_lightIdx]; ProcessLight(v_light); } v_lightIdx = 4 v_lightIdx = 6 v_lightIdx = 5

42 2 3 4 1 2 3 6 3 5 v_lightIdx = 4 v_lightIdx = 6 v_lightIdx = 5
2 3 4 1 2 3 6 3 5 StructuredBuffer<LightData> Lights; ByteAddressBuffer Indices; uint v_cellIdx = GetCellIdx(); {v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); for( i = 0; i < v_lightCount; ++i) { uint v_lightIdx = GetLightIdx(v_lightStart, i); LightData v_light = Lights[v_lightIdx]; ProcessLight(v_light); } StructuredBuffer<LightData> Lights; ByteAddressBuffer Indices; uint v_cellIdx = GetCellIdx(); {v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); for( i = 0; i < v_lightCount; ++i) { uint v_lightIdx = GetLightIdx(v_lightStart, i); LightData v_light = Lights[v_lightIdx]; ProcessLight(v_light); } StructuredBuffer<LightData> Lights; ByteAddressBuffer Indices; uint v_cellIdx = GetCellIdx(); {v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); for( i = 0; i < v_lightCount; ++i) { uint v_lightIdx = GetLightIdx(v_lightStart, i); LightData v_light = Lights[v_lightIdx]; ProcessLight(v_light); } v_lightIdx = 4 v_lightIdx = 6 v_lightIdx = 5

43 2 3 4 1 2 3 6 3 5 v_lightIdx = 4 v_lightIdx = 6 v_lightIdx = 5
2 3 4 1 2 3 6 3 5 StructuredBuffer<LightData> Lights; ByteAddressBuffer Indices; uint v_cellIdx = GetCellIdx(); {v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); for( i = 0; i < v_lightCount; ++i) { uint v_lightIdx = GetLightIdx(v_lightStart, i); LightData v_light = Lights[v_lightIdx]; ProcessLight(v_light); } StructuredBuffer<LightData> Lights; ByteAddressBuffer Indices; uint v_cellIdx = GetCellIdx(); {v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); for( i = 0; i < v_lightCount; ++i) { uint v_lightIdx = GetLightIdx(v_lightStart, i); LightData v_light = Lights[v_lightIdx]; ProcessLight(v_light); } StructuredBuffer<LightData> Lights; ByteAddressBuffer Indices; uint v_cellIdx = GetCellIdx(); {v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); for( i = 0; i < v_lightCount; ++i) { uint v_lightIdx = GetLightIdx(v_lightStart, i); LightData v_light = Lights[v_lightIdx]; ProcessLight(v_light); } v_lightIdx = 4 v_lightIdx = 6 v_lightIdx = 5

44 Scalarization approach #1 – Step by Step

45 lane ID v_cellIdx A C B 1 v_cellIdx lane mask exec mask s_cellIdx
uint v_cellIdx = GetCellIdx(); uint v_laneID = WaveGetLaneIndex(); ulong execMask = 0xffffffff; ulong curLaneMask = ulong(1) << ulong(v_laneID); while ( ( execMask & curLaneMask ) != 0 ) { uint s_cellIdx = WaveReadFirstLane(v_cellIdx); ulong laneMask = WaveBallot( v_cellIdx == s_cellIdx ); execMask = execMask & ~laneMask; if (v_cellIdx == s_cellIdx ) ProcessLightsInCell(s_cellIdx); } void ProcessLightsInCell(uint s_cellIdx) {s_lightStart, s_lightCount} = GetCellIndices(s_cellIdx); for( i = 0; i < s_lightCount; ++i) uint s_lightIdx = GetLightIdx(s_lightStart, i); LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); v_cellIdx A C B lane mask exec mask 1 s_cellIdx

46 lane ID v_cellIdx A C B 1 A v_cellIdx lane mask exec mask s_cellIdx
uint v_cellIdx = GetCellIdx(); uint v_laneID = WaveGetLaneIndex(); ulong execMask = 0xffffffff; ulong curLaneMask = ulong(1) << ulong(v_laneID); while ( ( execMask & curLaneMask ) != 0 ) { uint s_cellIdx = WaveReadFirstLane(v_cellIdx); ulong laneMask = WaveBallot( v_cellIdx == s_cellIdx ); execMask = execMask & ~laneMask; if (v_cellIdx == s_cellIdx ) ProcessLightsInCell(s_cellIdx); } void ProcessLightsInCell(uint s_cellIdx) {s_lightStart, s_lightCount} = GetCellIndices(s_cellIdx); for( i = 0; i < s_lightCount; ++i) uint s_lightIdx = GetLightIdx(s_lightStart, i); LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); v_cellIdx A C B lane mask exec mask 1 s_cellIdx A

47 lane ID v_cellIdx A C B 1 1 A v_cellIdx lane mask exec mask s_cellIdx
uint v_cellIdx = GetCellIdx(); uint v_laneID = WaveGetLaneIndex(); ulong execMask = 0xffffffff; ulong curLaneMask = ulong(1) << ulong(v_laneID); while ( ( execMask & curLaneMask ) != 0 ) { uint s_cellIdx = WaveReadFirstLane(v_cellIdx); ulong laneMask = WaveBallot( v_cellIdx == s_cellIdx ); execMask = execMask & ~laneMask; if (v_cellIdx == s_cellIdx ) ProcessLightsInCell(s_cellIdx); } void ProcessLightsInCell(uint s_cellIdx) {s_lightStart, s_lightCount} = GetCellIndices(s_cellIdx); for( i = 0; i < s_lightCount; ++i) uint s_lightIdx = GetLightIdx(s_lightStart, i); LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); v_cellIdx A C B lane mask 1 exec mask 1 s_cellIdx A

48 lane ID v_cellIdx A C B 1 1 A v_cellIdx lane mask exec mask s_cellIdx
uint v_cellIdx = GetCellIdx(); uint v_laneID = WaveGetLaneIndex(); ulong execMask = 0xffffffff; ulong curLaneMask = ulong(1) << ulong(v_laneID); while ( ( execMask & curLaneMask ) != 0 ) { uint s_cellIdx = WaveReadFirstLane(v_cellIdx); ulong laneMask = WaveBallot( v_cellIdx == s_cellIdx ); execMask = execMask & ~laneMask; if (v_cellIdx == s_cellIdx ) ProcessLightsInCell(s_cellIdx); } void ProcessLightsInCell(uint s_cellIdx) {s_lightStart, s_lightCount} = GetCellIndices(s_cellIdx); for( i = 0; i < s_lightCount; ++i) uint s_lightIdx = GetLightIdx(s_lightStart, i); LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); v_cellIdx A C B lane mask 1 exec mask 1 s_cellIdx A

49 lane ID v_cellIdx A C B 1 1 A v_cellIdx lane mask exec mask s_cellIdx
uint v_cellIdx = GetCellIdx(); uint v_laneID = WaveGetLaneIndex(); ulong execMask = 0xffffffff; ulong curLaneMask = ulong(1) << ulong(v_laneID); while ( ( execMask & curLaneMask ) != 0 ) { uint s_cellIdx = WaveReadFirstLane(v_cellIdx); ulong laneMask = WaveBallot( v_cellIdx == s_cellIdx ); execMask = execMask & ~laneMask; if (v_cellIdx == s_cellIdx ) ProcessLightsInCell(s_cellIdx); } void ProcessLightsInCell(uint s_cellIdx) {s_lightStart, s_lightCount} = GetCellIndices(s_cellIdx); for( i = 0; i < s_lightCount; ++i) uint s_lightIdx = GetLightIdx(s_lightStart, i); LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); v_cellIdx A C B lane mask 1 exec mask 1 s_cellIdx A

50 lane ID v_cellIdx A C B 1 1 A v_cellIdx lane mask exec mask s_cellIdx
uint v_cellIdx = GetCellIdx(); uint v_laneID = WaveGetLaneIndex(); ulong execMask = 0xffffffff; ulong curLaneMask = ulong(1) << ulong(v_laneID); while ( ( execMask & curLaneMask ) != 0 ) { uint s_cellIdx = WaveReadFirstLane(v_cellIdx); ulong laneMask = WaveBallot( v_cellIdx == s_cellIdx ); execMask = execMask & ~laneMask; if (v_cellIdx == s_cellIdx ) ProcessLightsInCell(s_cellIdx); } void ProcessLightsInCell(uint s_cellIdx) {s_lightStart, s_lightCount} = GetCellIndices(s_cellIdx); for( i = 0; i < s_lightCount; ++i) uint s_lightIdx = GetLightIdx(s_lightStart, i); LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); v_cellIdx A C B lane mask 1 exec mask 1 s_cellIdx A

51 lane ID v_cellIdx A C B 1 C v_cellIdx lane mask exec mask s_cellIdx
uint v_cellIdx = GetCellIdx(); uint v_laneID = WaveGetLaneIndex(); ulong execMask = 0xffffffff; ulong curLaneMask = ulong(1) << ulong(v_laneID); while ( ( execMask & curLaneMask ) != 0 ) { uint s_cellIdx = WaveReadFirstLane(v_cellIdx); ulong laneMask = WaveBallot( v_cellIdx == s_cellIdx ); execMask = execMask & ~laneMask; if (v_cellIdx == s_cellIdx ) ProcessLightsInCell(s_cellIdx); } void ProcessLightsInCell(uint s_cellIdx) {s_lightStart, s_lightCount} = GetCellIndices(s_cellIdx); for( i = 0; i < s_lightCount; ++i) uint s_lightIdx = GetLightIdx(s_lightStart, i); LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); v_cellIdx A C B lane mask exec mask 1 s_cellIdx C

52 lane ID v_cellIdx A C B 1 1 C v_cellIdx lane mask exec mask s_cellIdx
uint v_cellIdx = GetCellIdx(); uint v_laneID = WaveGetLaneIndex(); ulong execMask = 0xffffffff; ulong curLaneMask = ulong(1) << ulong(v_laneID); while ( ( execMask & curLaneMask ) != 0 ) { uint s_cellIdx = WaveReadFirstLane(v_cellIdx); ulong laneMask = WaveBallot( v_cellIdx == s_cellIdx ); execMask = execMask & ~laneMask; if (v_cellIdx == s_cellIdx ) ProcessLightsInCell(s_cellIdx); } void ProcessLightsInCell(uint s_cellIdx) {s_lightStart, s_lightCount} = GetCellIndices(s_cellIdx); for( i = 0; i < s_lightCount; ++i) uint s_lightIdx = GetLightIdx(s_lightStart, i); LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); v_cellIdx A C B lane mask 1 exec mask 1 s_cellIdx C

53 lane ID v_cellIdx A C B 1 1 C v_cellIdx lane mask exec mask s_cellIdx
uint v_cellIdx = GetCellIdx(); uint v_laneID = WaveGetLaneIndex(); ulong execMask = 0xffffffff; ulong curLaneMask = ulong(1) << ulong(v_laneID); while ( ( execMask & curLaneMask ) != 0 ) { uint s_cellIdx = WaveReadFirstLane(v_cellIdx); ulong laneMask = WaveBallot( v_cellIdx == s_cellIdx ); execMask = execMask & ~laneMask; if (v_cellIdx == s_cellIdx ) ProcessLightsInCell(s_cellIdx); } void ProcessLightsInCell(uint s_cellIdx) {s_lightStart, s_lightCount} = GetCellIndices(s_cellIdx); for( i = 0; i < s_lightCount; ++i) uint s_lightIdx = GetLightIdx(s_lightStart, i); LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); v_cellIdx A C B lane mask 1 exec mask 1 s_cellIdx C

54 lane ID v_cellIdx A C B 1 1 C v_cellIdx lane mask exec mask s_cellIdx
uint v_cellIdx = GetCellIdx(); uint v_laneID = WaveGetLaneIndex(); ulong execMask = 0xffffffff; ulong curLaneMask = ulong(1) << ulong(v_laneID); while ( ( execMask & curLaneMask ) != 0 ) { uint s_cellIdx = WaveReadFirstLane(v_cellIdx); ulong laneMask = WaveBallot( v_cellIdx == s_cellIdx ); execMask = execMask & ~laneMask; if (v_cellIdx == s_cellIdx ) ProcessLightsInCell(s_cellIdx); } void ProcessLightsInCell(uint s_cellIdx) {s_lightStart, s_lightCount} = GetCellIndices(s_cellIdx); for( i = 0; i < s_lightCount; ++i) uint s_lightIdx = GetLightIdx(s_lightStart, i); LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); v_cellIdx A C B lane mask 1 exec mask 1 s_cellIdx C

55 lane ID v_cellIdx A C B 1 1 C v_cellIdx lane mask exec mask s_cellIdx
uint v_cellIdx = GetCellIdx(); uint v_laneID = WaveGetLaneIndex(); ulong execMask = 0xffffffff; ulong curLaneMask = ulong(1) << ulong(v_laneID); while ( ( execMask & curLaneMask ) != 0 ) { uint s_cellIdx = WaveReadFirstLane(v_cellIdx); ulong laneMask = WaveBallot( v_cellIdx == s_cellIdx ); execMask = execMask & ~laneMask; if (v_cellIdx == s_cellIdx ) ProcessLightsInCell(s_cellIdx); } void ProcessLightsInCell(uint s_cellIdx) {s_lightStart, s_lightCount} = GetCellIndices(s_cellIdx); for( i = 0; i < s_lightCount; ++i) uint s_lightIdx = GetLightIdx(s_lightStart, i); LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); v_cellIdx A C B lane mask 1 exec mask 1 s_cellIdx C

56 A C B 1 B v_cellIdx lane mask exec mask s_cellIdx
uint v_cellIdx = GetCellIdx(); uint v_laneID = WaveGetLaneIndex(); ulong execMask = 0xffffffff; ulong curLaneMask = ulong(1) << ulong(v_laneID); while ( ( execMask & curLaneMask ) != 0 ) { uint s_cellIdx = WaveReadFirstLane(v_cellIdx); ulong laneMask = WaveBallot( v_cellIdx == s_cellIdx ); execMask = execMask & ~laneMask; if (v_cellIdx == s_cellIdx ) ProcessLightsInCell(s_cellIdx); } void ProcessLightsInCell(uint s_cellIdx) {s_lightStart, s_lightCount} = GetCellIndices(s_cellIdx); for( i = 0; i < s_lightCount; ++i) uint s_lightIdx = GetLightIdx(s_lightStart, i); LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); v_cellIdx A C B lane mask exec mask 1 s_cellIdx B

57 A C B 1 1 B v_cellIdx lane mask exec mask s_cellIdx
uint v_cellIdx = GetCellIdx(); uint v_laneID = WaveGetLaneIndex(); ulong execMask = 0xffffffff; ulong curLaneMask = ulong(1) << ulong(v_laneID); while ( ( execMask & curLaneMask ) != 0 ) { uint s_cellIdx = WaveReadFirstLane(v_cellIdx); ulong laneMask = WaveBallot( v_cellIdx == s_cellIdx ); execMask = execMask & ~laneMask; if (v_cellIdx == s_cellIdx ) ProcessLightsInCell(s_cellIdx); } void ProcessLightsInCell(uint s_cellIdx) {s_lightStart, s_lightCount} = GetCellIndices(s_cellIdx); for( i = 0; i < s_lightCount; ++i) uint s_lightIdx = GetLightIdx(s_lightStart, i); LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); v_cellIdx A C B lane mask 1 exec mask 1 s_cellIdx B

58 A C B 1 B v_cellIdx lane mask exec mask s_cellIdx
uint v_cellIdx = GetCellIdx(); uint v_laneID = WaveGetLaneIndex(); ulong execMask = 0xffffffff; ulong curLaneMask = ulong(1) << ulong(v_laneID); while ( ( execMask & curLaneMask ) != 0 ) { uint s_cellIdx = WaveReadFirstLane(v_cellIdx); ulong laneMask = WaveBallot( v_cellIdx == s_cellIdx ); execMask = execMask & ~laneMask; if (v_cellIdx == s_cellIdx ) ProcessLightsInCell(s_cellIdx); } void ProcessLightsInCell(uint s_cellIdx) {s_lightStart, s_lightCount} = GetCellIndices(s_cellIdx); for( i = 0; i < s_lightCount; ++i) uint s_lightIdx = GetLightIdx(s_lightStart, i); LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); v_cellIdx A C B lane mask 1 exec mask s_cellIdx B

59 A C B 1 B v_cellIdx lane mask exec mask s_cellIdx
uint v_cellIdx = GetCellIdx(); uint v_laneID = WaveGetLaneIndex(); ulong execMask = 0xffffffff; ulong curLaneMask = ulong(1) << ulong(v_laneID); while ( ( execMask & curLaneMask ) != 0 ) { uint s_cellIdx = WaveReadFirstLane(v_cellIdx); ulong laneMask = WaveBallot( v_cellIdx == s_cellIdx ); execMask = execMask & ~laneMask; if (v_cellIdx == s_cellIdx ) ProcessLightsInCell(s_cellIdx); } void ProcessLightsInCell(uint s_cellIdx) {s_lightStart, s_lightCount} = GetCellIndices(s_cellIdx); for( i = 0; i < s_lightCount; ++i) uint s_lightIdx = GetLightIdx(s_lightStart, i); LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); v_cellIdx A C B lane mask 1 exec mask s_cellIdx B

60 A C B 1 B v_cellIdx lane mask exec mask s_cellIdx
uint v_cellIdx = GetCellIdx(); uint v_laneID = WaveGetLaneIndex(); ulong execMask = 0xffffffff; ulong curLaneMask = ulong(1) << ulong(v_laneID); while ( ( execMask & curLaneMask ) != 0 ) { uint s_cellIdx = WaveReadFirstLane(v_cellIdx); ulong laneMask = WaveBallot( v_cellIdx == s_cellIdx ); execMask = execMask & ~laneMask; if (v_cellIdx == s_cellIdx ) ProcessLightsInCell(s_cellIdx); } void ProcessLightsInCell(uint s_cellIdx) {s_lightStart, s_lightCount} = GetCellIndices(s_cellIdx); for( i = 0; i < s_lightCount; ++i) uint s_lightIdx = GetLightIdx(s_lightStart, i); LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); v_cellIdx A C B lane mask 1 exec mask s_cellIdx B

61 A C B B v_cellIdx lane mask exec mask s_cellIdx
uint v_cellIdx = GetCellIdx(); uint v_laneID = WaveGetLaneIndex(); ulong execMask = 0xffffffff; ulong curLaneMask = ulong(1) << ulong(v_laneID); while ( ( execMask & curLaneMask ) != 0 ) { uint s_cellIdx = WaveReadFirstLane(v_cellIdx); ulong laneMask = WaveBallot( v_cellIdx == s_cellIdx ); execMask = execMask & ~laneMask; if (v_cellIdx == s_cellIdx ) ProcessLightsInCell(s_cellIdx); } … rest of code … void ProcessLightsInCell(uint s_cellIdx) {s_lightStart, s_lightCount} = GetCellIndices(s_cellIdx); for( i = 0; i < s_lightCount; ++i) uint s_lightIdx = GetLightIdx(s_lightStart, i); LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); v_cellIdx A C B lane mask exec mask s_cellIdx B

62 uint v_cellIdx = GetCellIdx();
uint v_laneID = WaveGetLaneIndex(); ulong execMask = 0xffffffff; ulong curLaneMask = ulong(1) << ulong(v_laneID); while ( ( execMask & curLaneMask ) != 0 ) { uint s_cellIdx = WaveReadFirstLane(v_cellIdx); ulong laneMask = WaveBallot( v_cellIdx == s_cellIdx ); execMask = execMask & ~laneMask; if (v_cellIdx == s_cellIdx ) ProcessLightsInCell(s_cellIdx); } … rest of code … void ProcessLightsInCell(uint s_cellIdx) {s_lightStart, s_lightCount} = GetCellIndices(s_cellIdx); for( i = 0; i < s_lightCount; ++i) uint s_lightIdx = GetLightIdx(s_lightStart, i); LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light);

63 Multiple evaluations with approach #1

64 Cell Index Cell Content A 42 44 45 46 B 41 C 40 Light index Number of code evals 40 41 42 44 45 46

65 Cell Index Cell Content A 42 44 45 46 B 41 C 40 Light index Number of code evals 40 41 42 1 44 45 46 Evaluating cell A

66 Cell Index Cell Content A 42 44 45 46 B 41 C 40 Light index Number of code evals 40 41 1 42 2 44 45 46 Evaluating cell B

67 Cell Index Cell Content A 42 44 45 46 B 41 C 40 Light index Number of code evals 40 1 41 2 42 3 44 45 46 Evaluating cell C

68 Scalarization approach #2 – Step by Step

69 uint v_cellIdx = GetCellIdx();
{v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); uint v_lightOffset = 0; while(v_lightOffset < v_lightCount ) { uint v_lightIdx = GetLightIdx(v_lightStart, v_lightOffset); uint s_lightIdx = WaveActiveMin(v_lightIdx); if(s_lightIdx == v_lightIdx) v_lightOffset++; LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); } s_lightIdx v_lightIdx v_cellIdx A C B lightOffset Cells content: A 42 44 45 46 B 41 C 40

70 A C B A 42 44 45 46 B 41 C 40 uint v_cellIdx = GetCellIdx();
{v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); uint v_lightOffset = 0; while(v_lightOffset < v_lightCount ) { uint v_lightIdx = GetLightIdx(v_lightStart, v_lightOffset); uint s_lightIdx = WaveActiveMin(v_lightIdx); if(s_lightIdx == v_lightIdx) v_lightOffset++; LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); } s_lightIdx v_lightIdx v_cellIdx A C B lightOffset A 42 44 45 46 B 41 C 40

71 A C B A 42 44 45 46 B 41 C 40 uint v_cellIdx = GetCellIdx();
{v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); uint v_lightOffset = 0; while(v_lightOffset < v_lightCount ) { uint v_lightIdx = GetLightIdx(v_lightStart, v_lightOffset); uint s_lightIdx = WaveActiveMin(v_lightIdx); if(s_lightIdx == v_lightIdx) v_lightOffset++; LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); } s_lightIdx v_lightIdx 42 40 41 v_cellIdx A C B lightOffset A 42 44 45 46 B 41 C 40

72 40 A C B A 42 44 45 46 B 41 C 40 uint v_cellIdx = GetCellIdx();
{v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); uint v_lightOffset = 0; while(v_lightOffset < v_lightCount ) { uint v_lightIdx = GetLightIdx(v_lightStart, v_lightOffset); uint s_lightIdx = WaveActiveMin(v_lightIdx); if(s_lightIdx == v_lightIdx) v_lightOffset++; LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); } s_lightIdx 40 v_lightIdx 42 40 41 v_cellIdx A C B lightOffset A 42 44 45 46 B 41 C 40

73 40 A C B A 42 44 45 46 B 41 C 40 uint v_cellIdx = GetCellIdx();
{v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); uint v_lightOffset = 0; while(v_lightOffset < v_lightCount ) { uint v_lightIdx = GetLightIdx(v_lightStart, v_lightOffset); uint s_lightIdx = WaveActiveMin(v_lightIdx); if(s_lightIdx == v_lightIdx) v_lightOffset++; LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); } s_lightIdx 40 v_lightIdx 42 40 41 v_cellIdx A C B lightOffset A 42 44 45 46 B 41 C 40

74 40 A C B 1 A 42 44 45 46 B 41 C 40 uint v_cellIdx = GetCellIdx();
{v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); uint v_lightOffset = 0; while(v_lightOffset < v_lightCount ) { uint v_lightIdx = GetLightIdx(v_lightStart, v_lightOffset); uint s_lightIdx = WaveActiveMin(v_lightIdx); if(s_lightIdx == v_lightIdx) v_lightOffset++; LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); } s_lightIdx 40 v_lightIdx 42 40 41 v_cellIdx A C B lightOffset 1 A 42 44 45 46 B 41 C 40

75 40 A C B 1 A 42 44 45 46 B 41 C 40 uint v_cellIdx = GetCellIdx();
{v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); uint v_lightOffset = 0; while(v_lightOffset < v_lightCount ) { uint v_lightIdx = GetLightIdx(v_lightStart, v_lightOffset); uint s_lightIdx = WaveActiveMin(v_lightIdx); if(s_lightIdx == v_lightIdx) v_lightOffset++; LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); } s_lightIdx 40 v_lightIdx 42 40 41 v_cellIdx A C B lightOffset 1 A 42 44 45 46 B 41 C 40

76 A C B 1 A 42 44 45 46 B 41 C 40 uint v_cellIdx = GetCellIdx();
{v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); uint v_lightOffset = 0; while(v_lightOffset < v_lightCount ) { uint v_lightIdx = GetLightIdx(v_lightStart, v_lightOffset); uint s_lightIdx = WaveActiveMin(v_lightIdx); if(s_lightIdx == v_lightIdx) v_lightOffset++; LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); } s_lightIdx v_lightIdx 42 41 v_cellIdx A C B lightOffset 1 A 42 44 45 46 B 41 C 40

77 41 A C B 1 A 42 44 45 46 B 41 C 40 uint v_cellIdx = GetCellIdx();
{v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); uint v_lightOffset = 0; while(v_lightOffset < v_lightCount ) { uint v_lightIdx = GetLightIdx(v_lightStart, v_lightOffset); uint s_lightIdx = WaveActiveMin(v_lightIdx); if(s_lightIdx == v_lightIdx) v_lightOffset++; LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); } s_lightIdx 41 v_lightIdx 42 41 v_cellIdx A C B lightOffset 1 A 42 44 45 46 B 41 C 40

78 41 A C B 2 1 A 42 44 45 46 B 41 C 40 uint v_cellIdx = GetCellIdx();
{v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); uint v_lightOffset = 0; while(v_lightOffset < v_lightCount ) { uint v_lightIdx = GetLightIdx(v_lightStart, v_lightOffset); uint s_lightIdx = WaveActiveMin(v_lightIdx); if(s_lightIdx == v_lightIdx) v_lightOffset++; LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); } s_lightIdx 41 v_lightIdx 42 41 v_cellIdx A C B lightOffset 2 1 A 42 44 45 46 B 41 C 40

79 41 A C B 2 1 A 42 44 45 46 B 41 C 40 uint v_cellIdx = GetCellIdx();
{v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); uint v_lightOffset = 0; while(v_lightOffset < v_lightCount ) { uint v_lightIdx = GetLightIdx(v_lightStart, v_lightOffset); uint s_lightIdx = WaveActiveMin(v_lightIdx); if(s_lightIdx == v_lightIdx) v_lightOffset++; LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); } s_lightIdx 41 v_lightIdx 42 41 v_cellIdx A C B lightOffset 2 1 A 42 44 45 46 B 41 C 40

80 A C B 2 1 A 42 44 45 46 B 41 C 40 uint v_cellIdx = GetCellIdx();
{v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); uint v_lightOffset = 0; while(v_lightOffset < v_lightCount ) { uint v_lightIdx = GetLightIdx(v_lightStart, v_lightOffset); uint s_lightIdx = WaveActiveMin(v_lightIdx); if(s_lightIdx == v_lightIdx) v_lightOffset++; LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); } s_lightIdx v_lightIdx 42 v_cellIdx A C B lightOffset 2 1 A 42 44 45 46 B 41 C 40

81 42 A C B 2 1 A 42 44 45 46 B 41 C 40 uint v_cellIdx = GetCellIdx();
{v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); uint v_lightOffset = 0; while(v_lightOffset < v_lightCount ) { uint v_lightIdx = GetLightIdx(v_lightStart, v_lightOffset); uint s_lightIdx = WaveActiveMin(v_lightIdx); if(s_lightIdx == v_lightIdx) v_lightOffset++; LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); } s_lightIdx 42 v_lightIdx 42 v_cellIdx A C B lightOffset 2 1 A 42 44 45 46 B 41 C 40

82 42 A C B 2 1 A 42 44 45 46 B 41 C 40 uint v_cellIdx = GetCellIdx();
{v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); uint v_lightOffset = 0; while(v_lightOffset < v_lightCount ) { uint v_lightIdx = GetLightIdx(v_lightStart, v_lightOffset); uint s_lightIdx = WaveActiveMin(v_lightIdx); if(s_lightIdx == v_lightIdx) v_lightOffset++; LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); } s_lightIdx 42 v_lightIdx 42 v_cellIdx A C B lightOffset 2 1 A 42 44 45 46 B 41 C 40

83 42 A C B 1 3 2 A 42 44 45 46 B 41 C 40 uint v_cellIdx = GetCellIdx();
{v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); uint v_lightOffset = 0; while(v_lightOffset < v_lightCount ) { uint v_lightIdx = GetLightIdx(v_lightStart, v_lightOffset); uint s_lightIdx = WaveActiveMin(v_lightIdx); if(s_lightIdx == v_lightIdx) v_lightOffset++; LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); } s_lightIdx 42 v_lightIdx 42 v_cellIdx A C B lightOffset 1 3 2 A 42 44 45 46 B 41 C 40

84 42 A C B 1 3 2 A 42 44 45 46 B 41 C 40 uint v_cellIdx = GetCellIdx();
{v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); uint v_lightOffset = 0; while(v_lightOffset < v_lightCount ) { uint v_lightIdx = GetLightIdx(v_lightStart, v_lightOffset); uint s_lightIdx = WaveActiveMin(v_lightIdx); if(s_lightIdx == v_lightIdx) v_lightOffset++; LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); } s_lightIdx 42 v_lightIdx 42 v_cellIdx A C B lightOffset 1 3 2 A 42 44 45 46 B 41 C 40

85 A C B 1 3 2 A 42 44 45 46 B 41 C 40 uint v_cellIdx = GetCellIdx();
{v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); uint v_lightOffset = 0; while(v_lightOffset < v_lightCount ) { uint v_lightIdx = GetLightIdx(v_lightStart, v_lightOffset); uint s_lightIdx = WaveActiveMin(v_lightIdx); if(s_lightIdx == v_lightIdx) v_lightOffset++; LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); } s_lightIdx v_lightIdx 44 42 46 v_cellIdx A C B lightOffset 1 3 2 A 42 44 45 46 B 41 C 40

86 44 A C B 1 3 2 A 42 44 45 46 B 41 C 40 uint v_cellIdx = GetCellIdx();
{v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); uint v_lightOffset = 0; while(v_lightOffset < v_lightCount ) { uint v_lightIdx = GetLightIdx(v_lightStart, v_lightOffset); uint s_lightIdx = WaveActiveMin(v_lightIdx); if(s_lightIdx == v_lightIdx) v_lightOffset++; LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); } s_lightIdx 44 v_lightIdx 44 42 46 v_cellIdx A C B lightOffset 1 3 2 A 42 44 45 46 B 41 C 40

87 44 A C B 1 3 2 A 42 44 45 46 B 41 C 40 uint v_cellIdx = GetCellIdx();
{v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); uint v_lightOffset = 0; while(v_lightOffset < v_lightCount ) { uint v_lightIdx = GetLightIdx(v_lightStart, v_lightOffset); uint s_lightIdx = WaveActiveMin(v_lightIdx); if(s_lightIdx == v_lightIdx) v_lightOffset++; LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); } s_lightIdx 44 v_lightIdx 44 42 46 v_cellIdx A C B lightOffset 1 3 2 A 42 44 45 46 B 41 C 40

88 44 A C B 2 3 A 42 44 45 46 B 41 C 40 uint v_cellIdx = GetCellIdx();
{v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); uint v_lightOffset = 0; while(v_lightOffset < v_lightCount ) { uint v_lightIdx = GetLightIdx(v_lightStart, v_lightOffset); uint s_lightIdx = WaveActiveMin(v_lightIdx); if(s_lightIdx == v_lightIdx) v_lightOffset++; LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); } s_lightIdx 44 v_lightIdx 44 42 46 v_cellIdx A C B lightOffset 2 3 A 42 44 45 46 B 41 C 40

89 44 A C B 2 3 A 42 44 45 46 B 41 C 40 uint v_cellIdx = GetCellIdx();
{v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); uint v_lightOffset = 0; while(v_lightOffset < v_lightCount ) { uint v_lightIdx = GetLightIdx(v_lightStart, v_lightOffset); uint s_lightIdx = WaveActiveMin(v_lightIdx); if(s_lightIdx == v_lightIdx) v_lightOffset++; LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); } s_lightIdx 44 v_lightIdx 44 42 46 v_cellIdx A C B lightOffset 2 3 A 42 44 45 46 B 41 C 40

90 A C B 2 3 A 42 44 45 46 B 41 C 40 uint v_cellIdx = GetCellIdx();
{v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); uint v_lightOffset = 0; while(v_lightOffset < v_lightCount ) { uint v_lightIdx = GetLightIdx(v_lightStart, v_lightOffset); uint s_lightIdx = WaveActiveMin(v_lightIdx); if(s_lightIdx == v_lightIdx) v_lightOffset++; LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); } s_lightIdx v_lightIdx 45 42 46 v_cellIdx A C B lightOffset 2 3 A 42 44 45 46 B 41 C 40

91 45 A C B 2 3 A 42 44 45 46 B 41 C 40 uint v_cellIdx = GetCellIdx();
{v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); uint v_lightOffset = 0; while(v_lightOffset < v_lightCount ) { uint v_lightIdx = GetLightIdx(v_lightStart, v_lightOffset); uint s_lightIdx = WaveActiveMin(v_lightIdx); if(s_lightIdx == v_lightIdx) v_lightOffset++; LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); } s_lightIdx 45 v_lightIdx 45 42 46 v_cellIdx A C B lightOffset 2 3 A 42 44 45 46 B 41 C 40

92 45 A C B 2 3 A 42 44 45 46 B 41 C 40 uint v_cellIdx = GetCellIdx();
{v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); uint v_lightOffset = 0; while(v_lightOffset < v_lightCount ) { uint v_lightIdx = GetLightIdx(v_lightStart, v_lightOffset); uint s_lightIdx = WaveActiveMin(v_lightIdx); if(s_lightIdx == v_lightIdx) v_lightOffset++; LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); } s_lightIdx 45 v_lightIdx 45 42 46 v_cellIdx A C B lightOffset 2 3 A 42 44 45 46 B 41 C 40

93 45 A C B 3 2 A 42 44 45 46 B 41 C 40 uint v_cellIdx = GetCellIdx();
{v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); uint v_lightOffset = 0; while(v_lightOffset < v_lightCount ) { uint v_lightIdx = GetLightIdx(v_lightStart, v_lightOffset); uint s_lightIdx = WaveActiveMin(v_lightIdx); if(s_lightIdx == v_lightIdx) v_lightOffset++; LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); } s_lightIdx 45 v_lightIdx 45 42 46 v_cellIdx A C B lightOffset 3 2 A 42 44 45 46 B 41 C 40

94 45 A C B 3 2 A 42 44 45 46 B 41 C 40 uint v_cellIdx = GetCellIdx();
{v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); uint v_lightOffset = 0; while(v_lightOffset < v_lightCount ) { uint v_lightIdx = GetLightIdx(v_lightStart, v_lightOffset); uint s_lightIdx = WaveActiveMin(v_lightIdx); if(s_lightIdx == v_lightIdx) v_lightOffset++; LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); } s_lightIdx 45 v_lightIdx 45 42 46 v_cellIdx A C B lightOffset 3 2 A 42 44 45 46 B 41 C 40

95 A C B 3 2 A 42 44 45 46 B 41 C 40 uint v_cellIdx = GetCellIdx();
{v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); uint v_lightOffset = 0; while(v_lightOffset < v_lightCount ) { uint v_lightIdx = GetLightIdx(v_lightStart, v_lightOffset); uint s_lightIdx = WaveActiveMin(v_lightIdx); if(s_lightIdx == v_lightIdx) v_lightOffset++; LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); } s_lightIdx v_lightIdx 46 42 v_cellIdx A C B lightOffset 3 2 A 42 44 45 46 B 41 C 40

96 46 A C B 3 2 A 42 44 45 46 B 41 C 40 uint v_cellIdx = GetCellIdx();
{v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); uint v_lightOffset = 0; while(v_lightOffset < v_lightCount ) { uint v_lightIdx = GetLightIdx(v_lightStart, v_lightOffset); uint s_lightIdx = WaveActiveMin(v_lightIdx); if(s_lightIdx == v_lightIdx) v_lightOffset++; LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); } s_lightIdx 46 v_lightIdx 46 42 v_cellIdx A C B lightOffset 3 2 A 42 44 45 46 B 41 C 40

97 46 A C B 3 2 A 42 44 45 46 B 41 C 40 uint v_cellIdx = GetCellIdx();
{v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); uint v_lightOffset = 0; while(v_lightOffset < v_lightCount ) { uint v_lightIdx = GetLightIdx(v_lightStart, v_lightOffset); uint s_lightIdx = WaveActiveMin(v_lightIdx); if(s_lightIdx == v_lightIdx) v_lightOffset++; LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); } s_lightIdx 46 v_lightIdx 46 42 v_cellIdx A C B lightOffset 3 2 A 42 44 45 46 B 41 C 40

98 46 A C B 4 3 A 42 44 45 46 B 41 C 40 uint v_cellIdx = GetCellIdx();
{v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); uint v_lightOffset = 0; while(v_lightOffset < v_lightCount ) { uint v_lightIdx = GetLightIdx(v_lightStart, v_lightOffset); uint s_lightIdx = WaveActiveMin(v_lightIdx); if(s_lightIdx == v_lightIdx) v_lightOffset++; LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); } s_lightIdx 46 v_lightIdx 46 42 v_cellIdx A C B lightOffset 4 3 A 42 44 45 46 B 41 C 40

99 46 A C B 4 3 A 42 44 45 46 B 41 C 40 uint v_cellIdx = GetCellIdx();
{v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); uint v_lightOffset = 0; while(v_lightOffset < v_lightCount ) { uint v_lightIdx = GetLightIdx(v_lightStart, v_lightOffset); uint s_lightIdx = WaveActiveMin(v_lightIdx); if(s_lightIdx == v_lightIdx) v_lightOffset++; LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); } s_lightIdx 46 v_lightIdx 46 42 v_cellIdx A C B lightOffset 4 3 A 42 44 45 46 B 41 C 40

100 A C B 4 3 A 42 44 45 46 B 41 C 40 uint v_cellIdx = GetCellIdx();
{v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); uint v_lightOffset = 0; while(v_lightOffset < v_lightCount ) { uint v_lightIdx = GetLightIdx(v_lightStart, v_lightOffset); uint s_lightIdx = WaveActiveMin(v_lightIdx); if(s_lightIdx == v_lightIdx) v_lightOffset++; LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); } s_lightIdx v_lightIdx 46 42 v_cellIdx A C B lightOffset 4 3 A 42 44 45 46 B 41 C 40

101 A 42 44 45 46 B 41 C 40 uint v_cellIdx = GetCellIdx();
{v_lightStart, v_lightCount} = GetCellIndices(v_cellIdx); uint v_lightOffset = 0; while(v_lightOffset < v_lightCount ) { uint v_lightIdx = GetLightIdx(v_lightStart, v_lightOffset); uint s_lightIdx = WaveActiveMin(v_lightIdx); if(s_lightIdx == v_lightIdx) v_lightOffset++; LightData s_light = Lights[s_lightIdx]; ProcessLight(s_light); } … rest of the code … A 42 44 45 46 B 41 C 40


Download ppt "Lockstep execution."

Similar presentations


Ads by Google