mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-21 22:20:14 +01:00
i965/fs: Add empirically-determined instruction latencies for gen7.
v2: Actually switch on the other math instructions mentioned in the
comment.
v3: Add timing data for textureSize(), and clean up some long comment
lines.
Testing shader_time of fs16 shaders on a few frames of various apps:
nexuiz improved by 2.9% +/- 1.5% (n=10)
no difference on GLB2.5 (n=36, outliers removed)
no difference on GLB2.7 (n=25)
etqw improved by 2.6% +/- 2.2% (n=25)
no difference on lightsmark (n=25)
Acked-by: Kenneth Graunke <kenneth@whitecape.org>
This commit is contained in:
parent
4df1e18864
commit
2cae9f2d4a
1 changed file with 179 additions and 3 deletions
|
|
@ -57,7 +57,7 @@ static bool debug = false;
|
|||
class schedule_node : public exec_node
|
||||
{
|
||||
public:
|
||||
schedule_node(fs_inst *inst)
|
||||
schedule_node(fs_inst *inst, int gen)
|
||||
{
|
||||
this->inst = inst;
|
||||
this->child_array_size = 0;
|
||||
|
|
@ -67,10 +67,14 @@ public:
|
|||
this->parent_count = 0;
|
||||
this->unblocked_time = 0;
|
||||
|
||||
set_latency_gen4();
|
||||
if (gen >= 7)
|
||||
set_latency_gen7();
|
||||
else
|
||||
set_latency_gen4();
|
||||
}
|
||||
|
||||
void set_latency_gen4();
|
||||
void set_latency_gen7();
|
||||
|
||||
fs_inst *inst;
|
||||
schedule_node **children;
|
||||
|
|
@ -120,6 +124,178 @@ schedule_node::set_latency_gen4()
|
|||
}
|
||||
}
|
||||
|
||||
void
|
||||
schedule_node::set_latency_gen7()
|
||||
{
|
||||
switch (inst->opcode) {
|
||||
case BRW_OPCODE_MAD:
|
||||
/* 3 cycles (this is said to be 4 cycles sometimes depending on the
|
||||
* register numbers in the sources):
|
||||
* mad(8) g4<1>F g2.2<4,1,1>F.x g2<4,1,1>F.x g2.1<4,1,1>F.x { align16 WE_normal 1Q };
|
||||
*
|
||||
* 20 cycles:
|
||||
* mad(8) g4<1>F g2.2<4,1,1>F.x g2<4,1,1>F.x g2.1<4,1,1>F.x { align16 WE_normal 1Q };
|
||||
* mov(8) null g4<4,4,1>F { align16 WE_normal 1Q };
|
||||
*/
|
||||
latency = 17;
|
||||
break;
|
||||
|
||||
case SHADER_OPCODE_RCP:
|
||||
case SHADER_OPCODE_RSQ:
|
||||
case SHADER_OPCODE_SQRT:
|
||||
case SHADER_OPCODE_LOG2:
|
||||
case SHADER_OPCODE_EXP2:
|
||||
case SHADER_OPCODE_SIN:
|
||||
case SHADER_OPCODE_COS:
|
||||
/* 2 cycles:
|
||||
* math inv(8) g4<1>F g2<0,1,0>F null { align1 WE_normal 1Q };
|
||||
*
|
||||
* 18 cycles:
|
||||
* math inv(8) g4<1>F g2<0,1,0>F null { align1 WE_normal 1Q };
|
||||
* mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
|
||||
*
|
||||
* Same for exp2, log2, rsq, sqrt, sin, cos.
|
||||
*/
|
||||
latency = 16;
|
||||
break;
|
||||
|
||||
case SHADER_OPCODE_POW:
|
||||
/* 2 cycles:
|
||||
* math pow(8) g4<1>F g2<0,1,0>F g2.1<0,1,0>F { align1 WE_normal 1Q };
|
||||
*
|
||||
* 26 cycles:
|
||||
* math pow(8) g4<1>F g2<0,1,0>F g2.1<0,1,0>F { align1 WE_normal 1Q };
|
||||
* mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
|
||||
*/
|
||||
latency = 24;
|
||||
break;
|
||||
|
||||
case SHADER_OPCODE_TEX:
|
||||
case SHADER_OPCODE_TXD:
|
||||
case SHADER_OPCODE_TXF:
|
||||
case SHADER_OPCODE_TXL:
|
||||
/* 18 cycles:
|
||||
* mov(8) g115<1>F 0F { align1 WE_normal 1Q };
|
||||
* mov(8) g114<1>F 0F { align1 WE_normal 1Q };
|
||||
* send(8) g4<1>UW g114<8,8,1>F
|
||||
* sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
|
||||
*
|
||||
* 697 +/-49 cycles (min 610, n=26):
|
||||
* mov(8) g115<1>F 0F { align1 WE_normal 1Q };
|
||||
* mov(8) g114<1>F 0F { align1 WE_normal 1Q };
|
||||
* send(8) g4<1>UW g114<8,8,1>F
|
||||
* sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
|
||||
* mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
|
||||
*
|
||||
* So the latency on our first texture load of the batchbuffer takes
|
||||
* ~700 cycles, since the caches are cold at that point.
|
||||
*
|
||||
* 840 +/- 92 cycles (min 720, n=25):
|
||||
* mov(8) g115<1>F 0F { align1 WE_normal 1Q };
|
||||
* mov(8) g114<1>F 0F { align1 WE_normal 1Q };
|
||||
* send(8) g4<1>UW g114<8,8,1>F
|
||||
* sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
|
||||
* mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
|
||||
* send(8) g4<1>UW g114<8,8,1>F
|
||||
* sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
|
||||
* mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
|
||||
*
|
||||
* On the second load, it takes just an extra ~140 cycles, and after
|
||||
* accounting for the 14 cycles of the MOV's latency, that makes ~130.
|
||||
*
|
||||
* 683 +/- 49 cycles (min = 602, n=47):
|
||||
* mov(8) g115<1>F 0F { align1 WE_normal 1Q };
|
||||
* mov(8) g114<1>F 0F { align1 WE_normal 1Q };
|
||||
* send(8) g4<1>UW g114<8,8,1>F
|
||||
* sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
|
||||
* send(8) g50<1>UW g114<8,8,1>F
|
||||
* sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
|
||||
* mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
|
||||
*
|
||||
* The unit appears to be pipelined, since this matches up with the
|
||||
* cache-cold case, despite there being two loads here. If you replace
|
||||
* the g4 in the MOV to null with g50, it's still 693 +/- 52 (n=39).
|
||||
*
|
||||
* So, take some number between the cache-hot 140 cycles and the
|
||||
* cache-cold 700 cycles. No particular tuning was done on this.
|
||||
*
|
||||
* I haven't done significant testing of the non-TEX opcodes. TXL at
|
||||
* least looked about the same as TEX.
|
||||
*/
|
||||
latency = 200;
|
||||
break;
|
||||
|
||||
case SHADER_OPCODE_TXS:
|
||||
/* Testing textureSize(sampler2D, 0), one load was 420 +/- 41
|
||||
* cycles (n=15):
|
||||
* mov(8) g114<1>UD 0D { align1 WE_normal 1Q };
|
||||
* send(8) g6<1>UW g114<8,8,1>F
|
||||
* sampler (10, 0, 10, 1) mlen 1 rlen 4 { align1 WE_normal 1Q };
|
||||
* mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1Q };
|
||||
*
|
||||
*
|
||||
* Two loads was 535 +/- 30 cycles (n=19):
|
||||
* mov(16) g114<1>UD 0D { align1 WE_normal 1H };
|
||||
* send(16) g6<1>UW g114<8,8,1>F
|
||||
* sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H };
|
||||
* mov(16) g114<1>UD 0D { align1 WE_normal 1H };
|
||||
* mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1H };
|
||||
* send(16) g8<1>UW g114<8,8,1>F
|
||||
* sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H };
|
||||
* mov(16) g8<1>F g8<8,8,1>D { align1 WE_normal 1H };
|
||||
* add(16) g6<1>F g6<8,8,1>F g8<8,8,1>F { align1 WE_normal 1H };
|
||||
*
|
||||
* Since the only caches that should matter are just the
|
||||
* instruction/state cache containing the surface state, assume that we
|
||||
* always have hot caches.
|
||||
*/
|
||||
latency = 100;
|
||||
break;
|
||||
|
||||
case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
|
||||
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
|
||||
/* testing using varying-index pull constants:
|
||||
*
|
||||
* 16 cycles:
|
||||
* mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
|
||||
* send(8) g4<1>F g4<8,8,1>D
|
||||
* data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
|
||||
*
|
||||
* ~480 cycles:
|
||||
* mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
|
||||
* send(8) g4<1>F g4<8,8,1>D
|
||||
* data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
|
||||
* mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
|
||||
*
|
||||
* ~620 cycles:
|
||||
* mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
|
||||
* send(8) g4<1>F g4<8,8,1>D
|
||||
* data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
|
||||
* mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
|
||||
* send(8) g4<1>F g4<8,8,1>D
|
||||
* data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
|
||||
* mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
|
||||
*
|
||||
* So, if it's cache-hot, it's about 140. If it's cache cold, it's
|
||||
* about 460. We expect to mostly be cache hot, so pick something more
|
||||
* in that direction.
|
||||
*/
|
||||
latency = 200;
|
||||
break;
|
||||
|
||||
default:
|
||||
/* 2 cycles:
|
||||
* mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal 1Q };
|
||||
*
|
||||
* 16 cycles:
|
||||
* mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal 1Q };
|
||||
* mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
|
||||
*/
|
||||
latency = 14;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
class instruction_scheduler {
|
||||
public:
|
||||
instruction_scheduler(fs_visitor *v, void *mem_ctx, int grf_count,
|
||||
|
|
@ -159,7 +335,7 @@ public:
|
|||
void
|
||||
instruction_scheduler::add_inst(fs_inst *inst)
|
||||
{
|
||||
schedule_node *n = new(mem_ctx) schedule_node(inst);
|
||||
schedule_node *n = new(mem_ctx) schedule_node(inst, v->intel->gen);
|
||||
|
||||
assert(!inst->is_head_sentinel());
|
||||
assert(!inst->is_tail_sentinel());
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue