pan/va: Unify flow control

Group together dependency waits and flow control into a single enum. This
simplifies the code, clarifies some detail, and ensures consistency moving
forward.

Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15223>
This commit is contained in:
Alyssa Rosenzweig 2022-03-24 18:08:27 -04:00 committed by Marge Bot
parent cf6d1a81f6
commit 76487c7eb4
4 changed files with 48 additions and 89 deletions

View file

@ -64,34 +64,41 @@
<constant desc="Half-float $\pi$">0x42480000</constant>
</lut>
<enum name="Action">
<enum name="Flow">
<desc>
Every Valhall instruction can perform an action, like wait on dependency
slots. A few special actions are available, specified in the instruction
metadata from this enum. The `wait0126` action is required to wait on
Every Valhall instruction can wait on dependency
slots. A few special flows are available, specified in the instruction
metadata from this enum. The `wait0126` flow is required to wait on
dependency slot #6 and should be set on the instruction immediately
preceding `ATEST`. The `barrier` action may be set on any instruction for
subgroup barriers, and should particularly be set with the `BARRIER`
instruction for global barriers. The `td` action only applies to fragment
shaders and is used to terminate helper invocations, it should be set as
early as possible after helper invocations are no longer needed as
determined by data flow analysis. The `return` action is used to terminate
the shader, although it may be overloaded by the `BLEND` instruction.
preceding `ATEST`. The `wait` flow should be set for barriers.
The `discard` flow only applies to fragment shaders and is used to
terminate helper invocations; it should be set as early as possible after
helper invocations are no longer needed, as determined by data flow
analysis. The `end` flow is used to terminate the shader, although it
may be overloaded by the `BLEND` instruction.
The `reconverge` action is required on any instruction immediately
The `reconverge` flow is required on any instruction immediately
preceding a possible change to the mask of active threads in a subgroup.
This includes all divergent branches, but it also includes the final
instruction at the end of any basic block where the immediate successor
(fallthrough) is the target of a divergent branch.
</desc>
<value name="Wait on all dependency slots">wait0126</value>
<value name="Subgroup barrier">barrier</value>
<value name="None" default="true">none</value>
<value name="Wait on slot 0">wait0</value>
<value name="Wait on slot 1">wait1</value>
<value name="Wait on slots 0, 1">wait01</value>
<value name="Wait on slot 2">wait2</value>
<value name="Wait on slots 0, 2">wait02</value>
<value name="Wait on slots 1, 2">wait12</value>
<value name="Wait on slots 0, 1, 2">wait012</value>
<value name="Wait on slots 0, 1, 2, 6">wait0126</value>
<value name="Wait on slots 0, 1, 2, 6, 7">wait</value>
<value name="Perform branch reconverge">reconverge</value>
<reserved/>
<reserved/>
<value name="Terminate discarded threads">td</value>
<value name="Terminate discarded threads">discard</value>
<reserved/>
<value name="Return from shader">return</value>
<value name="Return from shader">end</value>
</enum>
<enum name="FAU special page 0">
@ -799,7 +806,7 @@
<ins name="BARRIER" title="Execution and memory barrier" opcode="0x45" unit="NONE">
<desc>
General-purpose barrier. Must use slot #7. Must be paired with a
`.barrier` action on the instruction.
`.wait` flow on the instruction.
</desc>
<slot/>
</ins>

View file

@ -339,35 +339,15 @@ def parse_asm(line):
encoded |= (fau.page << 57)
# Encode modifiers
has_action = False
has_flow = False
for mod in mods:
if len(mod) == 0:
continue
if mod in enums['action'].bare_values:
die_if(has_action, "Multiple actions specified")
has_action = True
encoded |= (enums['action'].bare_values.index(mod) << 59)
encoded |= (1 << 62) # Action, not wait
elif mod.startswith('wait'):
die_if(has_action, "Multiple actions specified")
has_action = True
slots = mod[len('wait'):]
try:
slots = set([int(x) for x in slots])
except ValueError:
die(f"Expected slots in {mod}")
known_slots = set([0, 1, 2])
die_if(not slots.issubset(known_slots), f"Unknown slots in {mod}")
if 0 in slots:
encoded |= (1 << 59)
if 1 in slots:
encoded |= (1 << 60)
if 2 in slots:
encoded |= (1 << 61)
if mod in enums['flow'].bare_values:
die_if(has_flow, "Multiple flow control modifiers specified")
has_flow = True
encoded |= (enums['flow'].bare_values.index(mod) << 59)
else:
candidates = [c for c in ins.modifiers if mod in c.bare_values]

View file

@ -42,34 +42,6 @@ static const uint32_t va_immediates[32] = {
% endfor
};
/* Byte 7 has instruction metadata, analogous to Bifrost's clause header */
/* The five bit-fields below add up to exactly 8 bits (1 + 2 + 3 + 1 + 1), so
 * together with the packed attribute the struct is meant to overlay a single
 * byte. NOTE(review): which bit each field lands on depends on the compiler's
 * bit-field allocation order -- confirm for any new target. */
struct va_metadata {
/* Presumably an extra high bit of the opcode -- not consumed by the printer
 * in this file chunk; TODO confirm. */
bool opcode_high : 1;
/* Presumably selects how immediates are interpreted -- not consumed by the
 * printer in this file chunk; TODO confirm. */
unsigned immediate_mode : 2;
/* When do_action is set: index into the valhall_action name table.
 * Otherwise: bitmask of dependency slots to wait on (bit 0 = slot 0,
 * bit 1 = slot 1, bit 2 = slot 2); zero means no wait. */
unsigned action : 3;
/* Selects how the action field is interpreted (see above). */
bool do_action : 1;
/* Meaning unknown; surfaced by the disassembler as ".unk3" when set. */
bool unk3 : 1;
} __attribute__((packed));
/* Print the action/wait modifier encoded in an instruction's metadata byte.
 *
 * fp:   stream the modifier text is appended to.
 * meta: the metadata byte; the disassembler passes the top byte of the
 *       64-bit instruction word (instr >> 56).
 *
 * The fields are pulled out with explicit shifts and masks rather than by
 * copying the byte over a bit-field struct, because the placement of
 * bit-fields within a storage unit is implementation-defined. The positions
 * used here correspond to bits 59-61 (action) and bit 62 (action-vs-wait
 * select) of the full instruction word, i.e. bits 3-5 and bit 6 of this byte,
 * with the remaining top bit being the unknown "unk3" flag.
 */
static inline void
va_print_metadata(FILE *fp, uint8_t meta)
{
   unsigned action = (meta >> 3) & 0x7;
   unsigned do_action = (meta >> 6) & 0x1;
   unsigned unk3 = (meta >> 7) & 0x1;

   if (do_action) {
      /* Named action: the 3-bit field indexes the generated name table */
      fputs(valhall_action[action], fp);
   } else if (action) {
      /* Plain wait: each set bit names one dependency slot to wait on */
      fprintf(fp, ".wait%s%s%s",
              action & (1 << 0) ? "0" : "",
              action & (1 << 1) ? "1" : "",
              action & (1 << 2) ? "2" : "");
   }

   /* Keep unknown bits visible in the disassembly instead of dropping them */
   if (unk3)
      fprintf(fp, ".unk3");
}
static inline void
va_print_src(FILE *fp, uint8_t src, unsigned fau_page)
{
@ -153,8 +125,8 @@ va_disasm_instr(FILE *fp, uint64_t instr)
% endif
% endif
% endfor
va_print_metadata(fp, instr >> 56);
fputs(" ", fp);
assert((instr & (1ull << 63)) == 0 /* reserved */);
fprintf(fp, "%s ", valhall_flow[instr >> 59]);
% if len(op.dests) > 0:
<% no_comma = False %>
va_print_dest(fp, (instr >> 40), true);

View file

@ -28,8 +28,8 @@ e6 00 00 00 00 c1 91 06 MOV.i32 r1, core_id.w0
82 3c 27 20 00 c0 a3 01 SHADDX.u64 r0, u2, r60.w0, shift:0x2
40 00 00 18 82 80 60 08 LOAD.i32.unsigned.slot0.wait0 @r0, `r0, offset:0
80 7c 47 20 00 c0 a3 01 SHADDX.u64 r0, u0, `r60.w0, shift:0x4
40 00 00 38 08 44 61 78 STORE.i128.slot0.return @r4:r5:r6:r7, `r0, offset:0
00 00 00 00 00 c0 00 78 NOP.return
40 00 00 38 08 44 61 78 STORE.i128.slot0.end @r4:r5:r6:r7, `r0, offset:0
00 00 00 00 00 c0 00 78 NOP.end
40 c4 c0 9c 01 c1 f0 00 ICMP.u32.gt.m1 r1, `r0, 0x1000000.b3, 0x0
42 00 00 18 02 40 61 50 STORE.i32.slot0.reconverge @r0, `r2, offset:0
00 c9 8f 12 30 c0 a0 00 CLPER.i32.f1 r0, r0, 0x7060504.b0
@ -46,18 +46,18 @@ e6 00 00 00 00 c1 91 06 MOV.i32 r1, core_id.w0
40 00 0b 10 00 c3 90 00 F16_TO_F32 r3, `r0.h1
00 00 00 00 00 c0 00 40 NOP.wait0126
42 43 04 00 00 c0 a5 00 V2F32_TO_V2F16 r0, `r2, `r3
40 c0 00 28 90 c0 a5 48 FADD.v2f16.barrier r0, `r0.abs, 0x0.neg
40 c0 00 28 90 c0 a5 48 FADD.v2f16.wait r0, `r0.abs, 0x0.neg
c0 00 00 00 00 f6 10 01 IADD_IMM.i32 r54, 0x0, #0x0
3c d0 ea 00 02 bc 7d 68 ATEST.td @r60, r60, 0x3F800000, atest_datum.w0
3c d0 ea 00 02 bc 7d 68 ATEST.discard @r60, r60, 0x3F800000, atest_datum.w0
40 db 05 04 00 c1 a1 00 MKVEC.v2i16 r1, `r0.h00, 0x3C000000.h10
f0 00 3c 33 04 40 7f 78 BLEND.slot0.v4.f16.return @r0:r1, blend_descriptor_0.w0, r60, target:0x0
f0 00 3c 33 04 40 7f 78 BLEND.slot0.v4.f16.end @r0:r1, blend_descriptor_0.w0, r60, target:0x0
7b 0d 00 40 04 84 5e 08 LEA_BUF_IMM.slot1.wait0 @r4:r5, `r59, table:0xD, index:0x0
00 dd c0 08 14 c2 b2 00 FMA.f32 r2, r0, 0x44000000.neg.h1, 0x0.neg
41 88 c0 00 04 c1 b2 00 FMA.f32 r1, `r1, u8, 0x0.neg
40 88 c0 00 04 c0 b2 10 FMA.f32.wait1 r0, `r0, u8, 0x0.neg
44 00 00 32 06 40 61 78 STORE.i96.estream.slot0.return @r0:r1:r2, `r4, offset:0
44 00 00 39 08 48 61 78 STORE.i128.istream.slot0.return @r8:r9:r10:r11, `r4, offset:0
00 00 00 c0 01 c0 45 48 BARRIER.slot7.barrier
44 00 00 32 06 40 61 78 STORE.i96.estream.slot0.end @r0:r1:r2, `r4, offset:0
44 00 00 39 08 48 61 78 STORE.i128.istream.slot0.end @r8:r9:r10:r11, `r4, offset:0
00 00 00 c0 01 c0 45 48 BARRIER.slot7.wait
80 00 00 00 82 82 60 00 LOAD.i8.unsigned.slot0 @r2, u0, offset:0
80 00 00 08 82 82 60 00 LOAD.i16.unsigned.slot0 @r2, u0, offset:0
80 00 00 10 82 82 60 00 LOAD.i24.unsigned.slot0 @r2, u0, offset:0
@ -106,9 +106,9 @@ c0 01 00 00 00 c4 10 51 IADD_IMM.i32.reconverge r4, 0x0, #0x1
42 00 00 38 08 44 61 00 STORE.i128.slot0 @r4:r5:r6:r7, `r2, offset:0
41 f8 ff ff 07 c0 1f 50 BRANCHZ.reconverge `r1, offset:-8
7d c0 00 08 10 bc a1 00 IADD.v2u16 r60.h1, `r61.h10, 0x0
44 00 46 32 28 40 71 78 ST_CVT.slot0.istream.v4.f32.return @r0:r1:r2:r3, `r4, `r6, offset:0x0
44 00 46 34 28 40 71 78 ST_CVT.slot0.istream.v4.s32.return @r0:r1:r2:r3, `r4, `r6, offset:0x0
44 00 46 36 28 40 71 78 ST_CVT.slot0.istream.v4.u32.return @r0:r1:r2:r3, `r4, `r6, offset:0x0
44 00 46 32 28 40 71 78 ST_CVT.slot0.istream.v4.f32.end @r0:r1:r2:r3, `r4, `r6, offset:0x0
44 00 46 34 28 40 71 78 ST_CVT.slot0.istream.v4.s32.end @r0:r1:r2:r3, `r4, `r6, offset:0x0
44 00 46 36 28 40 71 78 ST_CVT.slot0.istream.v4.u32.end @r0:r1:r2:r3, `r4, `r6, offset:0x0
7c c0 12 00 26 84 67 00 LEA_TEX_IMM.slot0 @r4:r5:r6, `r60, 0x0, table:0x2, index:0x1
7c c0 02 00 26 84 67 00 LEA_TEX_IMM.slot0 @r4:r5:r6, `r60, 0x0, table:0x2, index:0x0
82 81 00 28 f4 82 6a 00 LD_BUFFER.i64.unsigned.slot0 @r2:r3, u2, u1
@ -123,7 +123,7 @@ c0 01 00 00 00 c4 10 51 IADD_IMM.i32.reconverge r4, 0x0, #0x1
40 44 80 00 01 c0 b8 00 MUX.i32 r0, `r0, `r4, u0
40 44 80 00 02 c0 b8 00 MUX.i32.fp_zero r0, `r0, `r4, u0
40 44 80 00 03 c0 b8 00 MUX.i32.bit r0, `r0, `r4, u0
00 00 00 01 00 c1 99 68 FREXPM.f32.sqrt.td r1, r0
00 00 00 01 00 c1 99 68 FREXPM.f32.sqrt.discard r1, r0
01 00 02 00 00 c2 9c 00 FRSQ.f32 r2, r1
40 00 02 01 00 c0 99 00 FREXPE.f32.sqrt r0, `r0
41 42 c0 40 04 c0 62 41 FMA_RSCALE_LEFT.f32.wait0126 r0, `r1, `r2, 0x0.neg, `r0
@ -180,14 +180,14 @@ c0 77 01 0c 00 c2 a8 00 ISUB.s32 r2, 0x0, `r55.h1
00 00 03 00 20 c1 90 00 V2S8_TO_V2F16 r1, r0.b20
40 00 03 00 60 c0 90 00 V2S8_TO_V2F16 r0, `r0.b21
3d 00 00 b2 88 80 5c 68 LD_VAR_BUF_IMM.f32.slot2.v4.src_f32.sample.store.td @r0:r1:r2:r3, r61, index:0x0
3d 00 00 b2 88 80 5c 68 LD_VAR_BUF_IMM.f32.slot2.v4.src_f32.sample.store.discard @r0:r1:r2:r3, r61, index:0x0
3d 00 10 72 18 84 5c 00 LD_VAR_BUF_IMM.f32.slot1.v4.src_f32.center.retrieve @r4:r5:r6:r7, r61, index:0x10
c0 00 00 00 00 c8 10 01 IADD_IMM.i32 r8, 0x0, #0x0
c0 00 00 00 00 c9 10 01 IADD_IMM.i32 r9, 0x0, #0x0
3d 00 14 00 00 ca 90 00 U16_TO_U32 r10, r61.h00
3d 09 00 00 30 c0 1f 50 BRANCHZ.eq.reconverge r61.h0, offset:9
0a 00 00 00 00 cb 91 50 MOV.i32.reconverge r11, r10
00 00 00 00 00 c0 00 48 NOP.barrier
00 00 00 00 00 c0 00 48 NOP.wait
81 0b 80 33 04 8e 78 00 LD_TILE.v4.f16.slot0 @r14:r15, u1, r11, u0
0b 00 04 00 00 cc 91 00 CLZ.u32 r12, r11
82 4c c0 52 00 cc b4 00 RSHIFT_XOR.i32.not_result r12, u2, `r12.b00, 0x0
@ -202,15 +202,15 @@ c0 00 00 00 00 c9 10 01 IADD_IMM.i32 r9, 0x0, #0x0
49 3e c0 22 04 c9 b3 30 FMA.v2f16.wait12 r9, `r9, r62.h00, 0x0.neg
47 43 00 00 00 c3 a4 00 FADD.f32 r3, `r7, `r3
43 09 00 08 00 c3 a4 40 FADD.f32.wait0126 r3, `r3, r9.h1
3c 03 ea 00 02 bc 7d 68 ATEST.td @r60, r60, r3, atest_datum.w0
3c 03 ea 00 02 bc 7d 68 ATEST.discard @r60, r60, r3, atest_datum.w0
46 42 00 00 00 c2 a4 00 FADD.f32 r2, `r6, `r2
44 40 00 00 00 c0 a4 00 FADD.f32 r0, `r4, `r0
48 7e c0 22 04 ff b3 00 FMA.v2f16 r63, `r8, `r62.h00, 0x0.neg
45 41 00 00 00 c1 a4 00 FADD.f32 r1, `r5, `r1
41 3f 00 08 00 c1 a4 00 FADD.f32 r1, `r1, r63.h1
40 7f 00 04 00 c0 a4 00 FADD.f32 r0, `r0, `r63.h0
42 49 00 04 00 c2 a4 48 FADD.f32.barrier r2, `r2, `r9.h0
f0 00 3c 32 08 40 7f 78 BLEND.slot0.v4.f32.return @r0:r1:r2:r3, blend_descriptor_0.w0, r60, target:0x0
42 49 00 04 00 c2 a4 48 FADD.f32.wait r2, `r2, `r9.h0
f0 00 3c 32 08 40 7f 78 BLEND.slot0.v4.f32.end @r0:r1:r2:r3, blend_descriptor_0.w0, r60, target:0x0
c0 00 00 00 00 f6 10 01 IADD_IMM.i32 r54, 0x0, #0x0
c0 f1 00 00 10 c1 2f 08 BRANCHZI.eq.absolute.wait0 0x0, blend_descriptor_0.w1
80 00 c0 17 34 7c 25 01 TEX_FETCH.slot0.f.32.2d @r0:r1:r2:r3, @r60:r61, u0