From 76487c7eb48dedf973037d5285a09bd6ca9fe349 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Thu, 24 Mar 2022 18:08:27 -0400 Subject: [PATCH] pan/va: Unify flow control Group together dependency waits and flow control into a single enum. This simplifies the code, clarifies some detail, and ensures consistency moving forward. Signed-off-by: Alyssa Rosenzweig Part-of: --- src/panfrost/bifrost/valhall/ISA.xml | 41 +++++++++++-------- src/panfrost/bifrost/valhall/asm.py | 30 +++----------- src/panfrost/bifrost/valhall/disasm.py | 32 +-------------- .../bifrost/valhall/test/assembler-cases.txt | 34 +++++++-------- 4 files changed, 48 insertions(+), 89 deletions(-) diff --git a/src/panfrost/bifrost/valhall/ISA.xml b/src/panfrost/bifrost/valhall/ISA.xml index c645f7a8ea1..e276d6311f5 100644 --- a/src/panfrost/bifrost/valhall/ISA.xml +++ b/src/panfrost/bifrost/valhall/ISA.xml @@ -64,34 +64,41 @@ 0x42480000 - + - Every Valhall instruction can perform an action, like wait on dependency - slots. A few special actions are available, specified in the instruction - metadata from this enum. The `wait0126` action is required to wait on + Every Valhall instruction can wait on dependency + slots. A few special flows are available, specified in the instruction + metadata from this enum. The `wait0126` flow is required to wait on dependency slot #6 and should be set on the instruction immediately - preceding `ATEST`. The `barrier` action may be set on any instruction for - subgroup barriers, and should particularly be set with the `BARRIER` - instruction for global barriers. The `td` action only applies to fragment - shaders and is used to terminate helper invocations, it should be set as - early as possible after helper invocations are no longer needed as - determined by data flow analysis. The `return` action is used to terminate - the shader, although it may be overloaded by the `BLEND` instruction. + preceding `ATEST`. The `wait` flow should be set for barriers. + The `discard` flow only applies to fragment shaders and is used to + terminate helper invocations, it should be set as early as possible after + helper invocations are no longer needed as determined by data flow + analysis. The `end` flow is used to terminate the shader, although it + may be overloaded by the `BLEND` instruction. - The `reconverge` action is required on any instruction immediately + The `reconverge` flow is required on any instruction immediately preceding a possible change to the mask of active threads in a subgroup. This includes all divergent branches, but it also includes the final instruction at the end of any basic block where the immediate successor (fallthrough) is the target of a divergent branch. - wait0126 - barrier + none + wait0 + wait1 + wait01 + wait2 + wait02 + wait12 + wait012 + wait0126 + wait reconverge - td + discard - return + end @@ -799,7 +806,7 @@ General-purpose barrier. Must use slot #7. Must be paired with a - `.barrier` action on the instruction. + `.wait` flow on the instruction. diff --git a/src/panfrost/bifrost/valhall/asm.py b/src/panfrost/bifrost/valhall/asm.py index 34b81331dca..fbaedf83905 100644 --- a/src/panfrost/bifrost/valhall/asm.py +++ b/src/panfrost/bifrost/valhall/asm.py @@ -339,35 +339,15 @@ def parse_asm(line): encoded |= (fau.page << 57) # Encode modifiers - has_action = False + has_flow = False for mod in mods: if len(mod) == 0: continue - if mod in enums['action'].bare_values: - die_if(has_action, "Multiple actions specified") - has_action = True - encoded |= (enums['action'].bare_values.index(mod) << 59) - encoded |= (1 << 62) # Action, not wait - elif mod.startswith('wait'): - die_if(has_action, "Multiple actions specified") - has_action = True - - slots = mod[len('wait'):] - try: - slots = set([int(x) for x in slots]) - except ValueError: - die(f"Expected slots in {mod}") - - known_slots = set([0, 1, 2]) - die_if(not slots.issubset(known_slots), f"Unknown slots in {mod}") - - if 0 in slots: - encoded |= (1 << 59) - if 1 in slots: - encoded |= (1 << 60) - if 2 in slots: - encoded |= (1 << 61) + if mod in enums['flow'].bare_values: + die_if(has_flow, "Multiple flow control modifiers specified") + has_flow = True + encoded |= (enums['flow'].bare_values.index(mod) << 59) else: candidates = [c for c in ins.modifiers if mod in c.bare_values] diff --git a/src/panfrost/bifrost/valhall/disasm.py b/src/panfrost/bifrost/valhall/disasm.py index 30400b30564..30c98b4a94d 100644 --- a/src/panfrost/bifrost/valhall/disasm.py +++ b/src/panfrost/bifrost/valhall/disasm.py @@ -42,34 +42,6 @@ static const uint32_t va_immediates[32] = { % endfor }; -/* Byte 7 has instruction metadata, analogous to Bifrost's clause header */ -struct va_metadata { - bool opcode_high : 1; - unsigned immediate_mode : 2; - unsigned action : 3; - bool do_action : 1; - bool unk3 : 1; -} __attribute__((packed)); - -static inline void -va_print_metadata(FILE *fp, uint8_t meta) -{ - struct va_metadata m; - memcpy(&m, &meta, 1); - - if (m.do_action) { - fputs(valhall_action[m.action], fp); - } else if (m.action) { - fprintf(fp, ".wait%s%s%s", - m.action & (1 << 0) ? "0" : "", - m.action & (1 << 1) ? "1" : "", - m.action & (1 << 2) ? "2" : ""); - } - - if (m.unk3) - fprintf(fp, ".unk3"); -} - static inline void va_print_src(FILE *fp, uint8_t src, unsigned fau_page) { @@ -153,8 +125,8 @@ va_disasm_instr(FILE *fp, uint64_t instr) % endif % endif % endfor - va_print_metadata(fp, instr >> 56); - fputs(" ", fp); + assert((instr & (1ull << 63)) == 0 /* reserved */); + fprintf(fp, "%s ", valhall_flow[instr >> 59]); % if len(op.dests) > 0: <% no_comma = False %> va_print_dest(fp, (instr >> 40), true); diff --git a/src/panfrost/bifrost/valhall/test/assembler-cases.txt b/src/panfrost/bifrost/valhall/test/assembler-cases.txt index 2d82e370fdc..2aebbe2460e 100644 --- a/src/panfrost/bifrost/valhall/test/assembler-cases.txt +++ b/src/panfrost/bifrost/valhall/test/assembler-cases.txt @@ -28,8 +28,8 @@ e6 00 00 00 00 c1 91 06 MOV.i32 r1, core_id.w0 82 3c 27 20 00 c0 a3 01 SHADDX.u64 r0, u2, r60.w0, shift:0x2 40 00 00 18 82 80 60 08 LOAD.i32.unsigned.slot0.wait0 @r0, `r0, offset:0 80 7c 47 20 00 c0 a3 01 SHADDX.u64 r0, u0, `r60.w0, shift:0x4 -40 00 00 38 08 44 61 78 STORE.i128.slot0.return @r4:r5:r6:r7, `r0, offset:0 -00 00 00 00 00 c0 00 78 NOP.return +40 00 00 38 08 44 61 78 STORE.i128.slot0.end @r4:r5:r6:r7, `r0, offset:0 +00 00 00 00 00 c0 00 78 NOP.end 40 c4 c0 9c 01 c1 f0 00 ICMP.u32.gt.m1 r1, `r0, 0x1000000.b3, 0x0 42 00 00 18 02 40 61 50 STORE.i32.slot0.reconverge @r0, `r2, offset:0 00 c9 8f 12 30 c0 a0 00 CLPER.i32.f1 r0, r0, 0x7060504.b0 @@ -46,18 +46,18 @@ e6 00 00 00 00 c1 91 06 MOV.i32 r1, core_id.w0 40 00 0b 10 00 c3 90 00 F16_TO_F32 r3, `r0.h1 00 00 00 00 00 c0 00 40 NOP.wait0126 42 43 04 00 00 c0 a5 00 V2F32_TO_V2F16 r0, `r2, `r3 -40 c0 00 28 90 c0 a5 48 FADD.v2f16.barrier r0, `r0.abs, 0x0.neg +40 c0 00 28 90 c0 a5 48 FADD.v2f16.wait r0, `r0.abs, 0x0.neg c0 00 00 00 00 f6 10 01 IADD_IMM.i32 r54, 0x0, #0x0 -3c d0 ea 00 02 bc 7d 68 ATEST.td @r60, r60, 0x3F800000, atest_datum.w0 +3c d0 ea 00 02 bc 7d 68 ATEST.discard @r60, r60, 0x3F800000, atest_datum.w0 40 db 05 04 00 c1 a1 00 MKVEC.v2i16 r1, `r0.h00, 0x3C000000.h10 -f0 00 3c 33 04 40 7f 78 BLEND.slot0.v4.f16.return @r0:r1, blend_descriptor_0.w0, r60, target:0x0 +f0 00 3c 33 04 40 7f 78 BLEND.slot0.v4.f16.end @r0:r1, blend_descriptor_0.w0, r60, target:0x0 7b 0d 00 40 04 84 5e 08 LEA_BUF_IMM.slot1.wait0 @r4:r5, `r59, table:0xD, index:0x0 00 dd c0 08 14 c2 b2 00 FMA.f32 r2, r0, 0x44000000.neg.h1, 0x0.neg 41 88 c0 00 04 c1 b2 00 FMA.f32 r1, `r1, u8, 0x0.neg 40 88 c0 00 04 c0 b2 10 FMA.f32.wait1 r0, `r0, u8, 0x0.neg -44 00 00 32 06 40 61 78 STORE.i96.estream.slot0.return @r0:r1:r2, `r4, offset:0 -44 00 00 39 08 48 61 78 STORE.i128.istream.slot0.return @r8:r9:r10:r11, `r4, offset:0 -00 00 00 c0 01 c0 45 48 BARRIER.slot7.barrier +44 00 00 32 06 40 61 78 STORE.i96.estream.slot0.end @r0:r1:r2, `r4, offset:0 +44 00 00 39 08 48 61 78 STORE.i128.istream.slot0.end @r8:r9:r10:r11, `r4, offset:0 +00 00 00 c0 01 c0 45 48 BARRIER.slot7.wait 80 00 00 00 82 82 60 00 LOAD.i8.unsigned.slot0 @r2, u0, offset:0 80 00 00 08 82 82 60 00 LOAD.i16.unsigned.slot0 @r2, u0, offset:0 80 00 00 10 82 82 60 00 LOAD.i24.unsigned.slot0 @r2, u0, offset:0 @@ -106,9 +106,9 @@ c0 01 00 00 00 c4 10 51 IADD_IMM.i32.reconverge r4, 0x0, #0x1 42 00 00 38 08 44 61 00 STORE.i128.slot0 @r4:r5:r6:r7, `r2, offset:0 41 f8 ff ff 07 c0 1f 50 BRANCHZ.reconverge `r1, offset:-8 7d c0 00 08 10 bc a1 00 IADD.v2u16 r60.h1, `r61.h10, 0x0 -44 00 46 32 28 40 71 78 ST_CVT.slot0.istream.v4.f32.return @r0:r1:r2:r3, `r4, `r6, offset:0x0 -44 00 46 34 28 40 71 78 ST_CVT.slot0.istream.v4.s32.return @r0:r1:r2:r3, `r4, `r6, offset:0x0 -44 00 46 36 28 40 71 78 ST_CVT.slot0.istream.v4.u32.return @r0:r1:r2:r3, `r4, `r6, offset:0x0 +44 00 46 32 28 40 71 78 ST_CVT.slot0.istream.v4.f32.end @r0:r1:r2:r3, `r4, `r6, offset:0x0 +44 00 46 34 28 40 71 78 ST_CVT.slot0.istream.v4.s32.end @r0:r1:r2:r3, `r4, `r6, offset:0x0 +44 00 46 36 28 40 71 78 ST_CVT.slot0.istream.v4.u32.end @r0:r1:r2:r3, `r4, `r6, offset:0x0 7c c0 12 00 26 84 67 00 LEA_TEX_IMM.slot0 @r4:r5:r6, `r60, 0x0, table:0x2, index:0x1 7c c0 02 00 26 84 67 00 LEA_TEX_IMM.slot0 @r4:r5:r6, `r60, 0x0, table:0x2, index:0x0 82 81 00 28 f4 82 6a 00 LD_BUFFER.i64.unsigned.slot0 @r2:r3, u2, u1 @@ -123,7 +123,7 @@ c0 01 00 00 00 c4 10 51 IADD_IMM.i32.reconverge r4, 0x0, #0x1 40 44 80 00 01 c0 b8 00 MUX.i32 r0, `r0, `r4, u0 40 44 80 00 02 c0 b8 00 MUX.i32.fp_zero r0, `r0, `r4, u0 40 44 80 00 03 c0 b8 00 MUX.i32.bit r0, `r0, `r4, u0 -00 00 00 01 00 c1 99 68 FREXPM.f32.sqrt.td r1, r0 +00 00 00 01 00 c1 99 68 FREXPM.f32.sqrt.discard r1, r0 01 00 02 00 00 c2 9c 00 FRSQ.f32 r2, r1 40 00 02 01 00 c0 99 00 FREXPE.f32.sqrt r0, `r0 41 42 c0 40 04 c0 62 41 FMA_RSCALE_LEFT.f32.wait0126 r0, `r1, `r2, 0x0.neg, `r0 @@ -180,14 +180,14 @@ c0 77 01 0c 00 c2 a8 00 ISUB.s32 r2, 0x0, `r55.h1 00 00 03 00 20 c1 90 00 V2S8_TO_V2F16 r1, r0.b20 40 00 03 00 60 c0 90 00 V2S8_TO_V2F16 r0, `r0.b21 -3d 00 00 b2 88 80 5c 68 LD_VAR_BUF_IMM.f32.slot2.v4.src_f32.sample.store.td @r0:r1:r2:r3, r61, index:0x0 +3d 00 00 b2 88 80 5c 68 LD_VAR_BUF_IMM.f32.slot2.v4.src_f32.sample.store.discard @r0:r1:r2:r3, r61, index:0x0 3d 00 10 72 18 84 5c 00 LD_VAR_BUF_IMM.f32.slot1.v4.src_f32.center.retrieve @r4:r5:r6:r7, r61, index:0x10 c0 00 00 00 00 c8 10 01 IADD_IMM.i32 r8, 0x0, #0x0 c0 00 00 00 00 c9 10 01 IADD_IMM.i32 r9, 0x0, #0x0 3d 00 14 00 00 ca 90 00 U16_TO_U32 r10, r61.h00 3d 09 00 00 30 c0 1f 50 BRANCHZ.eq.reconverge r61.h0, offset:9 0a 00 00 00 00 cb 91 50 MOV.i32.reconverge r11, r10 -00 00 00 00 00 c0 00 48 NOP.barrier +00 00 00 00 00 c0 00 48 NOP.wait 81 0b 80 33 04 8e 78 00 LD_TILE.v4.f16.slot0 @r14:r15, u1, r11, u0 0b 00 04 00 00 cc 91 00 CLZ.u32 r12, r11 82 4c c0 52 00 cc b4 00 RSHIFT_XOR.i32.not_result r12, u2, `r12.b00, 0x0 @@ -202,15 +202,15 @@ c0 00 00 00 00 c9 10 01 IADD_IMM.i32 r9, 0x0, #0x0 49 3e c0 22 04 c9 b3 30 FMA.v2f16.wait12 r9, `r9, r62.h00, 0x0.neg 47 43 00 00 00 c3 a4 00 FADD.f32 r3, `r7, `r3 43 09 00 08 00 c3 a4 40 FADD.f32.wait0126 r3, `r3, r9.h1 -3c 03 ea 00 02 bc 7d 68 ATEST.td @r60, r60, r3, atest_datum.w0 +3c 03 ea 00 02 bc 7d 68 ATEST.discard @r60, r60, r3, atest_datum.w0 46 42 00 00 00 c2 a4 00 FADD.f32 r2, `r6, `r2 44 40 00 00 00 c0 a4 00 FADD.f32 r0, `r4, `r0 48 7e c0 22 04 ff b3 00 FMA.v2f16 r63, `r8, `r62.h00, 0x0.neg 45 41 00 00 00 c1 a4 00 FADD.f32 r1, `r5, `r1 41 3f 00 08 00 c1 a4 00 FADD.f32 r1, `r1, r63.h1 40 7f 00 04 00 c0 a4 00 FADD.f32 r0, `r0, `r63.h0 -42 49 00 04 00 c2 a4 48 FADD.f32.barrier r2, `r2, `r9.h0 -f0 00 3c 32 08 40 7f 78 BLEND.slot0.v4.f32.return @r0:r1:r2:r3, blend_descriptor_0.w0, r60, target:0x0 +42 49 00 04 00 c2 a4 48 FADD.f32.wait r2, `r2, `r9.h0 +f0 00 3c 32 08 40 7f 78 BLEND.slot0.v4.f32.end @r0:r1:r2:r3, blend_descriptor_0.w0, r60, target:0x0 c0 00 00 00 00 f6 10 01 IADD_IMM.i32 r54, 0x0, #0x0 c0 f1 00 00 10 c1 2f 08 BRANCHZI.eq.absolute.wait0 0x0, blend_descriptor_0.w1 80 00 c0 17 34 7c 25 01 TEX_FETCH.slot0.f.32.2d @r0:r1:r2:r3, @r60:r61, u0