From 614d07c986f60b661020e24a57326efaa84bf690 Mon Sep 17 00:00:00 2001 From: Lars-Ivar Hesselberg Simonsen Date: Wed, 11 Mar 2026 11:12:14 +0100 Subject: [PATCH] pan/va: Generalize opcode/opcode2 Rather than hardcoding opcode/opcode2, treat the opcode as a list of one or more subcodes. This requires modifying the disassembler to hold an arbitrary-depth dict of dicts and to recursively build the switch statements used to look up each level. Reviewed-by: Christoph Pillmayer Acked-by: Lorenzo Rossi Acked-by: Eric R. Smith Part-of: --- src/panfrost/compiler/bifrost/valhall/ISA.xml | 1264 ++++++++++++----- src/panfrost/compiler/bifrost/valhall/asm.py | 5 +- .../compiler/bifrost/valhall/disasm.py | 125 +- .../compiler/bifrost/valhall/valhall.c.py | 6 +- .../compiler/bifrost/valhall/valhall.py | 22 +- 5 files changed, 1017 insertions(+), 405 deletions(-) diff --git a/src/panfrost/compiler/bifrost/valhall/ISA.xml b/src/panfrost/compiler/bifrost/valhall/ISA.xml index dc8b1fa84ba..ed8dfeb0cc6 100644 --- a/src/panfrost/compiler/bifrost/valhall/ISA.xml +++ b/src/panfrost/compiler/bifrost/valhall/ISA.xml @@ -792,7 +792,9 @@ is a duplicate instruction in the Bifrost or pseudo XML files --> - + + + Do nothing. Useful at the start of a block for waiting on slots required by the first actual instruction of the block, to reconcile dependencies @@ -801,7 +803,9 @@ - + + + Branches to a specified relative offset if its source is nonzero (default) or if its source is zero (if `.eq` is set). The offset is 27-bits and @@ -824,7 +828,9 @@ - + + + Evaluates the given condition, and if it passes, discards the current fragment and terminates the thread. Only valid in a **fragment** shader. @@ -835,7 +841,9 @@ - + + + Jump to an indirectly specified (absolute or relative) address. Used to jump to blend shaders at the end of a fragment shader. @@ -848,7 +856,9 @@ - + + + General-purpose barrier. Must use slot #7. Must be paired with a `.wait` flow on the instruction. 
@@ -858,10 +868,14 @@ - + + + - + + + Evaluates the given condition and outputs either the true source or the @@ -876,16 +890,24 @@ - + + + - + + + - + + + - + + + Evaluates the given condition and outputs either the true source or the @@ -904,7 +926,9 @@ - + + + @@ -919,10 +943,14 @@ Interpolates a given varying from hardware buffer - + + + - + + + @@ -939,10 +967,14 @@ Interpolates a given varying from hardware buffer - + + + - + + + @@ -957,7 +989,9 @@ - + + + Interpolates a given varying from a software buffer @@ -971,7 +1005,9 @@ - + + + Interpolates a given varying from a software buffer @@ -986,7 +1022,9 @@ - + + + Fetches a given varying from a software buffer @@ -997,7 +1035,9 @@ - + + + Fetches a given varying from a software buffer @@ -1009,8 +1049,10 @@ - - + + + + Load `vecsize` components from the attribute descriptor at entry `index` of resource table `table` at index (vertex ID, instance ID), converting @@ -1028,8 +1070,10 @@ - - + + + + Load `vecsize` components from the attribute descriptor at the specified location at index (vertex ID, instance ID), converting @@ -1048,7 +1092,9 @@ - + + + Load the 64-bit global clock, either a cycle counter or the system clock. @@ -1056,8 +1102,10 @@ - - + + + + Load `vecsize` components from the texture descriptor at entry `index` of resource table `table`, converting @@ -1075,8 +1123,10 @@ - - + + + + Load `vecsize` components from the texture descriptor at the specified location at index, converting @@ -1093,8 +1143,10 @@ - - + + + + Load the effective address of an attribute specified with the given immediate index. Returns three staging register: the low/high @@ -1110,8 +1162,10 @@ - - + + + + Load the effective address of an attribute specified with the given index. Returns three staging register: the low/high @@ -1127,8 +1181,10 @@ - - + + + + Load the effective address of a texel from the image specified with the given immediate index. 
Returns three staging registers: the low/high @@ -1149,8 +1205,10 @@ - - + + + + Load the effective address of a texel from the image specified with the given index. Returns three staging register: the low/high @@ -1171,8 +1229,10 @@ - - + + + + Loads a buffer descriptor. If bits 25...31 of the mode descriptor are all-ones, load from the buffer descriptors in the table indexed by the @@ -1190,8 +1250,10 @@ - - + + + + Loads a buffer descriptor. If bits 25...31 of the mode descriptor are all-ones, load from the buffer descriptors in the table indexed by the @@ -1209,8 +1271,10 @@ - - + + + + Loads a buffer descriptor. If bits 25...31 of the mode descriptor are all-ones, load from the buffer descriptors in the table indexed by the @@ -1228,8 +1292,10 @@ - - + + + + Loads a buffer descriptor. If bits 25...31 of the mode descriptor are all-ones, load from the buffer descriptors in the table indexed by the @@ -1247,8 +1313,10 @@ - - + + + + Loads a buffer descriptor. If bits 25...31 of the mode descriptor are all-ones, load from the buffer descriptors in the table indexed by the @@ -1266,8 +1334,10 @@ - - + + + + Loads a buffer descriptor. If bits 25...31 of the mode descriptor are all-ones, load from the buffer descriptors in the table indexed by the @@ -1285,8 +1355,10 @@ - - + + + + Loads a buffer descriptor. If bits 25...31 of the mode descriptor are all-ones, load from the buffer descriptors in the table indexed by the @@ -1304,8 +1376,10 @@ - - + + + + Loads a buffer descriptor. If bits 25...31 of the mode descriptor are all-ones, load from the buffer descriptors in the table indexed by the @@ -1324,7 +1398,9 @@ - + + + Load effective address of a buffer with an offset added. @@ -1336,7 +1412,9 @@ - + + + Load effective address of a buffer with an immediate offset added. 
@@ -1349,8 +1427,10 @@ - - + + + + Loads from main memory @@ -1363,8 +1443,10 @@ - - + + + + Loads from main memory @@ -1377,8 +1459,10 @@ - - + + + + Loads from main memory @@ -1391,8 +1475,10 @@ - - + + + + Loads from main memory @@ -1405,8 +1491,10 @@ - - + + + + Loads from main memory @@ -1419,8 +1507,10 @@ - - + + + + Loads from main memory @@ -1433,8 +1523,10 @@ - - + + + + Loads from main memory @@ -1447,8 +1539,10 @@ - - + + + + Loads from main memory @@ -1461,32 +1555,55 @@ - Stores to main memory - + + + + - + + + + - + + + + - + + + + - + + + + - + + + + - + + + + - + + + + @@ -1496,7 +1613,9 @@ - + + + Load effective address of a simple buffer with an offset added. @@ -1508,7 +1627,9 @@ - + + + Load from memory with data conversion. The address to load from is given in the first source, which must be a 64-bit register (a pair of 32-bit @@ -1526,7 +1647,9 @@ - + + + Store to memory with data conversion. The address to store to is given in the first source, which must be a 64-bit register (a pair of 32-bit @@ -1546,7 +1669,9 @@ - + + + Loads a given render target, specified in the pixel indices descriptor, at a given location and sample, and convert to the format specified in the @@ -1564,7 +1689,9 @@ - + + + Store to given render target, specified in the pixel indices descriptor, at a given location and sample, and convert to the format specified in the @@ -1581,7 +1708,9 @@ - + + + Blends a given render target. This loads the API-specified blend state for the render target from the first source. Blend descriptors are available @@ -1618,7 +1747,9 @@ - + + + Does alpha-to-coverage testing, updating the sample coverage mask. ATEST does not do an implicit discard. It should be executed before the first @@ -1632,7 +1763,9 @@ - + + + Programatically writes out depth, stencil, or both, depending on which modifiers are set. Used to implement gl_FragDepth and gl_FragStencil. @@ -1648,7 +1781,6 @@ - Performs the given data conversion. 
Note that floating-point rounding is handled via the same hardware and therefore shares an encoding. Round mode @@ -1657,20 +1789,32 @@ - + + + + - + + + + - + + + + - + + + + @@ -1678,141 +1822,197 @@ - Performs the given data conversion. - + + + + - + + + + - + + + + - + + + + Value to convert - Performs the given data conversion. - + + + + - + + + + Value to convert - Performs the given data conversion. - + + + + - + + + + - + + + + - + + + + Value to convert - - + + + + Converts up with the specified round mode. Value to convert - Performs the given data conversion. - + + + + - + + + + - + + + + - + + + + Value to convert - Performs the given data conversion. - + + + + - + + + + - + + + + - + + + + Value to convert - Performs the given rounding, using the convert unit. - + + + + - + + + + @@ -1820,15 +2020,19 @@ - - + + + + Canonical register-to-register move. - - + + + + Used as a primitive for various bitwise operations. @@ -1836,8 +2040,10 @@ - - + + + + Used as a primitive for various bitwise operations. @@ -1845,8 +2051,10 @@ - - + + + + Used as a primitive for various bitwise operations. @@ -1854,8 +2062,10 @@ - - + + + + 64-bit abs may be constructed in 4 instructions (5 clocks) by checking the sign with `ICMP.s32.lt.m1 hi, 0` and negating based on the result with @@ -1865,21 +2075,27 @@ - - + + + + - - + + + + - - + + + + Only available as 32-bit. Smaller bitsizes require explicit conversions. 64-bit popcount may be constructed in 3 clocks by separate 32-bit @@ -1890,8 +2106,10 @@ - - + + + + Only available as 32-bit. Other bitsizes may be derived with swizzles. @@ -1899,8 +2117,10 @@ - - + + + + For fully featured bitwise operation, see the shift opcodes. @@ -1908,8 +2128,10 @@ - - + + + + For fully featured bitwise operation, see the shift opcodes. @@ -1917,7 +2139,9 @@ - + + + Returns the mask of lanes ever active within the warp (subgroup), such that the source is nonzero. 
The number of work-items in a subgroup is @@ -1934,12 +2158,17 @@ - - + + + + - + + + + Flush special float values. The ftz modifier flushes subnormal values to @@ -1954,20 +2183,31 @@ - - + + + + - + + + + - + + + + - + + + + Breaks up the floating-point input into its fractional (mantissa) and @@ -1982,36 +2222,65 @@ - - + + + + - + + + + - + + + + - + + + + - + + + + - + + + + - + + + + - + + + + - + + + + - + + + + Performs a given special function. The floating-point reciprocal (`FRCP`) @@ -2025,18 +2294,29 @@ - - + + + + - + + + + - + + + + - + + + + Performs a given special function. The trigonometric tables @@ -2047,12 +2327,17 @@ - - + + + + - + + + + $A + B$ @@ -2063,12 +2348,17 @@ - - + + + + - + + + + $\min \{ A, B \}$ @@ -2077,12 +2367,17 @@ - - + + + + - + + + + $\max \{ A, B \}$ @@ -2092,9 +2387,11 @@ - - + + + + Given a pair of 32-bit floats, output a pair of 16-bit floats packed into @@ -2107,12 +2404,17 @@ - - + + + + - + + + + Computes $A \cdot 2^B$ by adding B to the exponent of A. Used to calculate @@ -2127,8 +2429,10 @@ - - + + + + Calculates the base-2 exponent of an argument specified as a 8:24 fixed-point. The original argument is passed as well for correct handling @@ -2140,8 +2444,10 @@ - - + + + + Performs a floating-point addition specialized for logarithm computation. @@ -2151,8 +2457,10 @@ - - + + + + Used for `atan2()` implementation. Destination is two 16-bit values (int and float) for the first form, and a single 32-bit float when @@ -2164,7 +2472,6 @@ - $A + B$ with optional saturation. @@ -2172,30 +2479,54 @@ canonical lowering for swizzles. - + + + + - + + + + - + + + + - + + + + - + + + + - + + + + - + + + + - + + + + A B @@ -2203,40 +2534,65 @@ - - + + + + Calculates $A | (B \ll 16)$. 
Used to implement `(ushort2)(A, B)` A B - - + + + + - + + + + - + + + + - + + + + - + + + + - + + + + - + + + + - + + + + $A - B$ with optional saturation A @@ -2245,7 +2601,6 @@ - Similar to SHADDX, but especially used for loading offsets into WLS. Usually this is only required for atomic operations, which cannot @@ -2254,7 +2609,10 @@ .neg indicates SEG_SUB instead. - + + + + @@ -2263,17 +2621,22 @@ - Sign or zero extend B to 64-bits, left-shift by `shift`, and add the 64-bit value A. These instructions accelerate address arithmetic, but may be used in full generality for 64-bit integer arithmetic. - + + + + - + + + + A @@ -2281,27 +2644,47 @@ - - + + + + - + + + + - + + + + - + + + + - + + + + - + + + + - + + + + $A \cdot B$ with optional saturation. Note the multipliers can only handle up to @@ -2317,24 +2700,41 @@ - - + + + + - + + + + - + + + + - + + + + - + + + + - + + + + A @@ -2347,8 +2747,10 @@ - - + + + + Selects the value of A in the subgroup lane given by B. This implements subgroup broadcasts. 
It may be used as a primitive for screen space @@ -2363,10 +2765,14 @@ - + + + - + + + $A \cdot B + C$ @@ -2377,18 +2783,29 @@ - - + + + + - + + + + - + + + + - + + + + Left shifts its first source by a specified amount and bitwise ANDs it with the @@ -2401,18 +2818,29 @@ - - + + + + - + + + + - + + + + - + + + + Right shifts its first source by a specified amount and bitwise ANDs it with the @@ -2428,18 +2856,29 @@ - - + + + + - + + + + - + + + + - + + + + Left shifts its first source by a specified amount and bitwise ORs it with the @@ -2452,18 +2891,29 @@ - - + + + + - + + + + - + + + + - + + + + Right shifts its first source by a specified amount and bitwise ORs it with the @@ -2479,18 +2929,29 @@ - - + + + + - + + + + - + + + + - + + + + Left shifts its first source by a specified amount and bitwise XORs it with the @@ -2503,18 +2964,29 @@ - - + + + + - + + + + - + + + + - + + + + Right shifts its first source by a specified amount and bitwise XORs it with the @@ -2530,7 +3002,9 @@ - + + + Mux between A and B based on the provided mask. The condition specified as the `mux` modifier is evaluated on the mask. If true, `A` is chosen, @@ -2545,7 +3019,9 @@ - + + + Mux between A and B based on the provided mask. The condition specified as the `mux` modifier is evaluated on the mask. If true, `A` is chosen, @@ -2560,7 +3036,9 @@ - + + + Mux between A and B based on the provided mask. The condition specified as the `mux` modifier is evaluated on the mask. If true, `A` is chosen, @@ -2575,8 +3053,10 @@ - - + + + + During a cube map transform, select the S coordinate given a selected face. Z coordinate as 32-bit floating point X coordinate as 32-bit floating point @@ -2584,8 +3064,10 @@ - - + + + + During a cube map transform, select the T coordinate given a selected face. Y coordinate as 32-bit floating point Z coordinate as 32-bit floating point @@ -2593,7 +3075,9 @@ - + + + Calculates $A | (B \ll 8) | (CD \ll 16)$ for 8-bit A and B and 16-bit CD. 
@@ -2609,7 +3093,9 @@ - + + + Select the maximum absolute value of its arguments. X coordinate as 32-bit floating point Y coordinate as 32-bit floating point @@ -2617,7 +3103,9 @@ - + + + Select the cube face index corresponding to the arguments. X coordinate as 32-bit floating point Y coordinate as 32-bit floating point @@ -2625,7 +3113,6 @@ - 8-bit integer dot product between 4 channel vectors, intended for machine learning. Available in both unsigned and signed variants, controlling @@ -2638,10 +3125,16 @@ saturates. - + + + + - + + + + A B @@ -2650,7 +3143,6 @@ - Evaluates the given condition, do a logical or with the condition in the result source, and return in the given result type (integer @@ -2659,14 +3151,23 @@ when this is not desired, tie it to zero. - + + + + - + + + + - + + + + @@ -2676,7 +3177,6 @@ - Evaluates the given condition, do a logical and with the condition in the result source, and return in the given result type (integer @@ -2684,14 +3184,23 @@ for chaining together conditions without intermediate bitwise arithmetic. - + + + + - + + + + - + + + + @@ -2701,7 +3210,6 @@ - Evaluates the given condition, do a logical or with the condition in the result source, and return in the given result type (integer @@ -2710,10 +3218,16 @@ when this is not desired, tie it to zero. - + + + + - + + + + @@ -2723,7 +3237,6 @@ - Evaluates the given condition, do a logical and/or with the condition in the result source, and return in the given result type (integer @@ -2731,10 +3244,16 @@ for chaining together conditions without intermediate bitwise arithmetic. - + + + + - + + + + @@ -2744,7 +3263,6 @@ - Evaluates the given condition, do a logical or with the condition in the result source, and return in the given result type (integer @@ -2752,14 +3270,23 @@ for chaining together conditions without intermediate bitwise arithmetic. 
- + + + + - + + + + - + + + + @@ -2769,7 +3296,6 @@ - Evaluates the given condition, do a logical and with the condition in the result source, and return in the given result type (integer @@ -2777,14 +3303,23 @@ for chaining together conditions without intermediate bitwise arithmetic. - + + + + - + + + + - + + + + @@ -2794,7 +3329,6 @@ - Evaluates the given condition, do a logical and/or with the condition in the result source, and return in the given result type (integer @@ -2809,10 +3343,16 @@ the result of the low half comparison passed as the third source. - + + + + - + + + + @@ -2822,7 +3362,9 @@ - + + + Adds an arbitrary 32-bit immediate embedded within the instruction stream. If no modifiers are required, this is preferred to `IADD.i32` with a @@ -2836,7 +3378,9 @@ - + + + Adds an arbitrary pair of 16-bit immediates embedded within the instruction stream. If no modifiers are required, this is preferred to @@ -2850,7 +3394,9 @@ - + + + Adds an arbitrary quad of 8-bit immediates embedded within the instruction stream. If no modifiers are required, this is preferred to @@ -2863,7 +3409,9 @@ - + + + Adds an arbitrary 32-bit immediate embedded within the instruction stream. If no modifiers are required, this is preferred to `FADD.f32` with a @@ -2875,7 +3423,9 @@ - + + + Adds an arbitrary pair of 16-bit immediates embedded within the instruction stream. If no modifiers are required, this is preferred to @@ -2888,8 +3438,10 @@ - - + + + + @@ -2901,8 +3453,10 @@ - - + + + + @@ -2914,8 +3468,10 @@ - - + + + + @@ -2926,8 +3482,10 @@ - - + + + + @@ -2938,8 +3496,10 @@ - - + + + + @@ -2956,8 +3516,10 @@ - - + + + + @@ -2974,7 +3536,9 @@ - + + + Unfiltered textured instruction. @@ -2999,7 +3563,9 @@ - + + + Ordinary texturing instruction using a sampler. @@ -3026,7 +3592,9 @@ - + + + Texture gather instruction. @@ -3054,7 +3622,9 @@ - + + + Texture sample with explicit gradient. @@ -3079,7 +3649,9 @@ - + + + Pair of texture instructions. 
@@ -3103,7 +3675,9 @@ - + + + Only works for FP32 varyings. Performance characteristics are similar to LD_VAR_BUF_IMM_F32.v2.f32 followed by TEX, using both V and T units. @@ -3126,7 +3700,9 @@ - + + + Only works for FP32 varyings. Performance characteristics are similar to LD_VAR_BUF_IMM_F32.v2.f32 followed by TEX, using both V and T units. @@ -3150,7 +3726,9 @@ - + + + Only works for FP32 varyings. Performance characteristics are similar to LD_VAR_BUF_IMM_F32.v2.f32 followed by TEX, using both V and T units. @@ -3174,7 +3752,9 @@ - + + + Only works for FP32 varyings. Performance characteristics are similar to LD_VAR_BUF_IMM_F32.v2.f32 followed by TEX_DUAL, using both V and T units. @@ -3197,7 +3777,9 @@ - + + + Only works for FP32 varyings. Performance characteristics are similar to LD_VAR_IMM_F32.v2.f32 followed by TEX, using both V and T units. @@ -3220,7 +3802,9 @@ - + + + Only works for FP32 varyings. Performance characteristics are similar to LD_VAR_IMM_F32.v2.f32 followed by TEX, using both V and T units. @@ -3244,7 +3828,9 @@ - + + + Only works for FP32 varyings. Performance characteristics are similar to LD_VAR_IMM_F32.v2.f32 followed by TEX, using both V and T units. @@ -3268,7 +3854,9 @@ - + + + Only works for FP32 varyings. Performance characteristics are similar to LD_VAR_IMM_F32.v2.f32 followed by TEX_DUAL, using both V and T units. @@ -3291,7 +3879,9 @@ - + + + First calculates $A \cdot B + C$ and then biases the exponent by D. Used in special transcendental function sequences. It should not be used for @@ -3307,7 +3897,9 @@ - + + + First calculates $A \cdot B + C$ and then biases the exponent by D. If $A = 0$ or $B = 0$, the multiply $A \cdot B$ is treated as zero even if an @@ -3324,7 +3916,9 @@ - + + + First calculates $A \cdot B + C$ and then biases the exponent by D. 
If $A = 0$ or $B = 0$, the multiply is treated as $A$ even if an @@ -3341,7 +3935,9 @@ - + + + First calculates $A \cdot B + C$ and then biases the exponent by D, interpreted as a 16-bit value. Used in special transcendental function diff --git a/src/panfrost/compiler/bifrost/valhall/asm.py b/src/panfrost/compiler/bifrost/valhall/asm.py index ba4127fdc6d..2e001fda929 100644 --- a/src/panfrost/compiler/bifrost/valhall/asm.py +++ b/src/panfrost/compiler/bifrost/valhall/asm.py @@ -315,9 +315,8 @@ def parse_asm(line): operands = operands[len(ins.immediates):] # Encode the operation itself - encoded |= (ins.opcode.value << ins.opcode.start) - if ins.opcode2: - encoded |= (ins.opcode2.value << ins.opcode2.start) + for subcode in ins.opcode: + encoded |= (subcode.value << subcode.start) # Encode FAU page if fau.page: diff --git a/src/panfrost/compiler/bifrost/valhall/disasm.py b/src/panfrost/compiler/bifrost/valhall/disasm.py index 030423ef013..a627dd8ac5c 100644 --- a/src/panfrost/compiler/bifrost/valhall/disasm.py +++ b/src/panfrost/compiler/bifrost/valhall/disasm.py @@ -194,39 +194,42 @@ va_print_dest(FILE *fp, uint8_t dest, bool can_mask) % endfor +<%def name="recurse_subcodes(op_bucket)"> +%if op_bucket.instr: +${print_instr(op_bucket.instr)} +%else: + opcode = (instr >> ${op_bucket.start}) & ${hex(op_bucket.mask)}; + switch (opcode) { +%for op in op_bucket.children: + case ${hex(op)}: + { +${recurse_subcodes(op_bucket.children[op])} + break; + } +%endfor + } +%endif + + + void va_disasm_instr(FILE *fp, uint64_t instr) { - unsigned primary_opc = (instr >> 48) & MASK(9); + unsigned opcode; unsigned fau_page = (instr >> 57) & MASK(2); - unsigned secondary_opc = 0; - switch (primary_opc) { -% for bucket in OPCODES: - <% - ops = OPCODES[bucket] - ambiguous = (len(ops) > 1) - %> -% if len(ops) > 0: - case ${hex(bucket)}: -% if ambiguous: - secondary_opc = (instr >> ${ops[0].opcode2.start}) & ${hex(ops[0].opcode2.mask)}; -% endif -% for op in ops: -% if ambiguous: 
+${recurse_subcodes(OPCODES)} +} - if (secondary_opc == ${op.opcode2.value}) { -% endif -${print_instr(op)} -% if ambiguous: - } -% endif -% endfor - break; - -% endif -% endfor - } +static bool is_branch(uint64_t instr) +{ +<% (exact, mask) = OPCODES.get_exact_mask("BRANCHZ") %> + if ((instr & ${hex(mask)}) == ${hex(exact)}) + return true; +<% (exact, mask) = OPCODES.get_exact_mask("BRANCHZI") %> + if ((instr & ${hex(mask)}) == ${hex(exact)}) + return true; + return false; } void @@ -259,13 +262,8 @@ disassemble_valhall(FILE *fp, const void *code, size_t size, bool verbose) va_disasm_instr(fp, instr); fprintf(fp, "\\n"); - /* Detect branches */ - uint64_t opcode = (instr >> 48) & MASK(9); - bool branchz = (opcode == 0x1F); - bool branchzi = (opcode == 0x2F); - /* Separate blocks visually by inserting whitespace after branches */ - if (branchz || branchzi) + if (is_branch(instr)) fprintf(fp, "\\n"); } @@ -273,30 +271,47 @@ disassemble_valhall(FILE *fp, const void *code, size_t size, bool verbose) } """ -# Bucket by opcode for hierarchical disassembly -OPCODE_BUCKETS = {} +class OpBucket: + def __init__(self): + self.start = None + self.mask = None + self.instr = None + self.children = {} + + def insert(self, subcodes, ins): + if len(subcodes) == 0: + self.instr = ins + else: + sc = subcodes[0] + assert(self.start is None or self.start == sc.start) + assert(self.mask is None or self.mask == sc.mask) + self.start = sc.start + self.mask = sc.mask + if sc.value not in self.children: + self.children[sc.value] = OpBucket() + self.children[sc.value].insert(subcodes[1:], ins) + + def get_exact_mask(self, op_name, exact = 0, mask = 0): + if self.instr: + if self.instr.name == op_name: + return (exact, mask) + else: + return () + else: + for op in self.children: + exact_mask = self.children[op].get_exact_mask(op_name, + exact | (op << self.start), + mask | (self.mask << self.start)) + if exact_mask: + return exact_mask + return () + +# Build opcode hierarchy: +OPCODES = 
OpBucket() for ins in instructions: - opc = ins.opcode.value - OPCODE_BUCKETS[opc] = OPCODE_BUCKETS.get(opc, []) + [ins] - -# Check that each bucket may be disambiguated -for op in OPCODE_BUCKETS: - bucket = OPCODE_BUCKETS[op] - - # Nothing to disambiguate - if len(bucket) < 2: - continue - - SECONDARY = {} - for ins in bucket: - # Number of sources determines opcode2 placement, must be consistent - assert(len(ins.srcs) == len(bucket[0].srcs)) - - # Must not repeat, else we're ambiguous - assert(ins.opcode2.value not in SECONDARY) - SECONDARY[ins.opcode2.value] = ins + OPCODES.insert(ins.opcode, ins) try: - print(Template(template).render(OPCODES = OPCODE_BUCKETS, IMMEDIATES = immediates, ENUMS = enums, typesize = typesize, safe_name = safe_name)) + print(Template(template).render(OPCODES = OPCODES, IMMEDIATES = immediates, ENUMS = enums, typesize = typesize, safe_name = safe_name)) except: print(exceptions.text_error_template().render()) diff --git a/src/panfrost/compiler/bifrost/valhall/valhall.c.py b/src/panfrost/compiler/bifrost/valhall/valhall.c.py index 91d4f7c0c07..3645092b836 100644 --- a/src/panfrost/compiler/bifrost/valhall/valhall.c.py +++ b/src/panfrost/compiler/bifrost/valhall/valhall.c.py @@ -147,9 +147,9 @@ valhall_opcodes[BI_NUM_OPCODES] = { # Exact value to be ORed in to every opcode def exact_op(op): - exact_op = (op.opcode.value << op.opcode.start) - if op.opcode2: - exact_op |= (op.opcode2.value << op.opcode2.start) + exact_op = 0 + for subcode in op.opcode: + exact_op |= (subcode.value << subcode.start) return exact_op try: diff --git a/src/panfrost/compiler/bifrost/valhall/valhall.py b/src/panfrost/compiler/bifrost/valhall/valhall.py index cddc277d2aa..366fc3c240c 100644 --- a/src/panfrost/compiler/bifrost/valhall/valhall.py +++ b/src/panfrost/compiler/bifrost/valhall/valhall.py @@ -164,12 +164,11 @@ class Opcode: self.mask = mask class Instruction: - def __init__(self, name, opcode, opcode2, srcs = [], dests = [], immediates = [], modifiers = 
[], staging = None, unit = None): + def __init__(self, name, opcode, srcs = [], dests = [], immediates = [], modifiers = [], staging = None, unit = None): self.name = name self.srcs = srcs self.dests = dests self.opcode = opcode - self.opcode2 = opcode2 self.immediates = immediates self.modifiers = modifiers self.staging = staging @@ -180,7 +179,6 @@ class Instruction: self.message = unit not in ["FMA", "CVT", "SFU"] assert(len(dests) == 0 or not staging) - assert(not opcode2 or (opcode2.value & opcode2.mask) == opcode2.value) def __str__(self): return self.name @@ -226,20 +224,25 @@ def build_modifier(el): return Modifier(name, start, size, implied) def build_opcode(el, name): + op_arr = [] opcode = el.find(name) if opcode is None: return None - value = int(opcode.get('val'), base=0) - start = int(opcode.get('start')) - mask = int(opcode.get('mask'), base=0) - return Opcode(value, start, mask) + + for subcode in opcode: + value = int(subcode.get('val'), base=0) + start = int(subcode.get('start')) + mask = int(subcode.get('mask'), base=0) + assert((value & mask) == value) + op_arr.append(Opcode(value, start, mask)) + + return op_arr # Build a single instruction from XML and group based overrides def build_instr(el, overrides = {}): # Get overridables name = overrides.get('name') or el.attrib.get('name') opcode = overrides.get('opcode') or build_opcode(el, 'opcode') - opcode2 = overrides.get('opcode2') or build_opcode(el, 'opcode2') unit = overrides.get('unit') or el.attrib.get('unit') # Get explicit sources/dests @@ -279,7 +282,7 @@ def build_instr(el, overrides = {}): elif mod.tag =='va_mod': modifiers.append(build_modifier(mod)) - instr = Instruction(name, opcode, opcode2, srcs = sources, dests = dests, immediates = imms, modifiers = modifiers, staging = staging, unit = unit) + instr = Instruction(name, opcode, srcs = sources, dests = dests, immediates = imms, modifiers = modifiers, staging = staging, unit = unit) instructions.append(instr) @@ -290,7 +293,6 @@ 
def build_group(el): build_instr(el, overrides = { 'name': ins.attrib['name'], 'opcode': build_opcode(ins, 'opcode'), - 'opcode2': build_opcode(ins, 'opcode2'), 'unit': ins.attrib.get('unit'), })