diff --git a/src/panfrost/compiler/bifrost/valhall/ISA.xml b/src/panfrost/compiler/bifrost/valhall/ISA.xml
index dc8b1fa84ba..ed8dfeb0cc6 100644
--- a/src/panfrost/compiler/bifrost/valhall/ISA.xml
+++ b/src/panfrost/compiler/bifrost/valhall/ISA.xml
@@ -792,7 +792,9 @@
is a duplicate instruction in the Bifrost or pseudo XML files
-->
-
+
+
+
Do nothing. Useful at the start of a block for waiting on slots required
by the first actual instruction of the block, to reconcile dependencies
@@ -801,7 +803,9 @@
-
+
+
+
Branches to a specified relative offset if its source is nonzero (default)
or if its source is zero (if `.eq` is set). The offset is 27-bits and
@@ -824,7 +828,9 @@
-
+
+
+
Evaluates the given condition, and if it passes, discards the current
fragment and terminates the thread. Only valid in a **fragment** shader.
@@ -835,7 +841,9 @@
-
+
+
+
Jump to an indirectly specified (absolute or relative) address. Used to
jump to blend shaders at the end of a fragment shader.
@@ -848,7 +856,9 @@
-
+
+
+
General-purpose barrier. Must use slot #7. Must be paired with a
`.wait` flow on the instruction.
@@ -858,10 +868,14 @@
-
+
+
+
-
+
+
+
Evaluates the given condition and outputs either the true source or the
@@ -876,16 +890,24 @@
-
+
+
+
-
+
+
+
-
+
+
+
-
+
+
+
Evaluates the given condition and outputs either the true source or the
@@ -904,7 +926,9 @@
-
+
+
+
@@ -919,10 +943,14 @@
Interpolates a given varying from hardware buffer
-
+
+
+
-
+
+
+
@@ -939,10 +967,14 @@
Interpolates a given varying from hardware buffer
-
+
+
+
-
+
+
+
@@ -957,7 +989,9 @@
-
+
+
+
Interpolates a given varying from a software buffer
@@ -971,7 +1005,9 @@
-
+
+
+
Interpolates a given varying from a software buffer
@@ -986,7 +1022,9 @@
-
+
+
+
Fetches a given varying from a software buffer
@@ -997,7 +1035,9 @@
-
+
+
+
Fetches a given varying from a software buffer
@@ -1009,8 +1049,10 @@
-
-
+
+
+
+
Load `vecsize` components from the attribute descriptor at entry `index`
of resource table `table` at index (vertex ID, instance ID), converting
@@ -1028,8 +1070,10 @@
-
-
+
+
+
+
Load `vecsize` components from the attribute descriptor at the specified
location at index (vertex ID, instance ID), converting
@@ -1048,7 +1092,9 @@
-
+
+
+
Load the 64-bit global clock, either a cycle counter or the system clock.
@@ -1056,8 +1102,10 @@
-
-
+
+
+
+
Load `vecsize` components from the texture descriptor at entry `index`
of resource table `table`, converting
@@ -1075,8 +1123,10 @@
-
-
+
+
+
+
Load `vecsize` components from the texture descriptor at the specified
location at index, converting
@@ -1093,8 +1143,10 @@
-
-
+
+
+
+
Load the effective address of an attribute specified with the
given immediate index. Returns three staging register: the low/high
@@ -1110,8 +1162,10 @@
-
-
+
+
+
+
Load the effective address of an attribute specified with the
given index. Returns three staging register: the low/high
@@ -1127,8 +1181,10 @@
-
-
+
+
+
+
Load the effective address of a texel from the image specified with the
given immediate index. Returns three staging registers: the low/high
@@ -1149,8 +1205,10 @@
-
-
+
+
+
+
Load the effective address of a texel from the image specified with the
given index. Returns three staging register: the low/high
@@ -1171,8 +1229,10 @@
-
-
+
+
+
+
Loads a buffer descriptor. If bits 25...31 of the mode descriptor are
all-ones, load from the buffer descriptors in the table indexed by the
@@ -1190,8 +1250,10 @@
-
-
+
+
+
+
Loads a buffer descriptor. If bits 25...31 of the mode descriptor are
all-ones, load from the buffer descriptors in the table indexed by the
@@ -1209,8 +1271,10 @@
-
-
+
+
+
+
Loads a buffer descriptor. If bits 25...31 of the mode descriptor are
all-ones, load from the buffer descriptors in the table indexed by the
@@ -1228,8 +1292,10 @@
-
-
+
+
+
+
Loads a buffer descriptor. If bits 25...31 of the mode descriptor are
all-ones, load from the buffer descriptors in the table indexed by the
@@ -1247,8 +1313,10 @@
-
-
+
+
+
+
Loads a buffer descriptor. If bits 25...31 of the mode descriptor are
all-ones, load from the buffer descriptors in the table indexed by the
@@ -1266,8 +1334,10 @@
-
-
+
+
+
+
Loads a buffer descriptor. If bits 25...31 of the mode descriptor are
all-ones, load from the buffer descriptors in the table indexed by the
@@ -1285,8 +1355,10 @@
-
-
+
+
+
+
Loads a buffer descriptor. If bits 25...31 of the mode descriptor are
all-ones, load from the buffer descriptors in the table indexed by the
@@ -1304,8 +1376,10 @@
-
-
+
+
+
+
Loads a buffer descriptor. If bits 25...31 of the mode descriptor are
all-ones, load from the buffer descriptors in the table indexed by the
@@ -1324,7 +1398,9 @@
-
+
+
+
Load effective address of a buffer with an offset added.
@@ -1336,7 +1412,9 @@
-
+
+
+
Load effective address of a buffer with an immediate offset added.
@@ -1349,8 +1427,10 @@
-
-
+
+
+
+
Loads from main memory
@@ -1363,8 +1443,10 @@
-
-
+
+
+
+
Loads from main memory
@@ -1377,8 +1459,10 @@
-
-
+
+
+
+
Loads from main memory
@@ -1391,8 +1475,10 @@
-
-
+
+
+
+
Loads from main memory
@@ -1405,8 +1491,10 @@
-
-
+
+
+
+
Loads from main memory
@@ -1419,8 +1507,10 @@
-
-
+
+
+
+
Loads from main memory
@@ -1433,8 +1523,10 @@
-
-
+
+
+
+
Loads from main memory
@@ -1447,8 +1539,10 @@
-
-
+
+
+
+
Loads from main memory
@@ -1461,32 +1555,55 @@
-
Stores to main memory
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
@@ -1496,7 +1613,9 @@
-
+
+
+
Load effective address of a simple buffer with an offset added.
@@ -1508,7 +1627,9 @@
-
+
+
+
Load from memory with data conversion. The address to load from is given in
the first source, which must be a 64-bit register (a pair of 32-bit
@@ -1526,7 +1647,9 @@
-
+
+
+
Store to memory with data conversion. The address to store to is given in
the first source, which must be a 64-bit register (a pair of 32-bit
@@ -1546,7 +1669,9 @@
-
+
+
+
Loads a given render target, specified in the pixel indices descriptor, at
a given location and sample, and convert to the format specified in the
@@ -1564,7 +1689,9 @@
-
+
+
+
Store to given render target, specified in the pixel indices descriptor, at
a given location and sample, and convert to the format specified in the
@@ -1581,7 +1708,9 @@
-
+
+
+
Blends a given render target. This loads the API-specified blend state for
the render target from the first source. Blend descriptors are available
@@ -1618,7 +1747,9 @@
-
+
+
+
Does alpha-to-coverage testing, updating the sample coverage mask. ATEST
does not do an implicit discard. It should be executed before the first
@@ -1632,7 +1763,9 @@
-
+
+
+
Programatically writes out depth, stencil, or both, depending on which
modifiers are set. Used to implement gl_FragDepth and gl_FragStencil.
@@ -1648,7 +1781,6 @@
-
Performs the given data conversion. Note that floating-point rounding is
handled via the same hardware and therefore shares an encoding. Round mode
@@ -1657,20 +1789,32 @@
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
@@ -1678,141 +1822,197 @@
-
Performs the given data conversion.
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
Value to convert
-
Performs the given data conversion.
-
+
+
+
+
-
+
+
+
+
Value to convert
-
Performs the given data conversion.
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
Value to convert
-
-
+
+
+
+
Converts up with the specified round mode.
Value to convert
-
Performs the given data conversion.
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
Value to convert
-
Performs the given data conversion.
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
Value to convert
-
Performs the given rounding, using the convert unit.
-
+
+
+
+
-
+
+
+
+
@@ -1820,15 +2020,19 @@
-
-
+
+
+
+
Canonical register-to-register move.
-
-
+
+
+
+
Used as a primitive for various bitwise operations.
@@ -1836,8 +2040,10 @@
-
-
+
+
+
+
Used as a primitive for various bitwise operations.
@@ -1845,8 +2051,10 @@
-
-
+
+
+
+
Used as a primitive for various bitwise operations.
@@ -1854,8 +2062,10 @@
-
-
+
+
+
+
64-bit abs may be constructed in 4 instructions (5 clocks) by checking the
sign with `ICMP.s32.lt.m1 hi, 0` and negating based on the result with
@@ -1865,21 +2075,27 @@
-
-
+
+
+
+
-
-
+
+
+
+
-
-
+
+
+
+
Only available as 32-bit. Smaller bitsizes require explicit conversions.
64-bit popcount may be constructed in 3 clocks by separate 32-bit
@@ -1890,8 +2106,10 @@
-
-
+
+
+
+
Only available as 32-bit. Other bitsizes may be derived with swizzles.
@@ -1899,8 +2117,10 @@
-
-
+
+
+
+
For fully featured bitwise operation, see the shift opcodes.
@@ -1908,8 +2128,10 @@
-
-
+
+
+
+
For fully featured bitwise operation, see the shift opcodes.
@@ -1917,7 +2139,9 @@
-
+
+
+
Returns the mask of lanes ever active within the warp (subgroup), such
that the source is nonzero. The number of work-items in a subgroup is
@@ -1934,12 +2158,17 @@
-
-
+
+
+
+
-
+
+
+
+
Flush special float values. The ftz modifier flushes subnormal values to
@@ -1954,20 +2183,31 @@
-
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
Breaks up the floating-point input into its fractional (mantissa) and
@@ -1982,36 +2222,65 @@
-
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
Performs a given special function. The floating-point reciprocal (`FRCP`)
@@ -2025,18 +2294,29 @@
-
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
Performs a given special function. The trigonometric tables
@@ -2047,12 +2327,17 @@
-
-
+
+
+
+
-
+
+
+
+
$A + B$
@@ -2063,12 +2348,17 @@
-
-
+
+
+
+
-
+
+
+
+
$\min \{ A, B \}$
@@ -2077,12 +2367,17 @@
-
-
+
+
+
+
-
+
+
+
+
$\max \{ A, B \}$
@@ -2092,9 +2387,11 @@
-
-
+
+
+
+
Given a pair of 32-bit floats, output a pair of 16-bit floats packed into
@@ -2107,12 +2404,17 @@
-
-
+
+
+
+
-
+
+
+
+
Computes $A \cdot 2^B$ by adding B to the exponent of A. Used to calculate
@@ -2127,8 +2429,10 @@
-
-
+
+
+
+
Calculates the base-2 exponent of an argument specified as a 8:24
fixed-point. The original argument is passed as well for correct handling
@@ -2140,8 +2444,10 @@
-
-
+
+
+
+
Performs a floating-point addition specialized for logarithm computation.
@@ -2151,8 +2457,10 @@
-
-
+
+
+
+
Used for `atan2()` implementation. Destination is two 16-bit
values (int and float) for the first form, and a single 32-bit float when
@@ -2164,7 +2472,6 @@
-
$A + B$ with optional saturation.
@@ -2172,30 +2479,54 @@
canonical lowering for swizzles.
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
A
B
@@ -2203,40 +2534,65 @@
-
-
+
+
+
+
Calculates $A | (B \ll 16)$. Used to implement `(ushort2)(A, B)`
A
B
-
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
$A - B$ with optional saturation
A
@@ -2245,7 +2601,6 @@
-
Similar to SHADDX, but especially used for loading offsets into
WLS. Usually this is only required for atomic operations, which cannot
@@ -2254,7 +2609,10 @@
.neg indicates SEG_SUB instead.
-
+
+
+
+
@@ -2263,17 +2621,22 @@
-
Sign or zero extend B to 64-bits, left-shift by `shift`, and add the
64-bit value A. These instructions accelerate address arithmetic, but may
be used in full generality for 64-bit integer arithmetic.
-
+
+
+
+
-
+
+
+
+
A
@@ -2281,27 +2644,47 @@
-
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
$A \cdot B$ with optional saturation. Note the multipliers can only handle up to
@@ -2317,24 +2700,41 @@
-
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
A
@@ -2347,8 +2747,10 @@
-
-
+
+
+
+
Selects the value of A in the subgroup lane given by B. This implements
subgroup broadcasts. It may be used as a primitive for screen space
@@ -2363,10 +2765,14 @@
-
+
+
+
-
+
+
+
$A \cdot B + C$
@@ -2377,18 +2783,29 @@
-
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
Left shifts its first source by a specified amount and bitwise ANDs it with the
@@ -2401,18 +2818,29 @@
-
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
Right shifts its first source by a specified amount and bitwise ANDs it with the
@@ -2428,18 +2856,29 @@
-
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
Left shifts its first source by a specified amount and bitwise ORs it with the
@@ -2452,18 +2891,29 @@
-
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
Right shifts its first source by a specified amount and bitwise ORs it with the
@@ -2479,18 +2929,29 @@
-
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
Left shifts its first source by a specified amount and bitwise XORs it with the
@@ -2503,18 +2964,29 @@
-
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
Right shifts its first source by a specified amount and bitwise XORs it with the
@@ -2530,7 +3002,9 @@
-
+
+
+
Mux between A and B based on the provided mask. The condition specified
as the `mux` modifier is evaluated on the mask. If true, `A` is chosen,
@@ -2545,7 +3019,9 @@
-
+
+
+
Mux between A and B based on the provided mask. The condition specified
as the `mux` modifier is evaluated on the mask. If true, `A` is chosen,
@@ -2560,7 +3036,9 @@
-
+
+
+
Mux between A and B based on the provided mask. The condition specified
as the `mux` modifier is evaluated on the mask. If true, `A` is chosen,
@@ -2575,8 +3053,10 @@
-
-
+
+
+
+
During a cube map transform, select the S coordinate given a selected face.
Z coordinate as 32-bit floating point
X coordinate as 32-bit floating point
@@ -2584,8 +3064,10 @@
-
-
+
+
+
+
During a cube map transform, select the T coordinate given a selected face.
Y coordinate as 32-bit floating point
Z coordinate as 32-bit floating point
@@ -2593,7 +3075,9 @@
-
+
+
+
Calculates $A | (B \ll 8) | (CD \ll 16)$ for 8-bit A and B and 16-bit CD.
@@ -2609,7 +3093,9 @@
-
+
+
+
Select the maximum absolute value of its arguments.
X coordinate as 32-bit floating point
Y coordinate as 32-bit floating point
@@ -2617,7 +3103,9 @@
-
+
+
+
Select the cube face index corresponding to the arguments.
X coordinate as 32-bit floating point
Y coordinate as 32-bit floating point
@@ -2625,7 +3113,6 @@
-
8-bit integer dot product between 4 channel vectors, intended for machine
learning. Available in both unsigned and signed variants, controlling
@@ -2638,10 +3125,16 @@
saturates.
-
+
+
+
+
-
+
+
+
+
A
B
@@ -2650,7 +3143,6 @@
-
Evaluates the given condition, do a logical or with the condition in
the result source, and return in the given result type (integer
@@ -2659,14 +3151,23 @@
when this is not desired, tie it to zero.
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
@@ -2676,7 +3177,6 @@
-
Evaluates the given condition, do a logical and with the condition in
the result source, and return in the given result type (integer
@@ -2684,14 +3184,23 @@
for chaining together conditions without intermediate bitwise arithmetic.
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
@@ -2701,7 +3210,6 @@
-
Evaluates the given condition, do a logical or with the condition in
the result source, and return in the given result type (integer
@@ -2710,10 +3218,16 @@
when this is not desired, tie it to zero.
-
+
+
+
+
-
+
+
+
+
@@ -2723,7 +3237,6 @@
-
Evaluates the given condition, do a logical and/or with the condition in
the result source, and return in the given result type (integer
@@ -2731,10 +3244,16 @@
for chaining together conditions without intermediate bitwise arithmetic.
-
+
+
+
+
-
+
+
+
+
@@ -2744,7 +3263,6 @@
-
Evaluates the given condition, do a logical or with the condition in
the result source, and return in the given result type (integer
@@ -2752,14 +3270,23 @@
for chaining together conditions without intermediate bitwise arithmetic.
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
@@ -2769,7 +3296,6 @@
-
Evaluates the given condition, do a logical and with the condition in
the result source, and return in the given result type (integer
@@ -2777,14 +3303,23 @@
for chaining together conditions without intermediate bitwise arithmetic.
-
+
+
+
+
-
+
+
+
+
-
+
+
+
+
@@ -2794,7 +3329,6 @@
-
Evaluates the given condition, do a logical and/or with the condition in
the result source, and return in the given result type (integer
@@ -2809,10 +3343,16 @@
the result of the low half comparison passed as the third source.
-
+
+
+
+
-
+
+
+
+
@@ -2822,7 +3362,9 @@
-
+
+
+
Adds an arbitrary 32-bit immediate embedded within the instruction stream.
If no modifiers are required, this is preferred to `IADD.i32` with a
@@ -2836,7 +3378,9 @@
-
+
+
+
Adds an arbitrary pair of 16-bit immediates embedded within the
instruction stream. If no modifiers are required, this is preferred to
@@ -2850,7 +3394,9 @@
-
+
+
+
Adds an arbitrary quad of 8-bit immediates embedded within the
instruction stream. If no modifiers are required, this is preferred to
@@ -2863,7 +3409,9 @@
-
+
+
+
Adds an arbitrary 32-bit immediate embedded within the instruction stream.
If no modifiers are required, this is preferred to `FADD.f32` with a
@@ -2875,7 +3423,9 @@
-
+
+
+
Adds an arbitrary pair of 16-bit immediates embedded within the
instruction stream. If no modifiers are required, this is preferred to
@@ -2888,8 +3438,10 @@
-
-
+
+
+
+
@@ -2901,8 +3453,10 @@
-
-
+
+
+
+
@@ -2914,8 +3468,10 @@
-
-
+
+
+
+
@@ -2926,8 +3482,10 @@
-
-
+
+
+
+
@@ -2938,8 +3496,10 @@
-
-
+
+
+
+
@@ -2956,8 +3516,10 @@
-
-
+
+
+
+
@@ -2974,7 +3536,9 @@
-
+
+
+
Unfiltered textured instruction.
@@ -2999,7 +3563,9 @@
-
+
+
+
Ordinary texturing instruction using a sampler.
@@ -3026,7 +3592,9 @@
-
+
+
+
Texture gather instruction.
@@ -3054,7 +3622,9 @@
-
+
+
+
Texture sample with explicit gradient.
@@ -3079,7 +3649,9 @@
-
+
+
+
Pair of texture instructions.
@@ -3103,7 +3675,9 @@
-
+
+
+
Only works for FP32 varyings. Performance characteristics are similar
to LD_VAR_BUF_IMM_F32.v2.f32 followed by TEX, using both V and T units.
@@ -3126,7 +3700,9 @@
-
+
+
+
Only works for FP32 varyings. Performance characteristics are similar
to LD_VAR_BUF_IMM_F32.v2.f32 followed by TEX, using both V and T units.
@@ -3150,7 +3726,9 @@
-
+
+
+
Only works for FP32 varyings. Performance characteristics are similar
to LD_VAR_BUF_IMM_F32.v2.f32 followed by TEX, using both V and T units.
@@ -3174,7 +3752,9 @@
-
+
+
+
Only works for FP32 varyings. Performance characteristics are similar
to LD_VAR_BUF_IMM_F32.v2.f32 followed by TEX_DUAL, using both V and T units.
@@ -3197,7 +3777,9 @@
-
+
+
+
Only works for FP32 varyings. Performance characteristics are similar
to LD_VAR_IMM_F32.v2.f32 followed by TEX, using both V and T units.
@@ -3220,7 +3802,9 @@
-
+
+
+
Only works for FP32 varyings. Performance characteristics are similar
to LD_VAR_IMM_F32.v2.f32 followed by TEX, using both V and T units.
@@ -3244,7 +3828,9 @@
-
+
+
+
Only works for FP32 varyings. Performance characteristics are similar
to LD_VAR_IMM_F32.v2.f32 followed by TEX, using both V and T units.
@@ -3268,7 +3854,9 @@
-
+
+
+
Only works for FP32 varyings. Performance characteristics are similar
to LD_VAR_IMM_F32.v2.f32 followed by TEX_DUAL, using both V and T units.
@@ -3291,7 +3879,9 @@
-
+
+
+
First calculates $A \cdot B + C$ and then biases the exponent by D. Used in
special transcendental function sequences. It should not be used for
@@ -3307,7 +3897,9 @@
-
+
+
+
First calculates $A \cdot B + C$ and then biases the exponent by D. If $A
= 0$ or $B = 0$, the multiply $A \cdot B$ is treated as zero even if an
@@ -3324,7 +3916,9 @@
-
+
+
+
First calculates $A \cdot B + C$ and then biases the exponent by D. If $A
= 0$ or $B = 0$, the multiply is treated as $A$ even if an
@@ -3341,7 +3935,9 @@
-
+
+
+
First calculates $A \cdot B + C$ and then biases the exponent by D,
interpreted as a 16-bit value. Used in special transcendental function
diff --git a/src/panfrost/compiler/bifrost/valhall/asm.py b/src/panfrost/compiler/bifrost/valhall/asm.py
index ba4127fdc6d..2e001fda929 100644
--- a/src/panfrost/compiler/bifrost/valhall/asm.py
+++ b/src/panfrost/compiler/bifrost/valhall/asm.py
@@ -315,9 +315,8 @@ def parse_asm(line):
operands = operands[len(ins.immediates):]
# Encode the operation itself
- encoded |= (ins.opcode.value << ins.opcode.start)
- if ins.opcode2:
- encoded |= (ins.opcode2.value << ins.opcode2.start)
+ for subcode in ins.opcode:
+ encoded |= (subcode.value << subcode.start)
# Encode FAU page
if fau.page:
diff --git a/src/panfrost/compiler/bifrost/valhall/disasm.py b/src/panfrost/compiler/bifrost/valhall/disasm.py
index 030423ef013..a627dd8ac5c 100644
--- a/src/panfrost/compiler/bifrost/valhall/disasm.py
+++ b/src/panfrost/compiler/bifrost/valhall/disasm.py
@@ -194,39 +194,42 @@ va_print_dest(FILE *fp, uint8_t dest, bool can_mask)
% endfor
</%def>
+<%def name="recurse_subcodes(op_bucket)">
+%if op_bucket.instr:
+${print_instr(op_bucket.instr)}
+%else:
+ opcode = (instr >> ${op_bucket.start}) & ${hex(op_bucket.mask)};
+ switch (opcode) {
+%for op in op_bucket.children:
+ case ${hex(op)}:
+ {
+${recurse_subcodes(op_bucket.children[op])}
+ break;
+ }
+%endfor
+ }
+%endif
+</%def>
+
+
void
va_disasm_instr(FILE *fp, uint64_t instr)
{
- unsigned primary_opc = (instr >> 48) & MASK(9);
+ unsigned opcode;
unsigned fau_page = (instr >> 57) & MASK(2);
- unsigned secondary_opc = 0;
- switch (primary_opc) {
-% for bucket in OPCODES:
- <%
- ops = OPCODES[bucket]
- ambiguous = (len(ops) > 1)
- %>
-% if len(ops) > 0:
- case ${hex(bucket)}:
-% if ambiguous:
- secondary_opc = (instr >> ${ops[0].opcode2.start}) & ${hex(ops[0].opcode2.mask)};
-% endif
-% for op in ops:
-% if ambiguous:
+${recurse_subcodes(OPCODES)}
+}
- if (secondary_opc == ${op.opcode2.value}) {
-% endif
-${print_instr(op)}
-% if ambiguous:
- }
-% endif
-% endfor
- break;
-
-% endif
-% endfor
- }
+static bool is_branch(uint64_t instr)
+{
+<% (exact, mask) = OPCODES.get_exact_mask("BRANCHZ") %>
+ if ((instr & ${hex(mask)}) == ${hex(exact)})
+ return true;
+<% (exact, mask) = OPCODES.get_exact_mask("BRANCHZI") %>
+ if ((instr & ${hex(mask)}) == ${hex(exact)})
+ return true;
+ return false;
}
void
@@ -259,13 +262,8 @@ disassemble_valhall(FILE *fp, const void *code, size_t size, bool verbose)
va_disasm_instr(fp, instr);
fprintf(fp, "\\n");
- /* Detect branches */
- uint64_t opcode = (instr >> 48) & MASK(9);
- bool branchz = (opcode == 0x1F);
- bool branchzi = (opcode == 0x2F);
-
/* Separate blocks visually by inserting whitespace after branches */
- if (branchz || branchzi)
+ if (is_branch(instr))
fprintf(fp, "\\n");
}
@@ -273,30 +271,47 @@ disassemble_valhall(FILE *fp, const void *code, size_t size, bool verbose)
}
"""
-# Bucket by opcode for hierarchical disassembly
-OPCODE_BUCKETS = {}
+class OpBucket:
+    """Node of the opcode decode tree: one bitfield (start, mask) per level."""
+    def __init__(self):
+        self.start = None    # bit offset of this level's subcode field
+        self.mask = None     # unshifted mask of this level's subcode field
+        self.instr = None    # instruction decoded at this node (leaves only)
+        self.children = {}   # subcode value -> child OpBucket
+
+    def insert(self, subcodes, ins):
+        """Add ins under the path given by its remaining subcodes."""
+        if not subcodes:
+            # A leaf must be unique and childless, else decoding is ambiguous
+            assert self.instr is None and not self.children, ins.name
+            self.instr = ins
+            return
+        sc = subcodes[0]
+        # No instruction may end where another one keeps decoding, and all
+        # instructions through this bucket must agree on the bitfield
+        assert self.instr is None, ins.name
+        assert self.start in (None, sc.start) and self.mask in (None, sc.mask)
+        self.start, self.mask = sc.start, sc.mask
+        self.children.setdefault(sc.value, OpBucket()).insert(subcodes[1:], ins)
+
+    def get_exact_mask(self, op_name, exact = 0, mask = 0):
+        """Return (exact, mask) such that (instr & mask) == exact identifies
+        op_name, or () if op_name is not in this subtree."""
+        if self.instr is not None:
+            return (exact, mask) if self.instr.name == op_name else ()
+        for op, child in self.children.items():
+            found = child.get_exact_mask(op_name, exact | (op << self.start),
+                                         mask | (self.mask << self.start))
+            if found:
+                return found
+        return ()
+
+# Build opcode hierarchy:
+OPCODES = OpBucket()
for ins in instructions:
- opc = ins.opcode.value
- OPCODE_BUCKETS[opc] = OPCODE_BUCKETS.get(opc, []) + [ins]
-
-# Check that each bucket may be disambiguated
-for op in OPCODE_BUCKETS:
- bucket = OPCODE_BUCKETS[op]
-
- # Nothing to disambiguate
- if len(bucket) < 2:
- continue
-
- SECONDARY = {}
- for ins in bucket:
- # Number of sources determines opcode2 placement, must be consistent
- assert(len(ins.srcs) == len(bucket[0].srcs))
-
- # Must not repeat, else we're ambiguous
- assert(ins.opcode2.value not in SECONDARY)
- SECONDARY[ins.opcode2.value] = ins
+ OPCODES.insert(ins.opcode, ins)
try:
- print(Template(template).render(OPCODES = OPCODE_BUCKETS, IMMEDIATES = immediates, ENUMS = enums, typesize = typesize, safe_name = safe_name))
+ print(Template(template).render(OPCODES = OPCODES, IMMEDIATES = immediates, ENUMS = enums, typesize = typesize, safe_name = safe_name))
except:
print(exceptions.text_error_template().render())
diff --git a/src/panfrost/compiler/bifrost/valhall/valhall.c.py b/src/panfrost/compiler/bifrost/valhall/valhall.c.py
index 91d4f7c0c07..3645092b836 100644
--- a/src/panfrost/compiler/bifrost/valhall/valhall.c.py
+++ b/src/panfrost/compiler/bifrost/valhall/valhall.c.py
@@ -147,9 +147,9 @@ valhall_opcodes[BI_NUM_OPCODES] = {
# Exact value to be ORed in to every opcode
def exact_op(op):
- exact_op = (op.opcode.value << op.opcode.start)
- if op.opcode2:
- exact_op |= (op.opcode2.value << op.opcode2.start)
+ exact_op = 0
+ for subcode in op.opcode:
+ exact_op |= (subcode.value << subcode.start)
return exact_op
try:
diff --git a/src/panfrost/compiler/bifrost/valhall/valhall.py b/src/panfrost/compiler/bifrost/valhall/valhall.py
index cddc277d2aa..366fc3c240c 100644
--- a/src/panfrost/compiler/bifrost/valhall/valhall.py
+++ b/src/panfrost/compiler/bifrost/valhall/valhall.py
@@ -164,12 +164,11 @@ class Opcode:
self.mask = mask
class Instruction:
- def __init__(self, name, opcode, opcode2, srcs = [], dests = [], immediates = [], modifiers = [], staging = None, unit = None):
+ def __init__(self, name, opcode, srcs = [], dests = [], immediates = [], modifiers = [], staging = None, unit = None):
self.name = name
self.srcs = srcs
self.dests = dests
self.opcode = opcode
- self.opcode2 = opcode2
self.immediates = immediates
self.modifiers = modifiers
self.staging = staging
@@ -180,7 +179,6 @@ class Instruction:
self.message = unit not in ["FMA", "CVT", "SFU"]
assert(len(dests) == 0 or not staging)
- assert(not opcode2 or (opcode2.value & opcode2.mask) == opcode2.value)
def __str__(self):
return self.name
@@ -226,20 +224,25 @@ def build_modifier(el):
return Modifier(name, start, size, implied)
def build_opcode(el, name):
+ op_arr = []
opcode = el.find(name)
if opcode is None:
return None
- value = int(opcode.get('val'), base=0)
- start = int(opcode.get('start'))
- mask = int(opcode.get('mask'), base=0)
- return Opcode(value, start, mask)
+
+ for subcode in opcode:
+ value = int(subcode.get('val'), base=0)
+ start = int(subcode.get('start'))
+ mask = int(subcode.get('mask'), base=0)
+ assert((value & mask) == value)
+ op_arr.append(Opcode(value, start, mask))
+
+ return op_arr
# Build a single instruction from XML and group based overrides
def build_instr(el, overrides = {}):
# Get overridables
name = overrides.get('name') or el.attrib.get('name')
opcode = overrides.get('opcode') or build_opcode(el, 'opcode')
- opcode2 = overrides.get('opcode2') or build_opcode(el, 'opcode2')
unit = overrides.get('unit') or el.attrib.get('unit')
# Get explicit sources/dests
@@ -279,7 +282,7 @@ def build_instr(el, overrides = {}):
elif mod.tag =='va_mod':
modifiers.append(build_modifier(mod))
- instr = Instruction(name, opcode, opcode2, srcs = sources, dests = dests, immediates = imms, modifiers = modifiers, staging = staging, unit = unit)
+ instr = Instruction(name, opcode, srcs = sources, dests = dests, immediates = imms, modifiers = modifiers, staging = staging, unit = unit)
instructions.append(instr)
@@ -290,7 +293,6 @@ def build_group(el):
build_instr(el, overrides = {
'name': ins.attrib['name'],
'opcode': build_opcode(ins, 'opcode'),
- 'opcode2': build_opcode(ins, 'opcode2'),
'unit': ins.attrib.get('unit'),
})