diff --git a/src/panfrost/compiler/bifrost/valhall/ISA.xml b/src/panfrost/compiler/bifrost/valhall/ISA.xml index dc8b1fa84ba..ed8dfeb0cc6 100644 --- a/src/panfrost/compiler/bifrost/valhall/ISA.xml +++ b/src/panfrost/compiler/bifrost/valhall/ISA.xml @@ -792,7 +792,9 @@ is a duplicate instruction in the Bifrost or pseudo XML files --> - + + + Do nothing. Useful at the start of a block for waiting on slots required by the first actual instruction of the block, to reconcile dependencies @@ -801,7 +803,9 @@ - + + + Branches to a specified relative offset if its source is nonzero (default) or if its source is zero (if `.eq` is set). The offset is 27-bits and @@ -824,7 +828,9 @@ - + + + Evaluates the given condition, and if it passes, discards the current fragment and terminates the thread. Only valid in a **fragment** shader. @@ -835,7 +841,9 @@ - + + + Jump to an indirectly specified (absolute or relative) address. Used to jump to blend shaders at the end of a fragment shader. @@ -848,7 +856,9 @@ - + + + General-purpose barrier. Must use slot #7. Must be paired with a `.wait` flow on the instruction. @@ -858,10 +868,14 @@ - + + + - + + + Evaluates the given condition and outputs either the true source or the @@ -876,16 +890,24 @@ - + + + - + + + - + + + - + + + Evaluates the given condition and outputs either the true source or the @@ -904,7 +926,9 @@ - + + + @@ -919,10 +943,14 @@ Interpolates a given varying from hardware buffer - + + + - + + + @@ -939,10 +967,14 @@ Interpolates a given varying from hardware buffer - + + + - + + + @@ -957,7 +989,9 @@ - + + + Interpolates a given varying from a software buffer @@ -971,7 +1005,9 @@ - + + + Interpolates a given varying from a software buffer @@ -986,7 +1022,9 @@ - + + + Fetches a given varying from a software buffer @@ -997,7 +1035,9 @@ - + + + Fetches a given varying from a software buffer @@ -1009,8 +1049,10 @@ - - + + + + Load `vecsize` components from the attribute descriptor at entry `index` of resource table `table` at index (vertex ID, instance ID), converting @@ -1028,8 +1070,10 @@ - - + + + + Load `vecsize` components from the attribute descriptor at the specified location at index (vertex ID, instance ID), converting @@ -1048,7 +1092,9 @@ - + + + Load the 64-bit global clock, either a cycle counter or the system clock. @@ -1056,8 +1102,10 @@ - - + + + + Load `vecsize` components from the texture descriptor at entry `index` of resource table `table`, converting @@ -1075,8 +1123,10 @@ - - + + + + Load `vecsize` components from the texture descriptor at the specified location at index, converting @@ -1093,8 +1143,10 @@ - - + + + + Load the effective address of an attribute specified with the given immediate index. Returns three staging register: the low/high @@ -1110,8 +1162,10 @@ - - + + + + Load the effective address of an attribute specified with the given index. Returns three staging register: the low/high @@ -1127,8 +1181,10 @@ - - + + + + Load the effective address of a texel from the image specified with the given immediate index. Returns three staging registers: the low/high @@ -1149,8 +1205,10 @@ - - + + + + Load the effective address of a texel from the image specified with the given index. Returns three staging register: the low/high @@ -1171,8 +1229,10 @@ - - + + + + Loads a buffer descriptor. If bits 25...31 of the mode descriptor are all-ones, load from the buffer descriptors in the table indexed by the @@ -1190,8 +1250,10 @@ - - + + + + Loads a buffer descriptor. If bits 25...31 of the mode descriptor are all-ones, load from the buffer descriptors in the table indexed by the @@ -1209,8 +1271,10 @@ - - + + + + Loads a buffer descriptor. If bits 25...31 of the mode descriptor are all-ones, load from the buffer descriptors in the table indexed by the @@ -1228,8 +1292,10 @@ - - + + + + Loads a buffer descriptor. If bits 25...31 of the mode descriptor are all-ones, load from the buffer descriptors in the table indexed by the @@ -1247,8 +1313,10 @@ - - + + + + Loads a buffer descriptor. If bits 25...31 of the mode descriptor are all-ones, load from the buffer descriptors in the table indexed by the @@ -1266,8 +1334,10 @@ - - + + + + Loads a buffer descriptor. If bits 25...31 of the mode descriptor are all-ones, load from the buffer descriptors in the table indexed by the @@ -1285,8 +1355,10 @@ - - + + + + Loads a buffer descriptor. If bits 25...31 of the mode descriptor are all-ones, load from the buffer descriptors in the table indexed by the @@ -1304,8 +1376,10 @@ - - + + + + Loads a buffer descriptor. If bits 25...31 of the mode descriptor are all-ones, load from the buffer descriptors in the table indexed by the @@ -1324,7 +1398,9 @@ - + + + Load effective address of a buffer with an offset added. @@ -1336,7 +1412,9 @@ - + + + Load effective address of a buffer with an immediate offset added. @@ -1349,8 +1427,10 @@ - - + + + + Loads from main memory @@ -1363,8 +1443,10 @@ - - + + + + Loads from main memory @@ -1377,8 +1459,10 @@ - - + + + + Loads from main memory @@ -1391,8 +1475,10 @@ - - + + + + Loads from main memory @@ -1405,8 +1491,10 @@ - - + + + + Loads from main memory @@ -1419,8 +1507,10 @@ - - + + + + Loads from main memory @@ -1433,8 +1523,10 @@ - - + + + + Loads from main memory @@ -1447,8 +1539,10 @@ - - + + + + Loads from main memory @@ -1461,32 +1555,55 @@ - Stores to main memory - + + + + - + + + + - + + + + - + + + + - + + + + - + + + + - + + + + - + + + + @@ -1496,7 +1613,9 @@ - + + + Load effective address of a simple buffer with an offset added. @@ -1508,7 +1627,9 @@ - + + + Load from memory with data conversion. The address to load from is given in the first source, which must be a 64-bit register (a pair of 32-bit @@ -1526,7 +1647,9 @@ - + + + Store to memory with data conversion. The address to store to is given in the first source, which must be a 64-bit register (a pair of 32-bit @@ -1546,7 +1669,9 @@ - + + + Loads a given render target, specified in the pixel indices descriptor, at a given location and sample, and convert to the format specified in the @@ -1564,7 +1689,9 @@ - + + + Store to given render target, specified in the pixel indices descriptor, at a given location and sample, and convert to the format specified in the @@ -1581,7 +1708,9 @@ - + + + Blends a given render target. This loads the API-specified blend state for the render target from the first source. Blend descriptors are available @@ -1618,7 +1747,9 @@ - + + + Does alpha-to-coverage testing, updating the sample coverage mask. ATEST does not do an implicit discard. It should be executed before the first @@ -1632,7 +1763,9 @@ - + + + Programatically writes out depth, stencil, or both, depending on which modifiers are set. Used to implement gl_FragDepth and gl_FragStencil. @@ -1648,7 +1781,6 @@ - Performs the given data conversion. Note that floating-point rounding is handled via the same hardware and therefore shares an encoding. Round mode @@ -1657,20 +1789,32 @@ - + + + + - + + + + - + + + + - + + + + @@ -1678,141 +1822,197 @@ - Performs the given data conversion. - + + + + - + + + + - + + + + - + + + + Value to convert - Performs the given data conversion. - + + + + - + + + + Value to convert - Performs the given data conversion. - + + + + - + + + + - + + + + - + + + + Value to convert - - + + + + Converts up with the specified round mode. Value to convert - Performs the given data conversion. - + + + + - + + + + - + + + + - + + + + Value to convert - Performs the given data conversion. - + + + + - + + + + - + + + + - + + + + Value to convert - Performs the given rounding, using the convert unit. - + + + + - + + + + @@ -1820,15 +2020,19 @@ - - + + + + Canonical register-to-register move. - - + + + + Used as a primitive for various bitwise operations. @@ -1836,8 +2040,10 @@ - - + + + + Used as a primitive for various bitwise operations. @@ -1845,8 +2051,10 @@ - - + + + + Used as a primitive for various bitwise operations. @@ -1854,8 +2062,10 @@ - - + + + + 64-bit abs may be constructed in 4 instructions (5 clocks) by checking the sign with `ICMP.s32.lt.m1 hi, 0` and negating based on the result with @@ -1865,21 +2075,27 @@ - - + + + + - - + + + + - - + + + + Only available as 32-bit. Smaller bitsizes require explicit conversions. 64-bit popcount may be constructed in 3 clocks by separate 32-bit @@ -1890,8 +2106,10 @@ - - + + + + Only available as 32-bit. Other bitsizes may be derived with swizzles. @@ -1899,8 +2117,10 @@ - - + + + + For fully featured bitwise operation, see the shift opcodes. @@ -1908,8 +2128,10 @@ - - + + + + For fully featured bitwise operation, see the shift opcodes. @@ -1917,7 +2139,9 @@ - + + + Returns the mask of lanes ever active within the warp (subgroup), such that the source is nonzero. The number of work-items in a subgroup is @@ -1934,12 +2158,17 @@ - - + + + + - + + + + Flush special float values. The ftz modifier flushes subnormal values to @@ -1954,20 +2183,31 @@ - - + + + + - + + + + - + + + + - + + + + Breaks up the floating-point input into its fractional (mantissa) and @@ -1982,36 +2222,65 @@ - - + + + + - + + + + - + + + + - + + + + - + + + + - + + + + - + + + + - + + + + - + + + + - + + + + Performs a given special function. The floating-point reciprocal (`FRCP`) @@ -2025,18 +2294,29 @@ - - + + + + - + + + + - + + + + - + + + + Performs a given special function. The trigonometric tables @@ -2047,12 +2327,17 @@ - - + + + + - + + + + $A + B$ @@ -2063,12 +2348,17 @@ - - + + + + - + + + + $\min \{ A, B \}$ @@ -2077,12 +2367,17 @@ - - + + + + - + + + + $\max \{ A, B \}$ @@ -2092,9 +2387,11 @@ - - + + + + Given a pair of 32-bit floats, output a pair of 16-bit floats packed into @@ -2107,12 +2404,17 @@ - - + + + + - + + + + Computes $A \cdot 2^B$ by adding B to the exponent of A. Used to calculate @@ -2127,8 +2429,10 @@ - - + + + + Calculates the base-2 exponent of an argument specified as a 8:24 fixed-point. The original argument is passed as well for correct handling @@ -2140,8 +2444,10 @@ - - + + + + Performs a floating-point addition specialized for logarithm computation. @@ -2151,8 +2457,10 @@ - - + + + + Used for `atan2()` implementation. Destination is two 16-bit values (int and float) for the first form, and a single 32-bit float when @@ -2164,7 +2472,6 @@ - $A + B$ with optional saturation. @@ -2172,30 +2479,54 @@ canonical lowering for swizzles. - + + + + - + + + + - + + + + - + + + + - + + + + - + + + + - + + + + - + + + + A B @@ -2203,40 +2534,65 @@ - - + + + + Calculates $A | (B \ll 16)$. Used to implement `(ushort2)(A, B)` A B - - + + + + - + + + + - + + + + - + + + + - + + + + - + + + + - + + + + - + + + + $A - B$ with optional saturation A @@ -2245,7 +2601,6 @@ - Similar to SHADDX, but especially used for loading offsets into WLS. Usually this is only required for atomic operations, which cannot @@ -2254,7 +2609,10 @@ .neg indicates SEG_SUB instead. - + + + + @@ -2263,17 +2621,22 @@ - Sign or zero extend B to 64-bits, left-shift by `shift`, and add the 64-bit value A. These instructions accelerate address arithmetic, but may be used in full generality for 64-bit integer arithmetic. - + + + + - + + + + A @@ -2281,27 +2644,47 @@ - - + + + + - + + + + - + + + + - + + + + - + + + + - + + + + - + + + + $A \cdot B$ with optional saturation. Note the multipliers can only handle up to @@ -2317,24 +2700,41 @@ - - + + + + - + + + + - + + + + - + + + + - + + + + - + + + + A @@ -2347,8 +2747,10 @@ - - + + + + Selects the value of A in the subgroup lane given by B. This implements subgroup broadcasts. It may be used as a primitive for screen space @@ -2363,10 +2765,14 @@ - + + + - + + + $A \cdot B + C$ @@ -2377,18 +2783,29 @@ - - + + + + - + + + + - + + + + - + + + + Left shifts its first source by a specified amount and bitwise ANDs it with the @@ -2401,18 +2818,29 @@ - - + + + + - + + + + - + + + + - + + + + Right shifts its first source by a specified amount and bitwise ANDs it with the @@ -2428,18 +2856,29 @@ - - + + + + - + + + + - + + + + - + + + + Left shifts its first source by a specified amount and bitwise ORs it with the @@ -2452,18 +2891,29 @@ - - + + + + - + + + + - + + + + - + + + + Right shifts its first source by a specified amount and bitwise ORs it with the @@ -2479,18 +2929,29 @@ - - + + + + - + + + + - + + + + - + + + + Left shifts its first source by a specified amount and bitwise XORs it with the @@ -2503,18 +2964,29 @@ - - + + + + - + + + + - + + + + - + + + + Right shifts its first source by a specified amount and bitwise XORs it with the @@ -2530,7 +3002,9 @@ - + + + Mux between A and B based on the provided mask. The condition specified as the `mux` modifier is evaluated on the mask. If true, `A` is chosen, @@ -2545,7 +3019,9 @@ - + + + Mux between A and B based on the provided mask. The condition specified as the `mux` modifier is evaluated on the mask. If true, `A` is chosen, @@ -2560,7 +3036,9 @@ - + + + Mux between A and B based on the provided mask. The condition specified as the `mux` modifier is evaluated on the mask. If true, `A` is chosen, @@ -2575,8 +3053,10 @@ - - + + + + During a cube map transform, select the S coordinate given a selected face. Z coordinate as 32-bit floating point X coordinate as 32-bit floating point @@ -2584,8 +3064,10 @@ - - + + + + During a cube map transform, select the T coordinate given a selected face. Y coordinate as 32-bit floating point Z coordinate as 32-bit floating point @@ -2593,7 +3075,9 @@ - + + + Calculates $A | (B \ll 8) | (CD \ll 16)$ for 8-bit A and B and 16-bit CD. @@ -2609,7 +3093,9 @@ - + + + Select the maximum absolute value of its arguments. X coordinate as 32-bit floating point Y coordinate as 32-bit floating point @@ -2617,7 +3103,9 @@ - + + + Select the cube face index corresponding to the arguments. X coordinate as 32-bit floating point Y coordinate as 32-bit floating point @@ -2625,7 +3113,6 @@ - 8-bit integer dot product between 4 channel vectors, intended for machine learning. Available in both unsigned and signed variants, controlling @@ -2638,10 +3125,16 @@ saturates. - + + + + - + + + + A B @@ -2650,7 +3143,6 @@ - Evaluates the given condition, do a logical or with the condition in the result source, and return in the given result type (integer @@ -2659,14 +3151,23 @@ when this is not desired, tie it to zero. - + + + + - + + + + - + + + + @@ -2676,7 +3177,6 @@ - Evaluates the given condition, do a logical and with the condition in the result source, and return in the given result type (integer @@ -2684,14 +3184,23 @@ for chaining together conditions without intermediate bitwise arithmetic. - + + + + - + + + + - + + + + @@ -2701,7 +3210,6 @@ - Evaluates the given condition, do a logical or with the condition in the result source, and return in the given result type (integer @@ -2710,10 +3218,16 @@ when this is not desired, tie it to zero. - + + + + - + + + + @@ -2723,7 +3237,6 @@ - Evaluates the given condition, do a logical and/or with the condition in the result source, and return in the given result type (integer @@ -2731,10 +3244,16 @@ for chaining together conditions without intermediate bitwise arithmetic. - + + + + - + + + + @@ -2744,7 +3263,6 @@ - Evaluates the given condition, do a logical or with the condition in the result source, and return in the given result type (integer @@ -2752,14 +3270,23 @@ for chaining together conditions without intermediate bitwise arithmetic. - + + + + - + + + + - + + + + @@ -2769,7 +3296,6 @@ - Evaluates the given condition, do a logical and with the condition in the result source, and return in the given result type (integer @@ -2777,14 +3303,23 @@ for chaining together conditions without intermediate bitwise arithmetic. - + + + + - + + + + - + + + + @@ -2794,7 +3329,6 @@ - Evaluates the given condition, do a logical and/or with the condition in the result source, and return in the given result type (integer @@ -2809,10 +3343,16 @@ the result of the low half comparison passed as the third source. - + + + + - + + + + @@ -2822,7 +3362,9 @@ - + + + Adds an arbitrary 32-bit immediate embedded within the instruction stream. If no modifiers are required, this is preferred to `IADD.i32` with a @@ -2836,7 +3378,9 @@ - + + + Adds an arbitrary pair of 16-bit immediates embedded within the instruction stream. If no modifiers are required, this is preferred to @@ -2850,7 +3394,9 @@ - + + + Adds an arbitrary quad of 8-bit immediates embedded within the instruction stream. If no modifiers are required, this is preferred to @@ -2863,7 +3409,9 @@ - + + + Adds an arbitrary 32-bit immediate embedded within the instruction stream. If no modifiers are required, this is preferred to `FADD.f32` with a @@ -2875,7 +3423,9 @@ - + + + Adds an arbitrary pair of 16-bit immediates embedded within the instruction stream. If no modifiers are required, this is preferred to @@ -2888,8 +3438,10 @@ - - + + + + @@ -2901,8 +3453,10 @@ - - + + + + @@ -2914,8 +3468,10 @@ - - + + + + @@ -2926,8 +3482,10 @@ - - + + + + @@ -2938,8 +3496,10 @@ - - + + + + @@ -2956,8 +3516,10 @@ - - + + + + @@ -2974,7 +3536,9 @@ - + + + Unfiltered textured instruction. @@ -2999,7 +3563,9 @@ - + + + Ordinary texturing instruction using a sampler. @@ -3026,7 +3592,9 @@ - + + + Texture gather instruction. @@ -3054,7 +3622,9 @@ - + + + Texture sample with explicit gradient. @@ -3079,7 +3649,9 @@ - + + + Pair of texture instructions. @@ -3103,7 +3675,9 @@ - + + + Only works for FP32 varyings. Performance characteristics are similar to LD_VAR_BUF_IMM_F32.v2.f32 followed by TEX, using both V and T units. @@ -3126,7 +3700,9 @@ - + + + Only works for FP32 varyings. Performance characteristics are similar to LD_VAR_BUF_IMM_F32.v2.f32 followed by TEX, using both V and T units. @@ -3150,7 +3726,9 @@ - + + + Only works for FP32 varyings. Performance characteristics are similar to LD_VAR_BUF_IMM_F32.v2.f32 followed by TEX, using both V and T units. @@ -3174,7 +3752,9 @@ - + + + Only works for FP32 varyings. Performance characteristics are similar to LD_VAR_BUF_IMM_F32.v2.f32 followed by TEX_DUAL, using both V and T units. @@ -3197,7 +3777,9 @@ - + + + Only works for FP32 varyings. Performance characteristics are similar to LD_VAR_IMM_F32.v2.f32 followed by TEX, using both V and T units. @@ -3220,7 +3802,9 @@ - + + + Only works for FP32 varyings. Performance characteristics are similar to LD_VAR_IMM_F32.v2.f32 followed by TEX, using both V and T units. @@ -3244,7 +3828,9 @@ - + + + Only works for FP32 varyings. Performance characteristics are similar to LD_VAR_IMM_F32.v2.f32 followed by TEX, using both V and T units. @@ -3268,7 +3854,9 @@ - + + + Only works for FP32 varyings. Performance characteristics are similar to LD_VAR_IMM_F32.v2.f32 followed by TEX_DUAL, using both V and T units. @@ -3291,7 +3879,9 @@ - + + + First calculates $A \cdot B + C$ and then biases the exponent by D. Used in special transcendental function sequences. It should not be used for @@ -3307,7 +3897,9 @@ - + + + First calculates $A \cdot B + C$ and then biases the exponent by D. If $A = 0$ or $B = 0$, the multiply $A \cdot B$ is treated as zero even if an @@ -3324,7 +3916,9 @@ - + + + First calculates $A \cdot B + C$ and then biases the exponent by D. If $A = 0$ or $B = 0$, the multiply is treated as $A$ even if an @@ -3341,7 +3935,9 @@ - + + + First calculates $A \cdot B + C$ and then biases the exponent by D, interpreted as a 16-bit value. Used in special transcendental function diff --git a/src/panfrost/compiler/bifrost/valhall/asm.py b/src/panfrost/compiler/bifrost/valhall/asm.py index ba4127fdc6d..2e001fda929 100644 --- a/src/panfrost/compiler/bifrost/valhall/asm.py +++ b/src/panfrost/compiler/bifrost/valhall/asm.py @@ -315,9 +315,8 @@ def parse_asm(line): operands = operands[len(ins.immediates):] # Encode the operation itself - encoded |= (ins.opcode.value << ins.opcode.start) - if ins.opcode2: - encoded |= (ins.opcode2.value << ins.opcode2.start) + for subcode in ins.opcode: + encoded |= (subcode.value << subcode.start) # Encode FAU page if fau.page: diff --git a/src/panfrost/compiler/bifrost/valhall/disasm.py b/src/panfrost/compiler/bifrost/valhall/disasm.py index 030423ef013..a627dd8ac5c 100644 --- a/src/panfrost/compiler/bifrost/valhall/disasm.py +++ b/src/panfrost/compiler/bifrost/valhall/disasm.py @@ -194,39 +194,42 @@ va_print_dest(FILE *fp, uint8_t dest, bool can_mask) % endfor +<%def name="recurse_subcodes(op_bucket)"> +%if op_bucket.instr: +${print_instr(op_bucket.instr)} +%else: + opcode = (instr >> ${op_bucket.start}) & ${hex(op_bucket.mask)}; + switch (opcode) { +%for op in op_bucket.children: + case ${hex(op)}: + { +${recurse_subcodes(op_bucket.children[op])} + break; + } +%endfor + } +%endif + + + void va_disasm_instr(FILE *fp, uint64_t instr) { - unsigned primary_opc = (instr >> 48) & MASK(9); + unsigned opcode; unsigned fau_page = (instr >> 57) & MASK(2); - unsigned secondary_opc = 0; - switch (primary_opc) { -% for bucket in OPCODES: - <% - ops = OPCODES[bucket] - ambiguous = (len(ops) > 1) - %> -% if len(ops) > 0: - case ${hex(bucket)}: -% if ambiguous: - secondary_opc = (instr >> ${ops[0].opcode2.start}) & ${hex(ops[0].opcode2.mask)}; -% endif -% for op in ops: -% if ambiguous: +${recurse_subcodes(OPCODES)} +} - if (secondary_opc == ${op.opcode2.value}) { -% endif -${print_instr(op)} -% if ambiguous: - } -% endif -% endfor - break; - -% endif -% endfor - } +static bool is_branch(uint64_t instr) +{ +<% (exact, mask) = OPCODES.get_exact_mask("BRANCHZ") %> + if ((instr & ${hex(mask)}) == ${hex(exact)}) + return true; +<% (exact, mask) = OPCODES.get_exact_mask("BRANCHZI") %> + if ((instr & ${hex(mask)}) == ${hex(exact)}) + return true; + return false; } void @@ -259,13 +262,8 @@ disassemble_valhall(FILE *fp, const void *code, size_t size, bool verbose) va_disasm_instr(fp, instr); fprintf(fp, "\\n"); - /* Detect branches */ - uint64_t opcode = (instr >> 48) & MASK(9); - bool branchz = (opcode == 0x1F); - bool branchzi = (opcode == 0x2F); - /* Separate blocks visually by inserting whitespace after branches */ - if (branchz || branchzi) + if (is_branch(instr)) fprintf(fp, "\\n"); } @@ -273,30 +271,47 @@ disassemble_valhall(FILE *fp, const void *code, size_t size, bool verbose) } """ -# Bucket by opcode for hierarchical disassembly -OPCODE_BUCKETS = {} +class OpBucket: + def __init__(self): + self.start = None + self.mask = None + self.instr = None + self.children = {} + + def insert(self, subcodes, ins): + if len(subcodes) == 0: + self.instr = ins + else: + sc = subcodes[0] + assert(self.start is None or self.start == sc.start) + assert(self.mask is None or self.mask == sc.mask) + self.start = sc.start + self.mask = sc.mask + if sc.value not in self.children: + self.children[sc.value] = OpBucket() + self.children[sc.value].insert(subcodes[1:], ins) + + def get_exact_mask(self, op_name, exact = 0, mask = 0): + if self.instr: + if self.instr.name == op_name: + return (exact, mask) + else: + return () + else: + for op in self.children: + exact_mask = self.children[op].get_exact_mask(op_name, + exact | (op << self.start), + mask | (self.mask << self.start)) + if exact_mask: + return exact_mask + return () + +# Build opcode hierarchy: +OPCODES = OpBucket() for ins in instructions: - opc = ins.opcode.value - OPCODE_BUCKETS[opc] = OPCODE_BUCKETS.get(opc, []) + [ins] - -# Check that each bucket may be disambiguated -for op in OPCODE_BUCKETS: - bucket = OPCODE_BUCKETS[op] - - # Nothing to disambiguate - if len(bucket) < 2: - continue - - SECONDARY = {} - for ins in bucket: - # Number of sources determines opcode2 placement, must be consistent - assert(len(ins.srcs) == len(bucket[0].srcs)) - - # Must not repeat, else we're ambiguous - assert(ins.opcode2.value not in SECONDARY) - SECONDARY[ins.opcode2.value] = ins + OPCODES.insert(ins.opcode, ins) try: - print(Template(template).render(OPCODES = OPCODE_BUCKETS, IMMEDIATES = immediates, ENUMS = enums, typesize = typesize, safe_name = safe_name)) + print(Template(template).render(OPCODES = OPCODES, IMMEDIATES = immediates, ENUMS = enums, typesize = typesize, safe_name = safe_name)) except: print(exceptions.text_error_template().render()) diff --git a/src/panfrost/compiler/bifrost/valhall/valhall.c.py b/src/panfrost/compiler/bifrost/valhall/valhall.c.py index 91d4f7c0c07..3645092b836 100644 --- a/src/panfrost/compiler/bifrost/valhall/valhall.c.py +++ b/src/panfrost/compiler/bifrost/valhall/valhall.c.py @@ -147,9 +147,9 @@ valhall_opcodes[BI_NUM_OPCODES] = { # Exact value to be ORed in to every opcode def exact_op(op): - exact_op = (op.opcode.value << op.opcode.start) - if op.opcode2: - exact_op |= (op.opcode2.value << op.opcode2.start) + exact_op = 0 + for subcode in op.opcode: + exact_op |= (subcode.value << subcode.start) return exact_op try: diff --git a/src/panfrost/compiler/bifrost/valhall/valhall.py b/src/panfrost/compiler/bifrost/valhall/valhall.py index cddc277d2aa..366fc3c240c 100644 --- a/src/panfrost/compiler/bifrost/valhall/valhall.py +++ b/src/panfrost/compiler/bifrost/valhall/valhall.py @@ -164,12 +164,11 @@ class Opcode: self.mask = mask class Instruction: - def __init__(self, name, opcode, opcode2, srcs = [], dests = [], immediates = [], modifiers = [], staging = None, unit = None): + def __init__(self, name, opcode, srcs = [], dests = [], immediates = [], modifiers = [], staging = None, unit = None): self.name = name self.srcs = srcs self.dests = dests self.opcode = opcode - self.opcode2 = opcode2 self.immediates = immediates self.modifiers = modifiers self.staging = staging @@ -180,7 +179,6 @@ class Instruction: self.message = unit not in ["FMA", "CVT", "SFU"] assert(len(dests) == 0 or not staging) - assert(not opcode2 or (opcode2.value & opcode2.mask) == opcode2.value) def __str__(self): return self.name @@ -226,20 +224,25 @@ def build_modifier(el): return Modifier(name, start, size, implied) def build_opcode(el, name): + op_arr = [] opcode = el.find(name) if opcode is None: return None - value = int(opcode.get('val'), base=0) - start = int(opcode.get('start')) - mask = int(opcode.get('mask'), base=0) - return Opcode(value, start, mask) + + for subcode in opcode: + value = int(subcode.get('val'), base=0) + start = int(subcode.get('start')) + mask = int(subcode.get('mask'), base=0) + assert((value & mask) == value) + op_arr.append(Opcode(value, start, mask)) + + return op_arr # Build a single instruction from XML and group based overrides def build_instr(el, overrides = {}): # Get overridables name = overrides.get('name') or el.attrib.get('name') opcode = overrides.get('opcode') or build_opcode(el, 'opcode') - opcode2 = overrides.get('opcode2') or build_opcode(el, 'opcode2') unit = overrides.get('unit') or el.attrib.get('unit') # Get explicit sources/dests @@ -279,7 +282,7 @@ def build_instr(el, overrides = {}): elif mod.tag =='va_mod': modifiers.append(build_modifier(mod)) - instr = Instruction(name, opcode, opcode2, srcs = sources, dests = dests, immediates = imms, modifiers = modifiers, staging = staging, unit = unit) + instr = Instruction(name, opcode, srcs = sources, dests = dests, immediates = imms, modifiers = modifiers, staging = staging, unit = unit) instructions.append(instr) @@ -290,7 +293,6 @@ def build_group(el): build_instr(el, overrides = { 'name': ins.attrib['name'], 'opcode': build_opcode(ins, 'opcode'), - 'opcode2': build_opcode(ins, 'opcode2'), 'unit': ins.attrib.get('unit'), })