pan/bi: Annotate Valhall instructions with units

Based on analyzing the cycle counts reported by the Mali offline compiler.

Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13802>
2026-05-09 04:38:03 +02:00 · 2021-11-15 18:18:23 -05:00 · 2021-11-15 18:18:23 -05:00 · 855ab23d9a
commit 855ab23d9a
parent 04cc1b93b1
1 changed files with 97 additions and 91 deletions
--- a/src/panfrost/bifrost/valhall/ISA.xml
+++ b/src/panfrost/bifrost/valhall/ISA.xml
@ -576,7 +576,7 @@
    <value name="0x7C007C00">v2inf</value>
  </enum>

-  <ins name="NOP" title="No operation" dests="0" opcode="0x00">
+  <ins name="NOP" title="No operation" dests="0" opcode="0x00" unit="CVT">
    <desc>
      Do nothing. Useful at the start of a block for waiting on slots required
      by the first actual instruction of the block, to reconcile dependencies
@ -584,7 +584,7 @@
    </desc>
  </ins>

-  <ins name="BRANCHZ" title="Compare to zero and branch" dests="0" opcode="0x1F">
+  <ins name="BRANCHZ" title="Compare to zero and branch" dests="0" opcode="0x1F" unit="CVT">
    <desc>
      Branches to a specified relative offset if its source is nonzero (default)
      or if its source is zero (if `.eq` is set). The offset is 27-bits and
@ -605,7 +605,7 @@
    <mod name="eq" start="36" size="1"/>
  </ins>

-  <ins name="DISCARD.f32" title="Discard fragment" opcode="0x20">
+  <ins name="DISCARD.f32" title="Discard fragment" opcode="0x20" unit="CVT">
    <desc>
      Evaluates the given condition, and if it passes, discards the current
      fragment and terminates the thread. The destination should be set to R60.
@ -617,7 +617,7 @@
    <src absneg="true" swizzle="true">Right value to compare</src>
  </ins>

-  <ins name="BRANCHZI" title="Compare to zero and branch indirect" opcode="0x2F">
+  <ins name="BRANCHZI" title="Compare to zero and branch indirect" opcode="0x2F" unit="CVT">
    <desc>
      Jump to an indirectly specified address. Used to jump to blend shaders at
      the end of a fragment shader.
@ -627,7 +627,7 @@
    <mod name="eq" start="36" size="1"/>
  </ins>

-  <ins name="BARRIER" title="Execution and memory barrier" opcode="0x45">
+  <ins name="BARRIER" title="Execution and memory barrier" opcode="0x45" unit="NONE">
    <desc>
      General-purpose barrier. Must use slot #7. Must be paired with a
      `.barrier` action on the instruction.
@ -635,7 +635,7 @@
    <slot/>
  </ins>

-  <group name="CSEL" title="Floating-point conditional select" dests="1">
+  <group name="CSEL" title="Floating-point conditional select" dests="1" unit="CVT">
    <ins name="CSEL.f32" opcode="0x154"/>
    <ins name="CSEL.v2f16" opcode="0x155"/>
    <desc>
@ -649,7 +649,7 @@
    <src float="true">Return value if false</src>
  </group>

-  <group name="CSEL" title="Integer conditional select" dests="1">
+  <group name="CSEL" title="Integer conditional select" dests="1" unit="CVT">
    <ins name="CSEL.u32" opcode="0x150"/>
    <ins name="CSEL.v2u16" opcode="0x151"/>
    <ins name="CSEL.i32" opcode="0x158"/>
@ -670,7 +670,7 @@
    <src>Return value if false</src>
  </group>

-  <ins name="LD_VAR_SPECIAL" title="Load special varying" opcode="0x56">
+  <ins name="LD_VAR_SPECIAL" title="Load special varying" opcode="0x56" unit="V">
    <sr write="true"/>
    <sr_count/>
    <vecsize/>
@ -680,7 +680,7 @@
    <imm name="index" start="12" size="4"/> <!-- 0 for pointx, 1 for pointy, 2 for fragw, 3 for fragz -->
  </ins>

-  <group name="LD_VAR_IMM_F32" title="Load immediate varying">
+  <group name="LD_VAR_IMM_F32" title="Load immediate varying" unit="V">
    <desc>Interpolates a given varying</desc>
    <ins name="LD_VAR_IMM_F32" opcode="0x5C"/>
    <ins name="LD_VAR_IMM_F16" opcode="0x5D"/>
@ -694,7 +694,7 @@
    <imm name="index" start="20" size="4"/>
  </group>

-  <ins name="LD_ATTR_IMM" title="Load immediate attribute" opcode="0x66">
+  <ins name="LD_ATTR_IMM" title="Load immediate attribute" opcode="0x66" unit="LS">
    <sr_count/>
    <vecsize/>
    <regfmt/>
@ -705,7 +705,7 @@
    <imm name="index" start="20" size="4"/>
  </ins>

-  <ins name="LD_ATTR" title="Load indirect attribute" opcode="0x67">
+  <ins name="LD_ATTR" title="Load indirect attribute" opcode="0x67" unit="LS">
    <desc>The index must not diverge within a warp.</desc>
    <vecsize/>
    <regfmt/>
@ -717,7 +717,7 @@
    <src>Index</src>
  </ins>

-  <ins name="LEA_ATTR" title="Load effective address" opcode="0x5E">
+  <ins name="LEA_ATTR" title="Load effective address" opcode="0x5E" unit="LS">
    <desc>
      Loads the effective address of the position buffer (in a position shader)
      or the varying buffer (in a varying shader). That is, the base pointer
@ -736,7 +736,7 @@
    <src>Linear ID</src>
  </ins>

-  <ins name="LOAD.i8" title="Global memory load" opcode="0x60" opcode2="0">
+  <ins name="LOAD.i8" title="Global memory load" opcode="0x60" opcode2="0" unit="LS">
    <desc>Loads from main memory</desc>
    <sr write="true"/>
    <sr_count/>
@ -747,7 +747,7 @@
    <imm name="offset" start="8" size="16" signed="true"/>
  </ins>

-  <ins name="LOAD.i16" title="Global memory load" opcode="0x60" opcode2="1">
+  <ins name="LOAD.i16" title="Global memory load" opcode="0x60" opcode2="1" unit="LS">
    <desc>Loads from main memory</desc>
    <sr write="true"/>
    <sr_count/>
@ -758,7 +758,7 @@
    <imm name="offset" start="8" size="16" signed="true"/>
  </ins>

-  <ins name="LOAD.i24" title="Global memory load" opcode="0x60" opcode2="2">
+  <ins name="LOAD.i24" title="Global memory load" opcode="0x60" opcode2="2" unit="LS">
    <desc>Loads from main memory</desc>
    <sr write="true"/>
    <sr_count/>
@ -769,7 +769,7 @@
    <imm name="offset" start="8" size="16" signed="true"/>
  </ins>

-  <ins name="LOAD.i32" title="Global memory load" opcode="0x60" opcode2="3">
+  <ins name="LOAD.i32" title="Global memory load" opcode="0x60" opcode2="3" unit="LS">
    <desc>Loads from main memory</desc>
    <sr write="true"/>
    <sr_count/>
@ -780,7 +780,7 @@
    <imm name="offset" start="8" size="16" signed="true"/>
  </ins>

-  <ins name="LOAD.i48" title="Global memory load" opcode="0x60" opcode2="4">
+  <ins name="LOAD.i48" title="Global memory load" opcode="0x60" opcode2="4" unit="LS">
    <desc>Loads from main memory</desc>
    <sr write="true"/>
    <sr_count/>
@ -791,7 +791,7 @@
    <imm name="offset" start="8" size="16" signed="true"/>
  </ins>

-  <ins name="LOAD.i64" title="Global memory load" opcode="0x60" opcode2="5">
+  <ins name="LOAD.i64" title="Global memory load" opcode="0x60" opcode2="5" unit="LS">
    <desc>Loads from main memory</desc>
    <sr write="true"/>
    <sr_count/>
@ -802,7 +802,7 @@
    <imm name="offset" start="8" size="16" signed="true"/>
  </ins>

-  <ins name="LOAD.i96" title="Global memory load" opcode="0x60" opcode2="6">
+  <ins name="LOAD.i96" title="Global memory load" opcode="0x60" opcode2="6" unit="LS">
    <desc>Loads from main memory</desc>
    <sr write="true"/>
    <sr_count/>
@ -813,7 +813,7 @@
    <imm name="offset" start="8" size="16" signed="true"/>
  </ins>

-  <ins name="LOAD.i128" title="Global memory load" opcode="0x60" opcode2="7">
+  <ins name="LOAD.i128" title="Global memory load" opcode="0x60" opcode2="7" unit="LS">
    <desc>Loads from main memory</desc>
    <sr write="true"/>
    <sr_count/>
@ -824,7 +824,7 @@
    <imm name="offset" start="8" size="16" signed="true"/>
  </ins>

-  <group name="STORE" title="Global memory store" opcode="0x61">
+  <group name="STORE" title="Global memory store" opcode="0x61" unit="LS">
    <desc>Stores to main memory</desc>
    <sr read="true"/>
    <ins name="STORE.i8" opcode2="0x0"/>
@ -842,7 +842,7 @@
    <imm name="offset" start="8" size="16" signed="true"/>
  </group>

-  <ins name="ST_IMAGE" title="Image store" opcode="0x71">
+  <ins name="ST_IMAGE" title="Image store" opcode="0x71" unit="LS">
    <desc>Stores to images</desc>
    <sr read="true"/>
    <sr_count/>
@ -850,7 +850,7 @@
    <src>Address to store to after adding offset</src>
  </ins>

-  <ins name="LD_TILE" title="Load from tilebuffer" opcode="0x78">
+  <ins name="LD_TILE" title="Load from tilebuffer" opcode="0x78" unit="NONE">
    <desc>
      Loads a given render target, specified in the pixel indices descriptor, at
      a given location and sample, and converts to the format specified in the
@ -865,7 +865,7 @@
    <src>Conversion descriptor</src>
  </ins>

-  <ins name="BLEND" title="Blend render target" opcode="0x7F">
+  <ins name="BLEND" title="Blend render target" opcode="0x7F" unit="NONE">
    <desc>
      Blends a given render target. This loads the API-specified blend state for
      the render target from the first source. Blend descriptors are available
@ -901,7 +901,7 @@
    <regfmt/>
  </ins>

-  <ins name="ATEST" title="Alpha test" opcode="0x7D">
+  <ins name="ATEST" title="Alpha test" opcode="0x7D" unit="NONE">
    <desc>
      Does alpha-to-coverage testing, updating the sample coverage mask. ATEST
      does not do an implicit discard. It should be executed before the first
@ -914,7 +914,7 @@
    <sr_count/>
  </ins>

-  <ins name="ZS_EMIT" title="Depth/stencil write" opcode="0x7E">
+  <ins name="ZS_EMIT" title="Depth/stencil write" opcode="0x7E" unit="NONE">
    <desc>
      Programmatically writes out depth, stencil, or both, depending on which
      modifiers are set. Used to implement gl_FragDepth and gl_FragStencil.
@ -927,7 +927,7 @@
    <src>Input coverage mask</src>
  </ins>

-  <group name="CONVERT" title="Data conversions" dests="1" opcode="0x90">
+  <group name="CONVERT" title="Data conversions" dests="1" opcode="0x90" unit="CVT">
    <desc>
      Performs the given data conversion. Note that floating-point rounding is
      handled via the same hardware and therefore shares an encoding. Round mode
@ -950,7 +950,7 @@
    <src widen="true">Value to convert</src>
  </group>

-  <group name="CONVERT" title="Float-to-int data conversions" dests="1" opcode="0x90">
+  <group name="CONVERT" title="Float-to-int data conversions" dests="1" opcode="0x90" unit="CVT">
    <desc>Performs the given data conversion.</desc>
    <ins name="F32_TO_S32" opcode2="0xC"/>
    <ins name="F32_TO_U32" opcode2="0x1C"/>
@ -958,7 +958,7 @@
    <src absneg="true">Value to convert</src>
  </group>

-  <group name="CONVERT" title="Float-to-int data conversions" dests="1" opcode="0x90">
+  <group name="CONVERT" title="Float-to-int data conversions" dests="1" opcode="0x90" unit="CVT">
    <desc>Performs the given data conversion.</desc>
    <ins name="V2F16_TO_V2S16" opcode2="0xE"/>
    <ins name="V2F16_TO_V2U16" opcode2="0x1E"/>
@ -968,13 +968,13 @@
    <src swizzle="true" absneg="true" size="16">Value to convert</src>
  </group>

-  <ins name="F16_TO_F32" title="16-bit float to 32-bit float conversion" dests="1" opcode="0x90" opcode2="0xB">
+  <ins name="F16_TO_F32" title="16-bit float to 32-bit float conversion" dests="1" opcode="0x90" opcode2="0xB" unit="CVT">
    <desc>Converts up with the specified round mode.</desc>
    <roundmode/>
    <src lane="28" size="16" absneg="true">Value to convert</src>
  </ins>

-  <group name="CONVERT" title="8-bit data conversions" dests="1" opcode="0x90">
+  <group name="CONVERT" title="8-bit data conversions" dests="1" opcode="0x90" unit="CVT">
    <desc>
      Performs the given data conversion.
    </desc>
@ -992,7 +992,7 @@
    <src lane="28" size="8">Value to convert</src>
  </group>

-  <group name="FROUND" title="Floating-point rounding" dests="1" opcode="0x90">
+  <group name="FROUND" title="Floating-point rounding" dests="1" opcode="0x90" unit="CVT">
    <desc>
      Performs the given rounding, using the convert unit.
    </desc>
@ -1004,33 +1004,33 @@
    <src swizzle="true" absneg="true">Value to convert</src>
  </group>

-  <ins name="MOV.i32" title="Register move" dests="1" opcode="0x91" opcode2="0x0">
+  <ins name="MOV.i32" title="Register move" dests="1" opcode="0x91" opcode2="0x0" unit="CVT">
    <desc>Canonical register-to-register move.</desc>
    <src/>
  </ins>

-  <ins name="CLZ.u32" title="Count leading zeroes" dests="1" opcode="0x91" opcode2="0x4">
+  <ins name="CLZ.u32" title="Count leading zeroes" dests="1" opcode="0x91" opcode2="0x4" unit="CVT">
    <desc>
      Used as a primitive for various bitwise operations.
    </desc>
    <src/>
  </ins>

-  <ins name="CLZ.v2u16" title="Count leading zeroes" dests="1" opcode="0x91" opcode2="0x5">
+  <ins name="CLZ.v2u16" title="Count leading zeroes" dests="1" opcode="0x91" opcode2="0x5" unit="CVT">
    <desc>
      Used as a primitive for various bitwise operations.
    </desc>
    <src/>
  </ins>

-  <ins name="CLZ.v4u8" title="Count leading zeroes" dests="1" opcode="0x91" opcode2="0x6">
+  <ins name="CLZ.v4u8" title="Count leading zeroes" dests="1" opcode="0x91" opcode2="0x6" unit="CVT">
    <desc>
      Used as a primitive for various bitwise operations.
    </desc>
    <src/>
  </ins>

-  <ins name="IABS.s32" title="Absolute value" dests="1" opcode="0x91" opcode2="0x8">
+  <ins name="IABS.s32" title="Absolute value" dests="1" opcode="0x91" opcode2="0x8" unit="CVT">
    <desc>
      64-bit abs may be constructed in 4 instructions (5 clocks) by checking the
      sign with `ICMP.s32.lt.m1 hi, 0` and negating based on the result with
@ -1039,15 +1039,15 @@
    <src widen="true"/>
  </ins>

-  <ins name="IABS.v2s16" title="Absolute value" dests="1" opcode="0x91" opcode2="0x9">
+  <ins name="IABS.v2s16" title="Absolute value" dests="1" opcode="0x91" opcode2="0x9" unit="CVT">
    <src widen="true"/>
  </ins>

-  <ins name="IABS.v4s8" title="Absolute value" dests="1" opcode="0x91" opcode2="0xa">
+  <ins name="IABS.v4s8" title="Absolute value" dests="1" opcode="0x91" opcode2="0xa" unit="CVT">
    <src/>
  </ins>

-  <ins name="POPCOUNT.i32" title="Population count" dests="1" opcode="0x91" opcode2="0xC">
+  <ins name="POPCOUNT.i32" title="Population count" dests="1" opcode="0x91" opcode2="0xC" unit="SFU">
    <desc>
      Only available as 32-bit. Smaller bitsizes require explicit conversions.
      64-bit popcount may be constructed in 3 clocks by separate 32-bit
@ -1057,28 +1057,29 @@
    <src/>
  </ins>

-  <ins name="BITREV.i32" title="Bitwise reverse" dests="1" opcode="0x91" opcode2="0xD">
+  <ins name="BITREV.i32" title="Bitwise reverse" dests="1" opcode="0x91" opcode2="0xD" unit="SFU">
    <desc>
      Only available as 32-bit. Other bitsizes may be derived with swizzles.
    </desc>
    <src/>
  </ins>

-  <ins name="NOT.i32" title="Bitwise complement" dests="1" opcode="0x91" opcode2="0xE">
+  <ins name="NOT.i32" title="Bitwise complement" dests="1" opcode="0x91" opcode2="0xE" unit="SFU">
    <desc>
      For fully featured bitwise operation, see the shift opcodes.
    </desc>
    <src/>
  </ins>

-  <ins name="NOT.i64" title="Bitwise complement" dests="1" opcode="0x191" opcode2="0xE">
+  <ins name="NOT.i64" title="Bitwise complement" dests="1" opcode="0x191" opcode2="0xE" unit="SFU">
    <desc>
      For fully featured bitwise operation, see the shift opcodes.
    </desc>
    <src/>
  </ins>

-  <ins name="WMASK" title="Warp mask" dests="1" opcode="0x95">
+  <ins name="WMASK" title="Warp mask" dests="1" opcode="0x95" unit="SFU">
+    <!-- TODO: confirm unit -->
    <desc>
      Returns the mask of lanes ever active within the warp (subgroup), such
      that the source is nonzero. The number of work-items in a subgroup is
@ -1094,7 +1095,7 @@
    <subgroup/>
  </ins>

-  <group name="FREXP" title="Fraction/exponent extract" dests="1" opcode="0x99">
+  <group name="FREXP" title="Fraction/exponent extract" dests="1" opcode="0x99" unit="CVT">
    <ins name="FREXPM.f32" opcode2="0"/>
    <ins name="FREXPM.v2f16" opcode2="1"/>
    <ins name="FREXPE.f32" opcode2="2"/>
@ -1109,7 +1110,7 @@
    <src float="true" swizzle="true"/>
  </group>

-  <group name="SFU" title="Special function unit" dests="1" opcode="0x9C">
+  <group name="SFU" title="Special function unit" dests="1" opcode="0x9C" unit="SFU">
    <ins name="FRCP.f32" opcode2="0"/>
    <ins name="FRCP.f16" opcode2="1"/>
    <ins name="FRSQ.f32" opcode2="2"/>
@ -1121,10 +1122,10 @@
      The logarithm instruction (`FLOGD.f32`) requires an argument reduction. See the
      transcendentals section for more information.
    </desc>
-    <src float="true" swizzle="true"/>
+    <src float="true" swizzle="true" absneg="true"/>
  </group>

-  <group name="SFU" title="Special function unit" dests="1" opcode="0x9C">
+  <group name="SFU" title="Special function unit" dests="1" opcode="0x9C" unit="SFU">
    <ins name="FSIN_TABLE.u6" opcode2="4"/>
    <ins name="FCOS_TABLE.u6" opcode2="5"/>
    <desc>
@ -1134,7 +1135,7 @@
    <src/>
  </group>

-  <group name="FADD" title="Floating-point add" dests="1" opcode2="0">
+  <group name="FADD" title="Floating-point add" dests="1" opcode2="0" unit="FMA">
    <ins name="FADD.f32" opcode="0xA4"/>
    <ins name="FADD.v2f16" opcode="0xA5"/>
    <desc>$A + B$</desc>
@ -1143,7 +1144,7 @@
    <src absneg="true" swizzle="true">B</src>
  </group>

-  <group name="FMIN" title="Floating-point minimum" dests="1" opcode2="2">
+  <group name="FMIN" title="Floating-point minimum" dests="1" opcode2="2" unit="CVT">
    <ins name="FMIN.f32" opcode="0xA4"/>
    <ins name="FMIN.v2f16" opcode="0xA5"/>
    <desc>$\min \{ A, B \}$</desc>
@ -1152,7 +1153,7 @@
    <src absneg="true" swizzle="true">B</src>
  </group>

-  <group name="FMAX" title="Floating-point maximum" dests="1" opcode2="3">
+  <group name="FMAX" title="Floating-point maximum" dests="1" opcode2="3" unit="CVT">
    <ins name="FMAX.f32" opcode="0xA4"/>
    <ins name="FMAX.v2f16" opcode="0xA5"/>
    <desc>$\max \{ A, B \}$</desc>
@ -1161,7 +1162,7 @@
    <src absneg="true" swizzle="true">B</src>
  </group>

-  <group name="V2F32_TO_V2F16" title="Vectorized floating-point conversion" dests="1" opcode2="4">
+  <group name="V2F32_TO_V2F16" title="Vectorized floating-point conversion" dests="1" opcode2="4" unit="CVT">
    <ins name="V2F32_TO_V2F16" opcode="0xA5"/>
    <desc>
      Given a pair of 32-bit floats, output a pair of 16-bit floats packed into
@ -1171,7 +1172,7 @@
    <src>B</src>
  </group>

-  <group name="FRSCALE" title="Floating-point rescaling" dests="1" opcode2="6">
+  <group name="FRSCALE" title="Floating-point rescaling" dests="1" opcode2="6" unit="FMA">
    <ins name="FRSCALE.f32" opcode="0xA4"/>
    <ins name="FRSCALE.v2f16" opcode="0xA5"/>
    <desc>
@ -1185,7 +1186,7 @@
    <src absneg="true" swizzle="true">B</src>
  </group>

-  <ins name="FEXP.f32" title="Floating-point exponent" dests="1" opcode="0xA4" opcode2="8">
+  <ins name="FEXP.f32" title="Floating-point exponent" dests="1" opcode="0xA4" opcode2="8" unit="SFU">
    <desc>
      Calculates the base-2 exponent of an argument specified as a 8:24
      fixed-point. The original argument is passed as well for correct handling
@ -1196,7 +1197,7 @@
    <src absneg="true">Input as 32-bit float</src>
  </ins>

-  <ins name="FADD_LSCALE.f32" title="Floating-point add with logarithm scale" dests="1" opcode="0xA4" opcode2="9">
+  <ins name="FADD_LSCALE.f32" title="Floating-point add with logarithm scale" dests="1" opcode="0xA4" opcode2="9" unit="FMA">
    <desc>
      Performs a floating-point addition specialized for logarithm computation.
    </desc>
@ -1205,7 +1206,7 @@
    <src absneg="true">B</src>
  </ins>

-  <group name="IADD" title="Integer addition" dests="1" opcode2="0">
+  <group name="IADD" title="Integer addition" dests="1" opcode2="0" unit="CVT">
    <desc>
      $A + B$ with optional saturation.

@ -1226,13 +1227,13 @@
    <saturate/>
  </group>

-  <ins name="MKVEC.v2i16" title="Make 16-bit vector" dests="1" opcode="0xA1" opcode2="0x5">
+  <ins name="MKVEC.v2i16" title="Make 16-bit vector" dests="1" opcode="0xA1" opcode2="0x5" unit="CVT">
    <desc>Calculates $A | (B \ll 16)$. Used to implement `(ushort2)(A, B)`</desc>
    <src widen="true">A</src>
    <src widen="true">B</src>
  </ins>

-  <group name="ISUB" title="Integer subtract" dests="1" opcode2="1">
+  <group name="ISUB" title="Integer subtract" dests="1" opcode2="1" unit="CVT">
    <ins name="ISUB.u32" opcode="0xA0"/>
    <ins name="ISUB.v2u16" opcode="0xA1"/>
    <ins name="ISUB.v4u8" opcode="0xA2"/>
@ -1247,7 +1248,7 @@
    <saturate/>
  </group>

-  <group name="SHADDX" title="Shift, extend, and 64-bit add" dests="1" opcode2="7">
+  <group name="SHADDX" title="Shift, extend, and 64-bit add" dests="1" opcode2="7" unit="CVT">
    <desc>
      Sign or zero extend B to 64-bits, left-shift by `shift`, and add the
      64-bit value A. These instructions accelerate address arithmetic, but may
@ -1260,7 +1261,7 @@
    <src widen="true">B</src>
  </group>

-  <group name="IMUL" title="Integer multiply" dests="1" opcode2="0x0A">
+  <group name="IMUL" title="Integer multiply" dests="1" opcode2="0x0A" unit="SFU">
    <ins name="IMUL.i32" opcode="0xA0"/>
    <ins name="IMUL.v2i16" opcode="0xA1"/>
    <ins name="IMUL.v4i8" opcode="0xA2"/>
@ -1281,7 +1282,8 @@
    <saturate/>
  </group>

-  <group name="HADD" title="Integer half-add" dests="1" opcode2="0x0B">
+  <group name="HADD" title="Integer half-add" dests="1" opcode2="0x0B" unit="CVT">
+    <!-- TODO: confirm unit -->
    <ins name="HADD.u32" opcode="0xA0"/>
    <ins name="HADD.v2u16" opcode="0xA1"/>
    <ins name="HADD.v4u8" opcode="0xA2"/>
@ -1298,7 +1300,7 @@
    </desc>
  </group>

-  <group name="CLPER" title="Cross-lane permute" dests="1" opcode2="0xF">
+  <group name="CLPER" title="Cross-lane permute" dests="1" opcode2="0xF" unit="SFU">
    <ins name="CLPER.i32" opcode="0xA0"/>
    <ins name="CLPER.v2u16" opcode="0xA1"/>
    <ins name="CLPER.v4u8" opcode="0xA2"/>
@ -1320,7 +1322,7 @@
    <inactive_result/>
  </group>

-  <group name="FMA" title="Fused floating-point multiply add" dests="1">
+  <group name="FMA" title="Fused floating-point multiply add" dests="1" unit="FMA">
    <ins name="FMA.f32" opcode="0xB2"/>
    <ins name="FMA.v2f16" opcode="0xB3"/>
    <desc>$A \cdot B + C$</desc>
@ -1330,7 +1332,7 @@
    <src absneg="true" swizzle="true">C</src>
  </group>

-  <group name="LSHIFT_AND" title="Left shift and bitwise AND" dests="1" opcode2="0x100">
+  <group name="LSHIFT_AND" title="Left shift and bitwise AND" dests="1" opcode2="0x100" unit="SFU">
    <ins name="LSHIFT_AND.i32" opcode="0xB4"/>
    <ins name="LSHIFT_AND.v2i16" opcode="0xB5"/>
    <ins name="LSHIFT_AND.v4i8" opcode="0xB6"/>
@ -1346,7 +1348,7 @@
    <src not="true">B</src>
  </group>

-  <group name="RSHIFT_AND" title="Right shift and bitwise AND" dests="1" opcode2="0x000">
+  <group name="RSHIFT_AND" title="Right shift and bitwise AND" dests="1" opcode2="0x000" unit="SFU">
    <ins name="RSHIFT_AND.i32" opcode="0xB4"/>
    <ins name="RSHIFT_AND.v2i16" opcode="0xB5"/>
    <ins name="RSHIFT_AND.v4i8" opcode="0xB6"/>
@ -1362,7 +1364,7 @@
    <src not="true">B</src>
  </group>

-  <group name="LSHIFT_OR" title="Left shift and bitwise OR" dests="1" opcode2="0x101">
+  <group name="LSHIFT_OR" title="Left shift and bitwise OR" dests="1" opcode2="0x101" unit="SFU">
    <ins name="LSHIFT_OR.i32" opcode="0xB4"/>
    <ins name="LSHIFT_OR.v2i16" opcode="0xB5"/>
    <ins name="LSHIFT_OR.v4i8" opcode="0xB6"/>
@ -1378,7 +1380,7 @@
    <src not="true">B</src>
  </group>

-  <group name="RSHIFT_OR" title="Right shift and bitwise OR" dests="1" opcode2="0x001">
+  <group name="RSHIFT_OR" title="Right shift and bitwise OR" dests="1" opcode2="0x001" unit="SFU">
    <ins name="RSHIFT_OR.i32" opcode="0xB4"/>
    <ins name="RSHIFT_OR.v2i16" opcode="0xB5"/>
    <ins name="RSHIFT_OR.v4i8" opcode="0xB6"/>
@ -1394,7 +1396,7 @@
    <src not="true">B</src>
  </group>

-  <group name="LSHIFT_XOR" title="Left shift and bitwise XOR" dests="1" opcode2="0x102">
+  <group name="LSHIFT_XOR" title="Left shift and bitwise XOR" dests="1" opcode2="0x102" unit="SFU">
    <ins name="LSHIFT_XOR.i32" opcode="0xB4"/>
    <ins name="LSHIFT_XOR.v2i16" opcode="0xB5"/>
    <ins name="LSHIFT_XOR.v4i8" opcode="0xB6"/>
@ -1410,7 +1412,7 @@
    <src not="true">B</src>
  </group>

-  <group name="RSHIFT_XOR" title="Right shift and bitwise XOR" dests="1" opcode2="0x002">
+  <group name="RSHIFT_XOR" title="Right shift and bitwise XOR" dests="1" opcode2="0x002" unit="SFU">
    <ins name="RSHIFT_XOR.i32" opcode="0xB4"/>
    <ins name="RSHIFT_XOR.v2i16" opcode="0xB5"/>
    <ins name="RSHIFT_XOR.v4i8" opcode="0xB6"/>
@ -1426,7 +1428,7 @@
    <src not="true">B</src>
  </group>

-  <ins name="MUX.i32" title="Mux" dests="1" opcode="0xB8">
+  <ins name="MUX.i32" title="Mux" dests="1" opcode="0xB8" unit="SFU">
    <desc>
      Mux between A and B based on the provided mask. Equivalent to
      `bitselect()` in OpenCL. `(A &amp; mask) | (B &amp; ~mask)`
@ -1436,21 +1438,21 @@
    <src>Mask</src>
  </ins>

-  <ins name="CUBE_SSEL" title="Cube S-coordinate select" dests="1" opcode="0xBC" opcode2="0">
+  <ins name="CUBE_SSEL" title="Cube S-coordinate select" dests="1" opcode="0xBC" opcode2="0" unit="SFU">
    <desc>During a cube map transform, select the S coordinate given a selected face.</desc>
    <src absneg="true">Z coordinate as 32-bit floating point</src>
    <src absneg="true">X coordinate as 32-bit floating point</src>
    <src>Cube face index</src>
  </ins>

-  <ins name="CUBE_TSEL" title="Cube T-coordinate select" dests="1" opcode="0xBC" opcode2="1">
+  <ins name="CUBE_TSEL" title="Cube T-coordinate select" dests="1" opcode="0xBC" opcode2="1" unit="SFU">
    <desc>During a cube map transform, select the T coordinate given a selected face.</desc>
    <src absneg="true">Y coordinate as 32-bit floating point</src>
    <src absneg="true">Z coordinate as 32-bit floating point</src>
    <src>Cube face index</src>
  </ins>

-  <ins name="MKVEC.v4i8" title="Make 8-bit vector" dests="1" opcode="0xBD">
+  <ins name="MKVEC.v4i8" title="Make 8-bit vector" dests="1" opcode="0xBD" unit="CVT">
    <desc>
      Calculates $A | (B \ll 8) | (CD \ll 16)$ for 8-bit A and B and 16-bit CD.

@ -1465,21 +1467,22 @@
    <src>CD</src>
  </ins>

-  <ins name="CUBEFACE1" title="Cube map transform step 1" dests="1" opcode="0xC0">
+  <ins name="CUBEFACE1" title="Cube map transform step 1" dests="1" opcode="0xC0" unit="SFU">
    <desc>Select the maximum absolute value of its arguments.</desc>
    <src absneg="true">X coordinate as 32-bit floating point</src>
    <src absneg="true">Y coordinate as 32-bit floating point</src>
    <src absneg="true">Z coordinate as 32-bit floating point</src>
  </ins>

-  <ins name="CUBEFACE2" title="Cube map transform step 2" dests="1" opcode="0xC1">
+  <ins name="CUBEFACE2" title="Cube map transform step 2" dests="1" opcode="0xC1" unit="SFU">
    <desc>Select the cube face index corresponding to the arguments.</desc>
    <src absneg="true">X coordinate as 32-bit floating point</src>
    <src absneg="true">Y coordinate as 32-bit floating point</src>
    <src absneg="true">Z coordinate as 32-bit floating point</src>
  </ins>

-  <group name="IDP" title="8-bit dot product" dests="1" opcode="0xC2">
+  <group name="IDP" title="8-bit dot product" dests="1" opcode="0xC2" unit="SFU">
+    <!-- TODO: confirm unit -->
    <desc>
      8-bit integer dot product between 4 channel vectors, intended for machine
      learning. Available in both unsigned and signed variants, controlling
@ -1500,7 +1503,7 @@
    <saturate/>
  </group>

-  <group name="ICMP" title="Unsigned integer compare" dests="1">
+  <group name="ICMP" title="Unsigned integer compare" dests="1" unit="CVT">
    <desc>
      Evaluates the given condition, does a logical and/or with the condition in
      the result source, and returns in the given result type (integer
@ -1528,7 +1531,7 @@
    <src>C</src>
  </group>

-  <group name="FCMP" title="Floating-point compare" dests="1">
+  <group name="FCMP" title="Floating-point compare" dests="1" unit="CVT">
    <desc>
      Evaluates the given condition, does a logical and/or with the condition in
      the result source, and returns in the given result type (integer
@ -1547,7 +1550,7 @@
    <src>C</src>
  </group>

-  <group name="ICMP" title="Signed integer compare" dests="1">
+  <group name="ICMP" title="Signed integer compare" dests="1" unit="CVT">
    <desc>
      Evaluates the given condition, does a logical and/or with the condition in
      the result source, and returns in the given result type (integer
@ -1575,7 +1578,7 @@
    <src>C</src>
  </group>

-  <ins name="IADD_IMM.i32" title="Integer addition with immediate" dests="1" opcode="0x110">
+  <ins name="IADD_IMM.i32" title="Integer addition with immediate" dests="1" opcode="0x110" unit="CVT">
    <desc>
      Adds an arbitrary 32-bit immediate embedded within the instruction stream.
      If no modifiers are required, this is preferred to `IADD.i32` with a
@ -1588,7 +1591,7 @@
    <imm name="constant" start="8" size="32"/>
  </ins>

-  <ins name="IADD_IMM.v2i16" title="Integer addition with immediate" dests="1" opcode="0x111">
+  <ins name="IADD_IMM.v2i16" title="Integer addition with immediate" dests="1" opcode="0x111" unit="CVT">
    <desc>
      Adds an arbitrary pair of 16-bit immediates embedded within the
      instruction stream. If no modifiers are required, this is preferred to
@ -1600,7 +1603,7 @@
    <imm name="constant" start="8" size="32"/>
  </ins>

-  <ins name="IADD_IMM.v4i8" title="Integer addition with immediate" dests="1" opcode="0x112">
+  <ins name="IADD_IMM.v4i8" title="Integer addition with immediate" dests="1" opcode="0x112" unit="CVT">
    <desc>
      Adds an arbitrary quad of 8-bit immediates embedded within the
      instruction stream. If no modifiers are required, this is preferred to
@ -1612,7 +1615,7 @@
    <imm name="constant" start="8" size="32"/>
  </ins>

-  <ins name="FADD_IMM.f32" title="Floating-point addition with immediate" dests="1" opcode="0x114">
+  <ins name="FADD_IMM.f32" title="Floating-point addition with immediate" dests="1" opcode="0x114" unit="FMA">
    <desc>
      Adds an arbitrary 32-bit immediate embedded within the instruction stream.
      If no modifiers are required, this is preferred to `FADD.f32` with a
@ -1623,7 +1626,7 @@
    <imm name="constant" start="8" size="32"/>
  </ins>

-  <ins name="FADD_IMM.v2f16" title="Floating-point addition with immediate" dests="1" opcode="0x115">
+  <ins name="FADD_IMM.v2f16" title="Floating-point addition with immediate" dests="1" opcode="0x115" unit="FMA">
    <desc>
      Adds an arbitrary pair of 16-bit immediates embedded within the
      instruction stream. If no modifiers are required, this is preferred to
@ -1635,7 +1638,7 @@
    <imm name="constant" start="8" size="32"/>
  </ins>

-  <ins name="TODO.ATOM_C1" title="Atomic operations on memory with 1" opcode="0x69">
+  <ins name="TODO.ATOM_C1" title="Atomic operations on memory with 1" opcode="0x69" unit="LS">
    <!-- TODO -->
    <mod name="i32" start="17" size="1"/>
    <mod name="unk" start="23" size="1"/>
@ -1646,7 +1649,7 @@
    <slot/>
  </ins>

-  <ins name="TODO.ATOM_C" title="Atomic operations on memory" opcode="0x120">
+  <ins name="TODO.ATOM_C" title="Atomic operations on memory" opcode="0x120" unit="LS">
    <!-- TODO -->
    <mod name="i32" start="17" size="1"/>
    <mod name="unk" start="23" size="1"/>
@ -1657,7 +1660,7 @@
    <slot/>
  </ins>

-  <ins name="TEX_FETCH" title="Texel fetch" opcode="0x125">
+  <ins name="TEX_FETCH" title="Texel fetch" opcode="0x125" unit="T">
    <desc>Unfiltered textured instruction.</desc>
    <sr read="true"/>
    <sr write="true" count="4"/>
@ -1669,7 +1672,7 @@
    <src>Image to read from</src>
  </ins>

-  <ins name="TEX" title="Texture load" opcode="0x128">
+  <ins name="TEX" title="Texture load" opcode="0x128" unit="T">
    <desc>Ordinary texturing instruction using a sampler.</desc>
    <sr read="true"/>
    <sr write="true" count="4"/>
@ -1683,8 +1686,11 @@
    <slot/>
  </ins>

-  <ins name="TODO.VAR_TEX" title="Fused varying-texturing" opcode="0x130">
-    <desc>Only works for FP32 varyings.</desc>
+  <ins name="TODO.VAR_TEX" title="Fused varying-texturing" opcode="0x130" unit="VT">
+    <desc>
+      Only works for FP32 varyings. Performance characteristics are similar
+      to LD_VAR_IMM_F32.v2.f32 followed by TEX, using both V and T units.
+    </desc>
    <sr write="true" count="4"/>
    <mod name="dimension" start="28" size="2"/>
    <mod name="skip" start="39" size="1"/>
@ -1692,7 +1698,7 @@
    <src>Image to read from</src>
  </ins>

-  <ins name="FMA_RSCALE.f32" title="Fused floating-point multiply add with exponent bias" dests="1" opcode="0x160">
+  <ins name="FMA_RSCALE.f32" title="Fused floating-point multiply add with exponent bias" dests="1" opcode="0x160" unit="FMA">
    <desc>
      First calculates $A \cdot B + C$ and then biases the exponent by D. Used in
      special transcendental function sequences. It should not be used for