intel/elk: Remove Gfx9+ from passes

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27629>
Caio Oliveira, 2024-02-12 19:10:18 -08:00 (committed by Marge Bot)
parent 2b6b786feb
commit 241a03b8ec
8 changed files with 53 additions and 542 deletions


@@ -591,21 +591,6 @@ namespace {
       return constrained;
    }
 
-   /**
-    * Return whether the hardware will be able to prevent a bank conflict by
-    * optimizing out the read cycle of a source register.  The formula was
-    * found experimentally.
-    */
-   bool
-   is_conflict_optimized_out(const intel_device_info *devinfo,
-                             const elk_fs_inst *inst)
-   {
-      return devinfo->ver >= 9 &&
-             ((is_grf(inst->src[0]) && (reg_of(inst->src[0]) == reg_of(inst->src[1]) ||
-                                        reg_of(inst->src[0]) == reg_of(inst->src[2]))) ||
-              reg_of(inst->src[1]) == reg_of(inst->src[2]));
-   }
-
    /**
    * Return a matrix that allows reasonably efficient computation of the
    * cycle-count cost of bank conflicts incurred throughout the whole program
@@ -662,10 +647,9 @@ namespace {
                              REG_SIZE);
          /* Neglect same-atom conflicts (since they're either trivial or
-          * impossible to avoid without splitting the atom), and conflicts
-          * known to be optimized out by the hardware.
+          * impossible to avoid without splitting the atom).
           */
-         if (r != s && !is_conflict_optimized_out(v->devinfo, inst)) {
+         if (r != s) {
            /* Calculate the parity of the sources relative to the start of
             * their respective atoms.  If their parity is the same (and
             * none of the atoms straddle the 2KB mark), the instruction
@@ -911,10 +895,6 @@ elk_fs_visitor::opt_bank_conflicts()
 {
    assert(grf_used || !"Must be called after register allocation");
 
-   /* TODO: Re-work this pass for Gfx20+. */
-   if (devinfo->ver >= 20)
-      return false;
-
    /* No ternary instructions -- No bank conflicts. */
    if (devinfo->ver < 6)
       return false;
@@ -950,6 +930,5 @@ elk_has_bank_conflict(const struct elk_isa_info *isa, const elk_fs_inst *inst)
 {
    return elk_is_3src(isa, inst->opcode) &&
           is_grf(inst->src[1]) && is_grf(inst->src[2]) &&
-          bank_of(reg_of(inst->src[1])) == bank_of(reg_of(inst->src[2])) &&
-          !is_conflict_optimized_out(isa->devinfo, inst);
+          bank_of(reg_of(inst->src[1])) == bank_of(reg_of(inst->src[2]));
 }
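With the removed hardware workaround gone, the conflict test reduces to comparing the bank parity of the two later sources. A standalone sketch of that parity check follows; it is illustrative only, the real reg_of()/bank_of() helpers live in this pass, and the paired interleaving of GRF banks is an assumption based on the model the pass describes.

/* Illustrative sketch, not the pass itself: the GRF is assumed to be
 * interleaved between two banks in pairs of registers, so a register's
 * bank is bit 1 of its number.  A 3-source instruction conflicts when
 * its second and third sources hit the same bank in the same cycle.
 */
#include <stdio.h>

static unsigned
bank_of(unsigned reg)
{
   return (reg & 2) >> 1;   /* banks assumed to alternate every two registers */
}

int
main(void)
{
   /* r16 and r18 land in different banks; r16 and r20 share a bank. */
   printf("r16 vs r18: %s\n", bank_of(16) == bank_of(18) ? "conflict" : "ok");
   printf("r16 vs r20: %s\n", bank_of(16) == bank_of(20) ? "conflict" : "ok");
   return 0;
}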


@@ -993,120 +993,6 @@ get_alignment_for_imm(const struct imm *imm)
       return imm->size;
 }
 
-static bool
-representable_as_hf(float f, uint16_t *hf)
-{
-   union fi u;
-   uint16_t h = _mesa_float_to_half(f);
-   u.f = _mesa_half_to_float(h);
-
-   if (u.f == f) {
-      *hf = h;
-      return true;
-   }
-
-   return false;
-}
-
-static bool
-representable_as_w(int d, int16_t *w)
-{
-   int res = ((d & 0xffff8000) + 0x8000) & 0xffff7fff;
-   if (!res) {
-      *w = d;
-      return true;
-   }
-
-   return false;
-}
-
-static bool
-representable_as_uw(unsigned ud, uint16_t *uw)
-{
-   if (!(ud & 0xffff0000)) {
-      *uw = ud;
-      return true;
-   }
-
-   return false;
-}
-
-static bool
-supports_src_as_imm(const struct intel_device_info *devinfo, const elk_fs_inst *inst)
-{
-   if (devinfo->ver < 12)
-      return false;
-
-   switch (inst->opcode) {
-   case ELK_OPCODE_MAD:
-      /* Integer types can always mix sizes.  Floating point types can mix
-       * sizes on Gfx12.  On Gfx12.5, floating point sources must all be HF or
-       * all be F.
-       */
-      return devinfo->verx10 < 125 || inst->src[0].type != ELK_REGISTER_TYPE_F;
-   default:
-      return false;
-   }
-}
-
-static bool
-can_promote_src_as_imm(const struct intel_device_info *devinfo, elk_fs_inst *inst,
-                       unsigned src_idx)
-{
-   bool can_promote = false;
-
-   /* Experiment shows that we can only support src0 as immediate for MAD on
-    * Gfx12.  ADD3 can use src0 or src2 in Gfx12.5, but constant propagation
-    * only propagates into src0.  It's possible that src2 works for W or UW MAD
-    * on Gfx12.5.
-    */
-   if (src_idx != 0)
-      return false;
-
-   if (!supports_src_as_imm(devinfo, inst))
-      return false;
-
-   /* TODO - Fix the codepath below to use a bfloat16 immediate on XeHP,
-    * since HF/F mixed mode has been removed from the hardware.
-    */
-   switch (inst->src[src_idx].type) {
-   case ELK_REGISTER_TYPE_F: {
-      uint16_t hf;
-      if (representable_as_hf(inst->src[src_idx].f, &hf)) {
-         inst->src[src_idx] = retype(elk_imm_uw(hf), ELK_REGISTER_TYPE_HF);
-         can_promote = true;
-      }
-      break;
-   }
-   case ELK_REGISTER_TYPE_D: {
-      int16_t w;
-      if (representable_as_w(inst->src[src_idx].d, &w)) {
-         inst->src[src_idx] = elk_imm_w(w);
-         can_promote = true;
-      }
-      break;
-   }
-   case ELK_REGISTER_TYPE_UD: {
-      uint16_t uw;
-      if (representable_as_uw(inst->src[src_idx].ud, &uw)) {
-         inst->src[src_idx] = elk_imm_uw(uw);
-         can_promote = true;
-      }
-      break;
-   }
-   case ELK_REGISTER_TYPE_W:
-   case ELK_REGISTER_TYPE_UW:
-   case ELK_REGISTER_TYPE_HF:
-      can_promote = true;
-      break;
-   default:
-      break;
-   }
-
-   return can_promote;
-}
-
 static void
 add_candidate_immediate(struct table *table, elk_fs_inst *inst, unsigned ip,
                         unsigned i,
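The removed representable_as_w() packs its range check into one expression: a 32-bit value fits a signed word iff bits 31..15 are all copies of the sign bit, and adding 0x8000 to those masked high bits collapses both the all-zeros and the all-ones patterns to zero (modulo 2^32) once bit 15 is masked back out. A self-contained sketch of the same trick, with the mesa types trimmed and hypothetical test values:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Same bit trick as the removed helper: non-zero result means some bit
 * in 31..15 differs from the sign bit, i.e. the value needs > 16 bits. */
static int
representable_as_w(int32_t d, int16_t *w)
{
   uint32_t res = (((uint32_t)d & 0xffff8000u) + 0x8000u) & 0xffff7fffu;
   if (!res) {
      *w = (int16_t)d;
      return 1;
   }
   return 0;
}

int
main(void)
{
   int16_t w;
   assert(representable_as_w(32767, &w) && w == 32767);
   assert(representable_as_w(-32768, &w) && w == -32768);
   assert(!representable_as_w(40000, &w));    /* needs 17 bits */
   assert(!representable_as_w(-32769, &w));   /* just below INT16_MIN */
   puts("ok");
   return 0;
}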
@@ -1358,9 +1244,6 @@ elk_fs_visitor::opt_combine_constants()
          if (inst->src[i].file != IMM)
             continue;
 
-         if (can_promote_src_as_imm(devinfo, inst, i))
-            continue;
-
          add_candidate_immediate(&table, inst, ip, i, true, false, block,
                                  devinfo, const_ctx);
       }


@@ -64,7 +64,7 @@ elk_fs_visitor::lower_pack()
             const uint32_t half = _mesa_float_to_half(inst->src[i].f);
             ibld.MOV(subscript(dst, ELK_REGISTER_TYPE_UW, i),
                      elk_imm_uw(half));
-         } else if (i == 1 && devinfo->ver < 9) {
+         } else if (i == 1) {
            /* Pre-Skylake requires DWord aligned destinations */
            elk_fs_reg tmp = ibld.vgrf(ELK_REGISTER_TYPE_UD);
            ibld.F32TO16(subscript(tmp, ELK_REGISTER_TYPE_HF, 0),


@@ -145,9 +145,8 @@ namespace {
        * Work around both of the above and handle platforms that
        * don't support 64-bit types at all.
        */
-      if ((!devinfo->has_64bit_int ||
-           devinfo->platform == INTEL_PLATFORM_CHV ||
-           intel_device_info_is_9lp(devinfo)) && type_sz(t) > 4)
+      if ((!devinfo->has_64bit_int || devinfo->platform == INTEL_PLATFORM_CHV) &&
+          type_sz(t) > 4)
          return ELK_REGISTER_TYPE_UD;
       else if (has_dst_aligned_region_restriction(devinfo, inst))
          return elk_int_type(type_sz(t), false);
@@ -174,30 +173,19 @@ namespace {
           * integer DWord multiply, indirect addressing must not be
           * used."
           *
-          * For MTL (verx10 == 125), float64 is supported, but int64 is not.
-          * Therefore we need to lower cluster broadcast using 32-bit int ops.
-          *
-          * For gfx12.5+ platforms that support int64, the register regions
-          * used by cluster broadcast aren't supported by the 64-bit pipeline.
-          *
           * Work around the above and handle platforms that don't
           * support 64-bit types at all.
           */
-         if ((!has_64bit || devinfo->verx10 >= 125 ||
-              devinfo->platform == INTEL_PLATFORM_CHV ||
-              intel_device_info_is_9lp(devinfo)) && type_sz(t) > 4)
+         if ((!has_64bit || devinfo->platform == INTEL_PLATFORM_CHV) &&
+             type_sz(t) > 4)
            return ELK_REGISTER_TYPE_UD;
         else
            return elk_int_type(type_sz(t), false);
 
      case ELK_SHADER_OPCODE_BROADCAST:
      case ELK_SHADER_OPCODE_MOV_INDIRECT:
-         if (((devinfo->verx10 == 70 ||
-               devinfo->platform == INTEL_PLATFORM_CHV ||
-               intel_device_info_is_9lp(devinfo) ||
-               devinfo->verx10 >= 125) && type_sz(inst->src[0].type) > 4) ||
-             (devinfo->verx10 >= 125 &&
-              elk_reg_type_is_floating_point(inst->src[0].type)))
+         if ((devinfo->verx10 == 70 || devinfo->platform == INTEL_PLATFORM_CHV) &&
+             type_sz(inst->src[0].type) > 4)
            return elk_int_type(type_sz(t), false);
         else
            return t;


@@ -334,7 +334,6 @@ public:
       node_count = 0;
       first_payload_node = 0;
       first_mrf_hack_node = 0;
-      scratch_header_node = 0;
       grf127_send_hack_node = 0;
       first_vgrf_node = 0;
       last_vgrf_node = 0;
@@ -372,7 +371,6 @@ private:
    void set_spill_costs();
    int choose_spill_reg();
-   elk_fs_reg alloc_scratch_header();
    elk_fs_reg alloc_spill_reg(unsigned size, int ip);
    void spill_reg(unsigned spill_reg);
@@ -397,7 +395,6 @@ private:
    int node_count;
    int first_payload_node;
    int first_mrf_hack_node;
-   int scratch_header_node;
    int grf127_send_hack_node;
    int first_vgrf_node;
    int last_vgrf_node;
@@ -466,8 +463,7 @@ namespace {
    spill_max_size(const elk_backend_shader *s)
    {
       /* LSC is limited to SIMD16 sends */
-      if (s->devinfo->has_lsc)
-         return 2;
+      assert(!s->devinfo->has_lsc);
 
       /* FINISHME - On Gfx7+ it should be possible to avoid this limit
        *            altogether by spilling directly from the temporary GRF
@@ -487,8 +483,6 @@ namespace {
    unsigned
    spill_base_mrf(const elk_backend_shader *s)
    {
-      /* We don't use the MRF hack on Gfx9+ */
-      assert(s->devinfo->ver < 9);
-
       return ELK_MAX_MRF(s->devinfo->ver) - spill_max_size(s) - 1;
    }
 }
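For a concrete feel of spill_base_mrf(), a worked example. The MRF count is an assumption here: on the i965-era hardware this pass still targets, ELK_MAX_MRF() is taken to be 16 for Gfx7, so with a two-register SIMD16 spill payload the base lands at m13 and the spill_reg() loop below checks that m13..m15 are free.

/* Sketch only: assumes ELK_MAX_MRF(7) == 16 and spill_max_size() == 2. */
static unsigned
spill_base_mrf_example(void)
{
   const unsigned max_mrf = 16;       /* assumed ELK_MAX_MRF(7) */
   const unsigned spill_size = 2;     /* SIMD16 scratch payload, in MRFs */
   return max_mrf - spill_size - 1;   /* == 13: spills build in m13..m15 */
}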
@@ -520,10 +514,6 @@ elk_fs_reg_alloc::setup_live_interference(unsigned node,
          ra_add_node_interference(g, node, first_mrf_hack_node + i);
    }
 
-   /* Everything interferes with the scratch header */
-   if (scratch_header_node >= 0)
-      ra_add_node_interference(g, node, scratch_header_node);
-
    /* Add interference with every vgrf whose live range intersects this
    * node's.  We only need to look at nodes below this one as the reflexivity
    * of interference will take care of the rest.
@@ -643,7 +633,7 @@ elk_fs_reg_alloc::build_interference_graph(bool allow_spilling)
    node_count = 0;
    first_payload_node = node_count;
    node_count += payload_node_count;
-   if (devinfo->ver >= 7 && devinfo->ver < 9 && allow_spilling) {
+   if (devinfo->ver >= 7 && allow_spilling) {
       first_mrf_hack_node = node_count;
       node_count += ELK_MAX_GRF - GFX7_MRF_HACK_START;
    } else {
@@ -658,11 +648,6 @@ elk_fs_reg_alloc::build_interference_graph(bool allow_spilling)
    first_vgrf_node = node_count;
    node_count += fs->alloc.count;
    last_vgrf_node = node_count - 1;
-   if ((devinfo->ver >= 9 && devinfo->verx10 < 125) && allow_spilling) {
-      scratch_header_node = node_count++;
-   } else {
-      scratch_header_node = -1;
-   }
    first_spill_node = node_count;
 
    fs->calculate_payload_ranges(payload_node_count,
@@ -802,30 +787,7 @@ elk_fs_reg_alloc::emit_unspill(const fs_builder &bld,
    ++stats->fill_count;
 
    elk_fs_inst *unspill_inst;
-   if (devinfo->ver >= 9) {
-      elk_fs_reg header = this->scratch_header;
-      fs_builder ubld = bld.exec_all().group(1, 0);
-      assert(spill_offset % 16 == 0);
-      unspill_inst = ubld.MOV(component(header, 2),
-                              elk_imm_ud(spill_offset / 16));
-      _mesa_set_add(spill_insts, unspill_inst);
-
-      const unsigned bti = GFX8_BTI_STATELESS_NON_COHERENT;
-      elk_fs_reg srcs[] = { elk_imm_ud(0), header };
-      unspill_inst = bld.emit(ELK_SHADER_OPCODE_SEND, dst,
-                              srcs, ARRAY_SIZE(srcs));
-      unspill_inst->mlen = 1;
-      unspill_inst->header_size = 1;
-      unspill_inst->size_written = reg_size * REG_SIZE;
-      unspill_inst->send_has_side_effects = false;
-      unspill_inst->send_is_volatile = true;
-      unspill_inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
-      unspill_inst->desc =
-         elk_dp_desc(devinfo, bti,
-                     ELK_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
-                     ELK_DATAPORT_OWORD_BLOCK_DWORDS(reg_size * 8));
-   } else if (devinfo->ver >= 7 && spill_offset < (1 << 12) * REG_SIZE) {
+   if (devinfo->ver >= 7 && spill_offset < (1 << 12) * REG_SIZE) {
       /* The Gfx7 descriptor-based offset is 12 bits of HWORD units.
        * Because the Gfx7-style scratch block read is hardwired to BTI 255,
       * on Gfx9+ it would cause the DC to do an IA-coherent read, what
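The deleted Gfx9+ path above encodes its scratch access in two unit systems: the message header takes the offset in owords (16 bytes), while the oword-block-read descriptor takes a size in dwords, 8 per 32-byte GRF. A small sketch of that arithmetic, as an illustration rather than mesa code:

#include <assert.h>

#define REG_SIZE 32   /* bytes per GRF */

/* What went into header[2] and the descriptor size field above. */
static void
oword_block_read_params(unsigned spill_offset, unsigned reg_size,
                        unsigned *oword_offset, unsigned *dwords)
{
   assert(spill_offset % 16 == 0);      /* header wants oword alignment */
   *oword_offset = spill_offset / 16;   /* offset in 16-byte owords */
   *dwords = reg_size * REG_SIZE / 4;   /* i.e. reg_size * 8 dwords */
}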
@@ -974,19 +936,6 @@ elk_fs_reg_alloc::choose_spill_reg()
    return node - first_vgrf_node;
 }
 
-elk_fs_reg
-elk_fs_reg_alloc::alloc_scratch_header()
-{
-   int vgrf = fs->alloc.allocate(1);
-   assert(first_vgrf_node + vgrf == scratch_header_node);
-   ra_set_node_class(g, scratch_header_node,
-                     compiler->fs_reg_sets[rsi].classes[0]);
-   setup_live_interference(scratch_header_node, 0, INT_MAX);
-
-   return elk_fs_reg(VGRF, vgrf, ELK_REGISTER_TYPE_UD);
-}
-
 elk_fs_reg
 elk_fs_reg_alloc::alloc_spill_reg(unsigned size, int ip)
 {
@@ -1035,25 +984,13 @@ elk_fs_reg_alloc::spill_reg(unsigned spill_reg)
     * SIMD16 mode, because we'd stomp the FB writes.
     */
    if (!fs->spilled_any_registers) {
-      if (devinfo->verx10 >= 125) {
-         /* We will allocate a register on the fly */
-      } else if (devinfo->ver >= 9) {
-         this->scratch_header = alloc_scratch_header();
-         fs_builder ubld = fs_builder(fs, 8).exec_all().at(
-            fs->cfg->first_block(), fs->cfg->first_block()->start());
-         elk_fs_inst *inst = ubld.emit(ELK_SHADER_OPCODE_SCRATCH_HEADER,
-                                       this->scratch_header);
-         _mesa_set_add(spill_insts, inst);
-      } else {
-         bool mrf_used[ELK_MAX_MRF(devinfo->ver)];
-         get_used_mrfs(fs, mrf_used);
+      bool mrf_used[ELK_MAX_MRF(devinfo->ver)];
+      get_used_mrfs(fs, mrf_used);
 
-         for (int i = spill_base_mrf(fs); i < ELK_MAX_MRF(devinfo->ver); i++) {
-            if (mrf_used[i]) {
-               fs->fail("Register spilling not supported with m%d used", i);
-               return;
-            }
+      for (int i = spill_base_mrf(fs); i < ELK_MAX_MRF(devinfo->ver); i++) {
+         if (mrf_used[i]) {
+            fs->fail("Register spilling not supported with m%d used", i);
+            return;
          }
       }


@@ -349,10 +349,7 @@ namespace {
    case ELK_TCS_OPCODE_GET_PRIMITIVE_ID:
    case ELK_TES_OPCODE_GET_PRIMITIVE_ID:
    case ELK_SHADER_OPCODE_READ_SR_REG:
-      if (devinfo->ver >= 11) {
-         return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
-                               0, 10, 6 /* XXX */, 14, 0, 0);
-      } else if (devinfo->ver >= 8) {
+      if (devinfo->ver >= 8) {
          if (type_sz(info.tx) > 4)
            return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
                                  0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
@@ -373,10 +370,7 @@ namespace {
    case ELK_OPCODE_MUL:
    case ELK_SHADER_OPCODE_MOV_RELOC_IMM:
    case ELK_VEC4_OPCODE_MOV_FOR_SCRATCH:
-      if (devinfo->ver >= 11) {
-         return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
-                               0, 10, 6, 14, 0, 0);
-      } else if (devinfo->ver >= 8) {
+      if (devinfo->ver >= 8) {
         if (type_sz(info.tx) > 4)
            return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
                                  0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
@@ -407,10 +401,7 @@ namespace {
    case ELK_OPCODE_BFE:
    case ELK_OPCODE_BFI2:
    case ELK_OPCODE_CSEL:
-      if (devinfo->ver >= 11)
-         return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
-                               0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
-      else if (devinfo->ver >= 8)
+      if (devinfo->ver >= 8)
         return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
                               0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
      else if (devinfo->verx10 >= 75)
@@ -423,10 +414,7 @@ namespace {
       abort();
 
    case ELK_OPCODE_MAD:
-      if (devinfo->ver >= 11) {
-         return calculate_desc(info, EU_UNIT_FPU, 0, 2, 1, 0, 2,
-                               0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
-      } else if (devinfo->ver >= 8) {
+      if (devinfo->ver >= 8) {
         if (type_sz(info.tx) > 4)
            return calculate_desc(info, EU_UNIT_FPU, 0, 4, 1, 0, 4,
                                  0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
@@ -457,10 +445,7 @@ namespace {
      }
 
   case ELK_OPCODE_F32TO16:
-      if (devinfo->ver >= 11)
-         return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
-                               0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
-      else if (devinfo->ver >= 8)
+      if (devinfo->ver >= 8)
         return calculate_desc(info, EU_UNIT_FPU, 0, 4, 0, 0, 4,
                               0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
      else if (devinfo->verx10 >= 75)
@@ -619,11 +604,7 @@ namespace {
      abort();
 
   case ELK_FS_OPCODE_PACK_HALF_2x16_SPLIT:
-      if (devinfo->ver >= 11)
-         return calculate_desc(info, EU_UNIT_FPU, 20, 6, 0, 0, 6,
-                               0, 10 /* XXX */, 6 /* XXX */,
-                               14 /* XXX */, 0, 0);
-      else if (devinfo->ver >= 8)
+      if (devinfo->ver >= 8)
         return calculate_desc(info, EU_UNIT_FPU, 16, 6, 0, 0, 6,
                               0, 8 /* XXX */, 4 /* XXX */,
                               12 /* XXX */, 0, 0);
@@ -639,11 +620,7 @@ namespace {
      abort();
 
   case ELK_SHADER_OPCODE_MOV_INDIRECT:
-      if (devinfo->ver >= 11)
-         return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
-                               0, 10 /* XXX */, 6 /* XXX */,
-                               14 /* XXX */, 0, 0);
-      else if (devinfo->ver >= 8)
+      if (devinfo->ver >= 8)
         return calculate_desc(info, EU_UNIT_FPU, 34, 0, 0, 34, 0,
                               0, 8 /* XXX */, 4 /* XXX */,
                               12 /* XXX */, 0, 0);
@@ -657,10 +634,7 @@ namespace {
                              18 /* XXX */, 0, 0);
 
   case ELK_SHADER_OPCODE_BROADCAST:
-      if (devinfo->ver >= 11)
-         return calculate_desc(info, EU_UNIT_FPU, 20 /* XXX */, 0, 0, 4, 0,
-                               0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
-      else if (devinfo->ver >= 8)
+      if (devinfo->ver >= 8)
         return calculate_desc(info, EU_UNIT_FPU, 18, 0, 0, 4, 0,
                               0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
      else if (devinfo->verx10 >= 75)
@@ -674,10 +648,7 @@ namespace {
   case ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL:
   case ELK_SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL:
-      if (devinfo->ver >= 11)
-         return calculate_desc(info, EU_UNIT_FPU, 2, 0, 0, 2, 0,
-                               0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
-      else if (devinfo->ver >= 8)
+      if (devinfo->ver >= 8)
         return calculate_desc(info, EU_UNIT_FPU, 2, 0, 0, 2, 0,
                               0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
      else if (devinfo->verx10 >= 75)
@@ -691,11 +662,7 @@ namespace {
   case ELK_SHADER_OPCODE_RND_MODE:
   case ELK_SHADER_OPCODE_FLOAT_CONTROL_MODE:
-      if (devinfo->ver >= 11)
-         return calculate_desc(info, EU_UNIT_FPU, 24 /* XXX */, 0, 0,
-                               4 /* XXX */, 0,
-                               0, 0, 0, 0, 0, 0);
-      else if (devinfo->ver >= 8)
+      if (devinfo->ver >= 8)
         return calculate_desc(info, EU_UNIT_FPU, 20 /* XXX */, 0, 0,
                               4 /* XXX */, 0,
                               0, 0, 0, 0, 0, 0);
@@ -711,12 +678,7 @@ namespace {
      abort();
 
   case ELK_SHADER_OPCODE_SHUFFLE:
-      if (devinfo->ver >= 11)
-         return calculate_desc(info, EU_UNIT_FPU, 44 /* XXX */, 0, 0,
-                               44 /* XXX */, 0,
-                               0, 10 /* XXX */, 6 /* XXX */,
-                               14 /* XXX */, 0, 0);
-      else if (devinfo->ver >= 8)
+      if (devinfo->ver >= 8)
         return calculate_desc(info, EU_UNIT_FPU, 42 /* XXX */, 0, 0,
                               42 /* XXX */, 0,
                               0, 8 /* XXX */, 4 /* XXX */,
@@ -735,12 +697,7 @@ namespace {
      abort();
 
   case ELK_SHADER_OPCODE_SEL_EXEC:
-      if (devinfo->ver >= 11)
-         return calculate_desc(info, EU_UNIT_FPU, 10 /* XXX */, 4 /* XXX */, 0,
-                               0, 4 /* XXX */,
-                               0, 10 /* XXX */, 6 /* XXX */,
-                               14 /* XXX */, 0, 0);
-      else if (devinfo->ver >= 8)
+      if (devinfo->ver >= 8)
         return calculate_desc(info, EU_UNIT_FPU, 8 /* XXX */, 4 /* XXX */, 0,
                               0, 4 /* XXX */,
                               0, 8 /* XXX */, 4 /* XXX */,
@@ -757,12 +714,7 @@ namespace {
                              18 /* XXX */, 0, 0);
 
   case ELK_SHADER_OPCODE_QUAD_SWIZZLE:
-      if (devinfo->ver >= 11)
-         return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
-                               0, 8 /* XXX */,
-                               0, 10 /* XXX */, 6 /* XXX */,
-                               14 /* XXX */, 0, 0);
-      else if (devinfo->ver >= 8)
+      if (devinfo->ver >= 8)
         return calculate_desc(info, EU_UNIT_FPU, 0 /* XXX */, 8 /* XXX */, 0,
                               0, 8 /* XXX */,
                               0, 8 /* XXX */, 4 /* XXX */,
@@ -779,10 +731,7 @@ namespace {
                              18 /* XXX */, 0, 0);
 
   case ELK_FS_OPCODE_DDY_FINE:
-      if (devinfo->ver >= 11)
-         return calculate_desc(info, EU_UNIT_FPU, 0, 14, 0, 0, 4,
-                               0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
-      else if (devinfo->ver >= 8)
+      if (devinfo->ver >= 8)
         return calculate_desc(info, EU_UNIT_FPU, 0, 2, 0, 0, 2,
                               0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
      else if (devinfo->verx10 >= 75)
@@ -793,11 +742,7 @@ namespace {
                              0, 14, 10 /* XXX */, 20 /* XXX */, 0, 0);
 
   case ELK_FS_OPCODE_LOAD_LIVE_CHANNELS:
-      if (devinfo->ver >= 11)
-         return calculate_desc(info, EU_UNIT_FPU, 2 /* XXX */, 0, 0,
-                               2 /* XXX */, 0,
-                               0, 0, 0, 10 /* XXX */, 0, 0);
-      else if (devinfo->ver >= 8)
+      if (devinfo->ver >= 8)
         return calculate_desc(info, EU_UNIT_FPU, 0, 2 /* XXX */, 0,
                               0, 2 /* XXX */,
                               0, 0, 0, 8 /* XXX */, 0, 0);
@@ -1493,16 +1438,8 @@ namespace {
       *
       * In the meantime use values that roughly match the control flow
       * weights used elsewhere in the compiler back-end.
-       *
-       * Note that we provide slightly more pessimistic weights on
-       * Gfx12+ for SIMD32, since the effective warp size on that
-       * platform is 2x the SIMD width due to EU fusion, which increases
-       * the likelihood of divergent control flow in comparison to
-       * previous generations, giving narrower SIMD modes a performance
-       * advantage in several test-cases with non-uniform discard jumps.
       */
-      const float discard_weight = (dispatch_width > 16 || s->devinfo->ver < 12 ?
-                                    1.0 : 0.5);
+      const float discard_weight = 1.0;
      const float loop_weight = 10;
      unsigned halt_count = 0;
      unsigned elapsed = 0;
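As a rough illustration of how such weights can enter a cycle estimate (a sketch of the idea only, not this function's exact bookkeeping): cycles in a loop body are scaled up by loop_weight per nesting level, and with discard_weight back at 1.0 the cost of instructions that a discard jump may skip is counted in full.

/* Sketch: weight a block's cycle cost by control-flow context.  The real
 * pass tracks this while walking the CFG; the names here are hypothetical.
 */
static float
weighted_cycles(float cycles, unsigned loop_depth, int after_discard_jump,
                float loop_weight, float discard_weight)
{
   float w = after_discard_jump ? discard_weight : 1.0f;
   for (unsigned i = 0; i < loop_depth; i++)
      w *= loop_weight;   /* assume each loop body runs ~loop_weight times */
   return cycles * w;
}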


@@ -192,12 +192,9 @@ lower_fb_write_logical_send(const fs_builder &bld, elk_fs_inst *inst,
      assert(length == 0);
      length = 2;
-   } else if ((devinfo->verx10 <= 70 &&
-               prog_data->uses_kill) ||
-              (devinfo->ver < 11 &&
-               (color1.file != BAD_FILE || key->nr_color_regions > 1))) {
-      assert(devinfo->ver < 20);
-
+   } else if ((devinfo->verx10 <= 70 && prog_data->uses_kill) ||
+              color1.file != BAD_FILE ||
+              key->nr_color_regions > 1) {
      /* From the Sandy Bridge PRM, volume 4, page 198:
       *
       *     "Dispatched Pixel Enables. One bit per pixel indicating
@@ -220,9 +217,6 @@ lower_fb_write_logical_send(const fs_builder &bld, elk_fs_inst *inst,
         retype(elk_vec8_grf(2, 0), ELK_REGISTER_TYPE_UD),
      };
      ubld.LOAD_PAYLOAD(header, header_sources, 2, 0);
-
-      /* Gfx12 will require additional fix-ups if we ever hit this path. */
-      assert(devinfo->ver < 12);
   }
 
   uint32_t g00_bits = 0;
@@ -723,7 +717,6 @@ lower_sampler_logical_send_gfx7(const fs_builder &bld, elk_fs_inst *inst, elk_op
                                unsigned grad_components,
                                bool residency)
 {
-   const elk_compiler *compiler = bld.shader->compiler;
   const intel_device_info *devinfo = bld.shader->devinfo;
   const enum elk_reg_type payload_type =
      elk_reg_type_from_bit_size(payload_type_bit_size, ELK_REGISTER_TYPE_F);
@@ -802,26 +795,11 @@
       * address space but means we can do something more efficient in the
       * shader.
       */
-      if (compiler->use_bindless_sampler_offset) {
-         assert(devinfo->ver >= 11);
-         ubld1.OR(component(header, 3), sampler_handle, elk_imm_ud(1));
-      } else {
-         ubld1.MOV(component(header, 3), sampler_handle);
-      }
+      ubld1.MOV(component(header, 3), sampler_handle);
   } else if (is_high_sampler(devinfo, sampler)) {
      elk_fs_reg sampler_state_ptr =
         retype(elk_vec1_grf(0, 3), ELK_REGISTER_TYPE_UD);
-
-      /* Gfx11+ sampler message headers include bits in 4:0 which conflict
-       * with the ones included in g0.3 bits 4:0.  Mask them out.
-       */
-      if (devinfo->ver >= 11) {
-         sampler_state_ptr = ubld1.vgrf(ELK_REGISTER_TYPE_UD);
-         ubld1.AND(sampler_state_ptr,
-                   retype(elk_vec1_grf(0, 3), ELK_REGISTER_TYPE_UD),
-                   elk_imm_ud(INTEL_MASK(31, 5)));
-      }
 
      if (sampler.file == ELK_IMMEDIATE_VALUE) {
         assert(sampler.ud >= 16);
         const int sampler_state_size = 16; /* 16 bytes */
@@ -834,26 +812,9 @@
         ubld1.SHL(tmp, tmp, elk_imm_ud(4));
         ubld1.ADD(component(header, 3), sampler_state_ptr, tmp);
      }
-   } else if (devinfo->ver >= 11) {
-      /* Gfx11+ sampler message headers include bits in 4:0 which conflict
-       * with the ones included in g0.3 bits 4:0.  Mask them out.
-       */
-      ubld1.AND(component(header, 3),
-                retype(elk_vec1_grf(0, 3), ELK_REGISTER_TYPE_UD),
-                elk_imm_ud(INTEL_MASK(31, 5)));
   }
 
-   /* Change the opcode to account for LOD being zero before the
-    * switch-statement that emits sources based on the opcode.
-    */
-   if (devinfo->ver >= 9 && lod.is_zero()) {
-      if (op == ELK_SHADER_OPCODE_TXL)
-         op = ELK_SHADER_OPCODE_TXL_LZ;
-      else if (op == ELK_SHADER_OPCODE_TXF)
-         op = ELK_SHADER_OPCODE_TXF_LZ;
-   }
-
   /* On Xe2 and newer platforms, min_lod is the first parameter specifically
    * so that a bunch of other, possibly unused, parameters don't need to also
    * be included.
@@ -910,27 +871,15 @@
      break;
   case ELK_SHADER_OPCODE_TXF:
   case ELK_SHADER_OPCODE_TXF_LZ:
-      /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r.
-       * On Gfx9 they are u, v, lod, r
-       */
+      /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r. */
      bld.MOV(retype(sources[length++], payload_signed_type), coordinate);
-      if (devinfo->ver >= 9) {
-         if (coord_components >= 2) {
-            bld.MOV(retype(sources[length], payload_signed_type),
-                    offset(coordinate, bld, 1));
-         } else {
-            sources[length] = elk_imm_d(0);
-         }
-         length++;
-      }
 
      if (op != ELK_SHADER_OPCODE_TXF_LZ) {
         bld.MOV(retype(sources[length], payload_signed_type), lod);
         length++;
      }
 
-      for (unsigned i = devinfo->ver >= 9 ? 2 : 1; i < coord_components; i++)
+      for (unsigned i = 1; i < coord_components; i++)
        bld.MOV(retype(sources[length++], payload_signed_type),
                offset(coordinate, bld, i));
@@ -964,24 +913,9 @@
          * only valid data is in first two register. So with 16-bit
          * payload, we need to split 2-32bit register into 4-16-bit
          * payload.
-          *
-          * From the Gfx12HP BSpec: Render Engine - 3D and GPGPU Programs -
-          * Shared Functions - 3D Sampler - Messages - Message Format:
-          *
-          *   ld2dms_w   si  mcs0 mcs1 mcs2  mcs3  u  v  r
          */
-         if (devinfo->verx10 >= 125 && op == ELK_SHADER_OPCODE_TXF_CMS_W) {
-            elk_fs_reg tmp = offset(mcs, bld, i);
-            bld.MOV(retype(sources[length++], payload_unsigned_type),
-                    mcs.file == IMM ? mcs :
-                    subscript(tmp, payload_unsigned_type, 0));
-            bld.MOV(retype(sources[length++], payload_unsigned_type),
-                    mcs.file == IMM ? mcs :
-                    subscript(tmp, payload_unsigned_type, 1));
-         } else {
-            bld.MOV(retype(sources[length++], payload_unsigned_type),
-                    mcs.file == IMM ? mcs : offset(mcs, bld, i));
-         }
+         bld.MOV(retype(sources[length++], payload_unsigned_type),
+                 mcs.file == IMM ? mcs : offset(mcs, bld, i));
      }
   }
@@ -1021,29 +955,11 @@
   if (min_lod.file != BAD_FILE) {
      /* Account for all of the missing coordinate sources */
-      if (op == ELK_SHADER_OPCODE_TXD && devinfo->verx10 >= 125) {
-         /* On DG2 and newer platforms, sample_d can only be used with 1D and
-          * 2D surfaces, so the maximum number of gradient components is 2.
-          * In spite of this limitation, the Bspec lists a mysterious R
-          * component before the min_lod, so the maximum coordinate components
-          * is 3.
-          *
-          * See bspec 45942, "Enable new message layout for cube array"
-          */
-         length += 3 - coord_components;
-         length += (2 - grad_components) * 2;
-      } else {
-         length += 4 - coord_components;
-         if (op == ELK_SHADER_OPCODE_TXD)
-            length += (3 - grad_components) * 2;
-      }
+      length += 4 - coord_components;
+      if (op == ELK_SHADER_OPCODE_TXD)
+         length += (3 - grad_components) * 2;
 
      bld.MOV(sources[length++], min_lod);
-
-      /* Wa_14014595444: Populate MLOD as parameter 5 (twice). */
-      if (devinfo->verx10 == 125 && op == ELK_FS_OPCODE_TXB &&
-          !inst->shadow_compare)
-         bld.MOV(sources[length++], min_lod);
   }
 
   const elk_fs_reg src_payload =
@@ -1145,39 +1061,13 @@ get_sampler_msg_payload_type_bit_size(const intel_device_info *devinfo,
   assert(src_type_size == 2 || src_type_size == 4);
 
 #ifndef NDEBUG
-   /* Make sure all sources agree. On gfx12 this doesn't hold when sampling
-    * compressed multisampled surfaces. There the payload contains MCS data
-    * which is already in 16-bits unlike the other parameters that need forced
-    * conversion.
-    */
-   if (devinfo->verx10 < 125 ||
-       (op != ELK_SHADER_OPCODE_TXF_CMS_W &&
-        op != ELK_SHADER_OPCODE_TXF_CMS)) {
-      for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) {
-         assert(src[i].file == BAD_FILE ||
-                elk_reg_type_to_size(src[i].type) == src_type_size);
-      }
+   /* Make sure all sources agree. */
+   for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) {
+      assert(src[i].file == BAD_FILE ||
+             elk_reg_type_to_size(src[i].type) == src_type_size);
   }
 #endif
 
-   if (devinfo->verx10 < 125)
-      return src_type_size * 8;
-
-   /* Force conversion from 32-bit sources to 16-bit payload. From the XeHP Bspec:
-    * 3D and GPGPU Programs - Shared Functions - 3D Sampler - Messages - Message
-    * Format [GFX12:HAS:1209977870] *
-    *
-    *  ld2dms_w   SIMD8H and SIMD16H Only
-    *  ld_mcs     SIMD8H and SIMD16H Only
-    *  ld2dms     REMOVEDBY(GEN:HAS:1406788836)
-    */
-   if (op == ELK_SHADER_OPCODE_TXF_CMS_W ||
-       op == ELK_SHADER_OPCODE_TXF_CMS ||
-       op == ELK_SHADER_OPCODE_TXF_UMS ||
-       op == ELK_SHADER_OPCODE_TXF_MCS)
-      src_type_size = 2;
-
   return src_type_size * 8;
 }
@@ -1211,7 +1101,7 @@ lower_sampler_logical_send(const fs_builder &bld, elk_fs_inst *inst, elk_opcode
      get_sampler_msg_payload_type_bit_size(devinfo, op, inst->src);
 
   /* 16-bit payloads are available only on gfx11+ */
-   assert(msg_payload_type_bit_size != 16 || devinfo->ver >= 11);
+   assert(msg_payload_type_bit_size != 16);
 
   lower_sampler_logical_send_gfx7(bld, inst, op, coordinate,
                                   shadow_c, lod, lod2, min_lod,
@@ -1259,7 +1149,6 @@ emit_predicate_on_vector_mask(const fs_builder &bld, elk_fs_inst *inst)
   assert(inst->predicate == ELK_PREDICATE_NORMAL);
   assert(!inst->predicate_inverse);
   assert(inst->flag_subreg == 0);
-   assert(s.devinfo->ver < 20);
 
   /* Combine the vector mask with the existing predicate by using a
    * vertical predication mode.
    */
@@ -1345,7 +1234,7 @@ lower_surface_logical_send(const fs_builder &bld, elk_fs_inst *inst)
    * For all stateless A32 messages, we also need a header
    */
   elk_fs_reg header;
-   if ((devinfo->ver < 9 && is_typed_access) || is_stateless) {
+   if (is_typed_access || is_stateless) {
      fs_builder ubld = bld.exec_all().group(8, 0);
      header = ubld.vgrf(ELK_REGISTER_TYPE_UD);
      if (is_stateless) {
@@ -1515,71 +1404,6 @@ lower_surface_logical_send(const fs_builder &bld, elk_fs_inst *inst)
   inst->src[1] = payload;
 }
 
-static void
-lower_surface_block_logical_send(const fs_builder &bld, elk_fs_inst *inst)
-{
-   const intel_device_info *devinfo = bld.shader->devinfo;
-   assert(devinfo->ver >= 9);
-
-   /* Get the logical send arguments. */
-   const elk_fs_reg addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
-   const elk_fs_reg src = inst->src[SURFACE_LOGICAL_SRC_DATA];
-   const elk_fs_reg surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
-   const elk_fs_reg surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
-   const elk_fs_reg arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
-   assert(arg.file == IMM);
-   assert(inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == BAD_FILE);
-   assert(inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK].file == BAD_FILE);
-
-   const bool is_stateless =
-      surface.file == IMM && (surface.ud == ELK_BTI_STATELESS ||
-                              surface.ud == GFX8_BTI_STATELESS_NON_COHERENT);
-
-   const bool has_side_effects = inst->has_side_effects();
-
-   const bool align_16B =
-      inst->opcode != ELK_SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL;
-
-   const bool write = inst->opcode == ELK_SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL;
-
-   /* The address is stored in the header.  See MH_A32_GO and MH_BTS_GO. */
-   fs_builder ubld = bld.exec_all().group(8, 0);
-   elk_fs_reg header = ubld.vgrf(ELK_REGISTER_TYPE_UD);
-
-   if (is_stateless)
-      ubld.emit(ELK_SHADER_OPCODE_SCRATCH_HEADER, header);
-   else
-      ubld.MOV(header, elk_imm_d(0));
-
-   /* Address in OWord units when aligned to OWords. */
-   if (align_16B)
-      ubld.group(1, 0).SHR(component(header, 2), addr, elk_imm_ud(4));
-   else
-      ubld.group(1, 0).MOV(component(header, 2), addr);
-
-   elk_fs_reg data;
-   if (write) {
-      const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA);
-      data = retype(bld.move_to_vgrf(src, src_sz), ELK_REGISTER_TYPE_UD);
-   }
-
-   inst->opcode = ELK_SHADER_OPCODE_SEND;
-   inst->mlen = 1;
-   inst->header_size = 1;
-   inst->send_has_side_effects = has_side_effects;
-   inst->send_is_volatile = !has_side_effects;
-   inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
-
-   const uint32_t desc = elk_dp_oword_block_rw_desc(devinfo, align_16B,
-                                                    arg.ud, write);
-
-   setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
-
-   inst->resize_sources(2);
-   inst->src[1] = header;
-}
-
 static void
 emit_fragment_mask(const fs_builder &bld, elk_fs_inst *inst)
 {
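The address handling in the removed lower_surface_block_logical_send() is worth spelling out: per the MH_A32_GO-style header layout it references, DWord 2 of the header carries the offset in oword (16-byte) units for aligned block messages, but a plain byte address for the unaligned read variant. A minimal sketch, illustrative only:

/* What DWord 2 of the block-message header held in the removed path. */
static unsigned
block_header_dw2(unsigned byte_addr, int align_16B)
{
   return align_16B ? byte_addr >> 4   /* oword units, matching the SHR by 4 */
                    : byte_addr;       /* byte address for unaligned reads */
}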
@@ -1965,7 +1789,7 @@ lower_get_buffer_size(const fs_builder &bld, elk_fs_inst *inst)
   /* Since we can only execute this instruction on uniform bti/surface
    * handles, elk_fs_nir.cpp should already have limited this to SIMD8.
    */
-   assert(inst->exec_size == (devinfo->ver < 20 ? 8 : 16));
+   assert(inst->exec_size == 8);
 
   elk_fs_reg surface = inst->src[GET_BUFFER_SIZE_SRC_SURFACE];
   elk_fs_reg surface_handle = inst->src[GET_BUFFER_SIZE_SRC_SURFACE_HANDLE];
@@ -2087,11 +1911,6 @@ elk_fs_visitor::lower_logical_sends()
         lower_surface_logical_send(ibld, inst);
         break;
 
-      case ELK_SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
-      case ELK_SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL:
-         lower_surface_block_logical_send(ibld, inst);
-         break;
-
      case ELK_SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
      case ELK_SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
      case ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:


@@ -321,38 +321,6 @@ elk_nir_lower_cs_intrinsics(nir_shader *nir,
      }
   }
 
-   if (devinfo->verx10 >= 125 && prog_data &&
-       nir->info.stage == MESA_SHADER_COMPUTE &&
-       nir->info.cs.derivative_group != DERIVATIVE_GROUP_QUADS &&
-       !nir->info.workgroup_size_variable &&
-       util_is_power_of_two_nonzero(nir->info.workgroup_size[0]) &&
-       util_is_power_of_two_nonzero(nir->info.workgroup_size[1])) {
-      state.hw_generated_local_id = true;
-
-      /* TODO: more heuristics about 1D/SLM access vs. 2D access */
-      bool linear =
-         BITSET_TEST(nir->info.system_values_read,
-                     SYSTEM_VALUE_LOCAL_INVOCATION_INDEX) ||
-         (nir->info.workgroup_size[1] == 1 &&
-          nir->info.workgroup_size[2] == 1) ||
-         (nir->info.num_images == 0 && nir->info.num_textures == 0);
-      prog_data->walk_order =
-         linear ? INTEL_WALK_ORDER_XYZ : INTEL_WALK_ORDER_YXZ;
-
-      /* nir_lower_compute_system_values will replace any references to
-       * SYSTEM_VALUE_LOCAL_INVOCATION_ID vector components with zero for
-       * any dimension where the workgroup size is 1, so we can skip
-       * generating those.  However, the hardware can only generate
-       * X, XY, or XYZ - it can't skip earlier components.
-       */
-      prog_data->generate_local_id =
-         (nir->info.workgroup_size[0] > 1 ? WRITEMASK_X : 0) |
-         (nir->info.workgroup_size[1] > 1 ? WRITEMASK_XY : 0) |
-         (nir->info.workgroup_size[2] > 1 ? WRITEMASK_XYZ : 0);
-   }
-
   nir_foreach_function_impl(impl, nir) {
      state.impl = impl;
      lower_cs_intrinsics_convert_impl(&state);
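The removed writemask computation is subtle enough to deserve an example: hardware local-ID generation can emit X, XY, or XYZ but cannot skip leading components, so a workgroup of, say, (8, 1, 4) still needs XYZ generated even though Y is constant zero. A standalone sketch, with writemask values as in mesa's enums:

enum { WRITEMASK_X = 0x1, WRITEMASK_XY = 0x3, WRITEMASK_XYZ = 0x7 };

static unsigned
generate_local_id_mask(const unsigned size[3])
{
   /* Any higher dimension with size > 1 forces all lower ones too. */
   return (size[0] > 1 ? WRITEMASK_X   : 0) |
          (size[1] > 1 ? WRITEMASK_XY  : 0) |
          (size[2] > 1 ? WRITEMASK_XYZ : 0);
}
/* e.g. {8, 1, 4} -> WRITEMASK_XYZ, {8, 8, 1} -> WRITEMASK_XY */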