intel/brw: Remove Gfx8- code from backend passes

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27691>
2025-12-22 20:00:10 +01:00 · 2024-02-15 13:19:08 -08:00 · 2024-02-15 13:19:08 -08:00 · 7ac5696157
commit 7ac5696157
parent 9569ea82a8
10 changed files with 64 additions and 276 deletions
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@ -2341,10 +2341,10 @@ fs_visitor::dump_instruction_to_file(const backend_instruction *be_inst, FILE *f
   if (inst->conditional_mod) {
      fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
      if (!inst->predicate &&
-          (devinfo->ver < 5 || (inst->opcode != BRW_OPCODE_SEL &&
+          (inst->opcode != BRW_OPCODE_SEL &&
-                                inst->opcode != BRW_OPCODE_CSEL &&
+           inst->opcode != BRW_OPCODE_CSEL &&
-                                inst->opcode != BRW_OPCODE_IF &&
+           inst->opcode != BRW_OPCODE_IF &&
-                                inst->opcode != BRW_OPCODE_WHILE))) {
+           inst->opcode != BRW_OPCODE_WHILE)) {
         fprintf(file, ".f%d.%d", inst->flag_subreg / 2,
                 inst->flag_subreg % 2);
      }
--- a/src/intel/compiler/brw_fs_bank_conflicts.cpp
+++ b/src/intel/compiler/brw_fs_bank_conflicts.cpp
@ -549,8 +549,7 @@ namespace {
       * Register allocation ensures that, so don't move 127 around to avoid
       * breaking that property.
       */
-      if (v->devinfo->ver >= 8)
+      constrained[p.atom_of_reg(127)] = true;
         constrained[p.atom_of_reg(127)] = true;
      foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
         /* Assume that anything referenced via fixed GRFs is baked into the
@ -567,24 +566,14 @@ namespace {
               constrained[p.atom_of_reg(reg_of(inst->src[i]))] = true;
         }
         /* Preserve the original allocation of VGRFs used by the barycentric
          * source of the LINTERP instruction on Gfx6, since pair-aligned
          * barycentrics allow the PLN instruction to be used.
          */
         if (v->devinfo->has_pln && v->devinfo->ver <= 6 &&
             inst->opcode == FS_OPCODE_LINTERP)
            constrained[p.atom_of_reg(reg_of(inst->src[0]))] = true;
         /* The location of the Gfx7 MRF hack registers is hard-coded in the
          * rest of the compiler back-end.  Don't attempt to move them around.
          */
-         if (v->devinfo->ver >= 7) {
+         assert(inst->dst.file != MRF);
            assert(inst->dst.file != MRF);
-            for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
+         for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
-               const unsigned reg = GFX7_MRF_HACK_START + inst->base_mrf + i;
+            const unsigned reg = GFX7_MRF_HACK_START + inst->base_mrf + i;
-               constrained[p.atom_of_reg(reg)] = true;
+            constrained[p.atom_of_reg(reg)] = true;
            }
         }
      }
@ -600,10 +589,10 @@ namespace {
   is_conflict_optimized_out(const intel_device_info *devinfo,
                             const fs_inst *inst)
   {
-      return devinfo->ver >= 9 &&
+      return
-         ((is_grf(inst->src[0]) && (reg_of(inst->src[0]) == reg_of(inst->src[1]) ||
+         (is_grf(inst->src[0]) && (reg_of(inst->src[0]) == reg_of(inst->src[1]) ||
-                                    reg_of(inst->src[0]) == reg_of(inst->src[2]))) ||
+                                   reg_of(inst->src[0]) == reg_of(inst->src[2]))) ||
-          reg_of(inst->src[1]) == reg_of(inst->src[2]));
+          reg_of(inst->src[1]) == reg_of(inst->src[2]);
   }
   /**
@ -915,10 +904,6 @@ brw_fs_opt_bank_conflicts(fs_visitor &s)
   if (s.devinfo->ver >= 20)
      return false;
   /* No ternary instructions -- No bank conflicts. */
   if (s.devinfo->ver < 6)
      return false;
   const partitioning p = shader_reg_partitioning(&s);
   const bool *constrained = shader_reg_constraints(&s, p);
   const weight_vector_type *conflicts =
--- a/src/intel/compiler/brw_fs_cmod_propagation.cpp
+++ b/src/intel/compiler/brw_fs_cmod_propagation.cpp
@ -451,18 +451,10 @@ opt_cmod_propagation_local(const intel_device_info *devinfo, bblock_t *block)
                     break;
                  }
               } else if (scan_inst->conditional_mod == inst->conditional_mod) {
-                  /* On Gfx4 and Gfx5 sel.cond will dirty the flags, but the
+                  /* sel.cond will not write the flags. */
-                   * flags value is not based on the result stored in the
+                  assert(scan_inst->opcode != BRW_OPCODE_SEL);
-                   * destination.  On all other platforms sel.cond will not
+                  inst->remove(block, true);
-                   * write the flags, so execution will not get to this point.
+                  progress = true;
                   */
                  if (scan_inst->opcode == BRW_OPCODE_SEL) {
                     assert(devinfo->ver <= 5);
                  } else {
                     inst->remove(block, true);
                     progress = true;
                  }
                  break;
               } else if (!read_flag && scan_inst->can_do_cmod()) {
                  scan_inst->conditional_mod = inst->conditional_mod;
--- a/src/intel/compiler/brw_fs_combine_constants.cpp
+++ b/src/intel/compiler/brw_fs_combine_constants.cpp
@ -764,30 +764,6 @@ brw_combine_constants(struct value *candidates, unsigned num_candidates)
   return combine_constants_greedy(candidates, num_candidates);
 }
 /* Returns whether an instruction could co-issue if its immediate source were
 * replaced with a GRF source.
 */
 static bool
 could_coissue(const struct intel_device_info *devinfo, const fs_inst *inst)
 {
   assert(inst->opcode == BRW_OPCODE_MOV ||
          inst->opcode == BRW_OPCODE_CMP ||
          inst->opcode == BRW_OPCODE_ADD ||
          inst->opcode == BRW_OPCODE_MUL);
   if (devinfo->ver != 7)
      return false;
   /* Only float instructions can coissue.  We don't have a great
    * understanding of whether or not something like float(int(a) + int(b))
    * would be considered float (based on the destination type) or integer
    * (based on the source types), so we take the conservative choice of
    * only promoting when both destination and source are float.
    */
   return inst->dst.type == BRW_REGISTER_TYPE_F &&
          inst->src[0].type == BRW_REGISTER_TYPE_F;
 }
 /**
 * Box for storing fs_inst and some other necessary data
 *
@ -1346,12 +1322,6 @@ brw_fs_opt_combine_constants(fs_visitor &s)
            add_candidate_immediate(&table, inst, ip, 0, true, false, block,
                                    devinfo, const_ctx);
         }
         if (inst->src[1].file == IMM && devinfo->ver < 8) {
            add_candidate_immediate(&table, inst, ip, 1, true, false, block,
                                    devinfo, const_ctx);
         }
         break;
      case BRW_OPCODE_ADD3:
@ -1418,24 +1388,6 @@ brw_fs_opt_combine_constants(fs_visitor &s)
         }
         break;
      case BRW_OPCODE_MOV:
         if (could_coissue(devinfo, inst) && inst->src[0].file == IMM) {
            add_candidate_immediate(&table, inst, ip, 0, false, false, block,
                                    devinfo, const_ctx);
         }
         break;
      case BRW_OPCODE_CMP:
      case BRW_OPCODE_ADD:
      case BRW_OPCODE_MUL:
         assert(inst->src[0].file != IMM);
         if (could_coissue(devinfo, inst) && inst->src[1].file == IMM) {
            add_candidate_immediate(&table, inst, ip, 1, false, false, block,
                                    devinfo, const_ctx);
         }
         break;
      default:
         break;
      }
@ -1552,47 +1504,21 @@ brw_fs_opt_combine_constants(fs_visitor &s)
   if (s.cfg->num_blocks != 1)
      qsort(table.imm, table.len, sizeof(struct imm), compare);
-   if (devinfo->ver > 7) {
+   struct register_allocation *regs =
-      struct register_allocation *regs =
+      (struct register_allocation *) calloc(table.len, sizeof(regs[0]));
         (struct register_allocation *) calloc(table.len, sizeof(regs[0]));
-      for (int i = 0; i < table.len; i++) {
+   for (int i = 0; i < table.len; i++) {
-         regs[i].nr = UINT_MAX;
+      regs[i].nr = UINT_MAX;
-         regs[i].avail = 0xffff;
+      regs[i].avail = 0xffff;
      }
      foreach_block(block, s.cfg) {
         parcel_out_registers(table.imm, table.len, block, regs, table.len,
                              s.alloc, devinfo->ver);
      }
      free(regs);
   } else {
      fs_reg reg(VGRF, s.alloc.allocate(1));
      reg.stride = 0;
      for (int i = 0; i < table.len; i++) {
         struct imm *imm = &table.imm[i];
         /* Put the immediate in an offset aligned to its size. Some
          * instructions seem to have additional alignment requirements, so
          * account for that too.
          */
         reg.offset = ALIGN(reg.offset, get_alignment_for_imm(imm));
         /* Ensure we have enough space in the register to copy the immediate */
         if (reg.offset + imm->size > REG_SIZE) {
            reg.nr = s.alloc.allocate(1);
            reg.offset = 0;
         }
         imm->nr = reg.nr;
         imm->subreg_offset = reg.offset;
         reg.offset += imm->size;
      }
   }
   foreach_block(block, s.cfg) {
      parcel_out_registers(table.imm, table.len, block, regs, table.len,
                           s.alloc, devinfo->ver);
   }
   free(regs);
   bool rebuild_cfg = false;
   /* Insert MOVs to load the constant values into GRFs. */
@ -1661,7 +1587,7 @@ brw_fs_opt_combine_constants(fs_visitor &s)
       * replicating the single one we want. To avoid this, we always populate
       * both HF slots within a DWord with the constant.
       */
-      const uint32_t width = devinfo->ver == 8 && imm->is_half_float ? 2 : 1;
+      const uint32_t width = 1;
      const fs_builder ibld = fs_builder(&s, width).at(insert_block, n).exec_all();
      fs_reg reg(VGRF, imm->nr);
--- a/src/intel/compiler/brw_fs_copy_propagation.cpp
+++ b/src/intel/compiler/brw_fs_copy_propagation.cpp
@ -630,14 +630,8 @@ can_take_stride(fs_inst *inst, brw_reg_type dst_type,
    * are sends, so the sources are moved to MRF's and there are no
    * restrictions.
    */
-   if (inst->is_math()) {
+   if (inst->is_math())
-      if (devinfo->ver == 6 || devinfo->ver == 7) {
+      return stride == inst->dst.stride || stride == 0;
         assert(inst->dst.stride == 1);
         return stride == 1 || stride == 0;
      } else if (devinfo->ver >= 8) {
         return stride == inst->dst.stride || stride == 0;
      }
   }
   return true;
 }
@ -725,15 +719,6 @@ try_copy_propagate(const brw_compiler *compiler, fs_inst *inst,
      }
   }
   /* Avoid propagating odd-numbered FIXED_GRF registers into the first source
    * of a LINTERP instruction on platforms where the PLN instruction has
    * register alignment restrictions.
    */
   if (devinfo->has_pln && devinfo->ver <= 6 &&
       entry->src.file == FIXED_GRF && (entry->src.nr & 1) &&
       inst->opcode == FS_OPCODE_LINTERP && arg == 0)
      return false;
   /* we can't generally copy-propagate UD negations because we
    * can end up accessing the resulting values as signed integers
    * instead. See also resolve_ud_negate() and comment in
@ -750,8 +735,7 @@ try_copy_propagate(const brw_compiler *compiler, fs_inst *inst,
   /* Reject cases that would violate register regioning restrictions. */
   if ((entry->src.file == UNIFORM || !entry->src.is_contiguous()) &&
-       ((devinfo->ver == 6 && inst->is_math()) ||
+       (inst->is_send_from_grf() ||
        inst->is_send_from_grf() ||
        inst->uses_indirect_addressing())) {
      return false;
   }
@ -867,7 +851,7 @@ try_copy_propagate(const brw_compiler *compiler, fs_inst *inst,
        type_sz(entry->dst.type) != type_sz(inst->src[arg].type)))
      return false;
-   if (devinfo->ver >= 8 && (entry->src.negate || entry->src.abs) &&
+   if ((entry->src.negate || entry->src.abs) &&
       is_logic_op(inst->opcode)) {
      return false;
   }
@ -946,7 +930,6 @@ static bool
 try_constant_propagate(const brw_compiler *compiler, fs_inst *inst,
                       acp_entry *entry, int arg)
 {
   const struct intel_device_info *devinfo = compiler->devinfo;
   bool progress = false;
   if (type_sz(entry->src.type) > 4)
@ -1002,14 +985,14 @@ try_constant_propagate(const brw_compiler *compiler, fs_inst *inst,
   val.type = inst->src[arg].type;
   if (inst->src[arg].abs) {
-      if ((devinfo->ver >= 8 && is_logic_op(inst->opcode)) ||
+      if (is_logic_op(inst->opcode) ||
          !brw_abs_immediate(val.type, &val.as_brw_reg())) {
         return false;
      }
   }
   if (inst->src[arg].negate) {
-      if ((devinfo->ver >= 8 && is_logic_op(inst->opcode)) ||
+      if (is_logic_op(inst->opcode) ||
          !brw_negate_immediate(val.type, &val.as_brw_reg())) {
         return false;
      }
@ -1024,13 +1007,6 @@ try_constant_propagate(const brw_compiler *compiler, fs_inst *inst,
      break;
   case SHADER_OPCODE_POW:
      /* Allow constant propagation into src1 (except on Gen 6 which
       * doesn't support scalar source math), and let constant combining
       * promote the constant on Gen < 8.
       */
      if (devinfo->ver == 6)
         break;
      if (arg == 1) {
         inst->src[arg] = val;
         progress = true;
@ -1190,15 +1166,6 @@ try_constant_propagate(const brw_compiler *compiler, fs_inst *inst,
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      /* Allow constant propagation into either source (except on Gen 6
       * which doesn't support scalar source math). Constant combining
       * promote the src1 constant on Gen < 8, and it will promote the src0
       * constant on all platforms.
       */
      if (devinfo->ver == 6)
         break;
      FALLTHROUGH;
   case BRW_OPCODE_AND:
   case BRW_OPCODE_ASR:
   case BRW_OPCODE_BFE:
--- a/src/intel/compiler/brw_fs_lower.cpp
+++ b/src/intel/compiler/brw_fs_lower.cpp
@ -334,13 +334,12 @@ bool
 brw_fs_lower_barycentrics(fs_visitor &s)
 {
   const intel_device_info *devinfo = s.devinfo;
   const bool has_interleaved_layout = devinfo->has_pln ||
      (devinfo->ver >= 7 && devinfo->ver < 20);
   bool progress = false;
-   if (s.stage != MESA_SHADER_FRAGMENT || !has_interleaved_layout)
+   if (s.stage != MESA_SHADER_FRAGMENT || devinfo->ver >= 20)
      return false;
   bool progress = false;
   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->exec_size < 16)
         continue;
@ -461,9 +460,6 @@ brw_fs_lower_find_live_channel(fs_visitor &s)
 {
   bool progress = false;
   if (s.devinfo->ver < 8)
      return false;
   bool packed_dispatch =
      brw_stage_has_packed_dispatch(s.devinfo, s.stage, s.max_polygons,
                                    s.stage_prog_data);
--- a/src/intel/compiler/brw_fs_lower_integer_multiplication.cpp
+++ b/src/intel/compiler/brw_fs_lower_integer_multiplication.cpp
@ -150,23 +150,16 @@ brw_fs_lower_mul_dword_inst(fs_visitor &s, fs_inst *inst, bblock_t *block)
    */
   if (inst->src[1].file == IMM &&
       (inst->src[1].d >= INT16_MIN && inst->src[1].d <= UINT16_MAX)) {
-      /* The MUL instruction isn't commutative. On Gen <= 6, only the low
+      /* The MUL instruction isn't commutative. On Gen >= 7 only
-       * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
+       * the low 16-bits of src1 are used.
       * src1 are used.
       *
       * If multiplying by an immediate value that fits in 16-bits, do a
       * single MUL instruction with that value in the proper location.
       */
      const bool ud = (inst->src[1].d >= 0);
-      if (devinfo->ver < 7) {
+      ibld.MUL(inst->dst, inst->src[0],
-         fs_reg imm(VGRF, s.alloc.allocate(s.dispatch_width / 8), inst->dst.type);
+               ud ? brw_imm_uw(inst->src[1].ud)
-         ibld.MOV(imm, inst->src[1]);
+                  : brw_imm_w(inst->src[1].d));
         ibld.MUL(inst->dst, imm, inst->src[0]);
      } else {
         ibld.MUL(inst->dst, inst->src[0],
                  ud ? brw_imm_uw(inst->src[1].ud)
                     : brw_imm_w(inst->src[1].d));
      }
   } else {
      /* Gen < 8 (and some Gfx8+ low-power parts like Cherryview) cannot
       * do 32-bit integer multiplication in one instruction, but instead
@ -239,7 +232,7 @@ brw_fs_lower_mul_dword_inst(fs_visitor &s, fs_inst *inst, bblock_t *block)
      high.offset = inst->dst.offset % REG_SIZE;
      bool do_addition = true;
-      if (devinfo->ver >= 7) {
+      {
         /* From Wa_1604601757:
          *
          * "When multiplying a DW and any lower precision integer, source modifier
@ -294,14 +287,6 @@ brw_fs_lower_mul_dword_inst(fs_visitor &s, fs_inst *inst, bblock_t *block)
            ibld.MUL(high, inst->src[0],
                     subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 1));
         }
      } else {
         if (inst->src[0].abs)
            lower_src_modifiers(&s, block, inst, 0);
         ibld.MUL(low, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 0),
                  inst->src[1]);
         ibld.MUL(high, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 1),
                  inst->src[1]);
      }
      if (do_addition) {
@ -399,7 +384,7 @@ brw_fs_lower_mulh_inst(fs_visitor &s, fs_inst *inst, bblock_t *block)
    *      mul (8) acc0:d r2.0<8;8,1>:d r3.0<16;8,2>:uw
    *      mach (8) r5.0<1>:d r2.0<8;8,1>:d r3.0<8;8,1>:d"
    */
-   if (devinfo->ver >= 8 && (inst->src[1].negate || inst->src[1].abs))
+   if (inst->src[1].negate || inst->src[1].abs)
      lower_src_modifiers(&s, block, inst, 1);
   /* Should have been lowered to 8-wide. */
@ -408,47 +393,23 @@ brw_fs_lower_mulh_inst(fs_visitor &s, fs_inst *inst, bblock_t *block)
   const fs_reg acc = suboffset(retype(brw_acc_reg(inst->exec_size), inst->dst.type),
                                inst->group % acc_width);
   fs_inst *mul = ibld.MUL(acc, inst->src[0], inst->src[1]);
-   fs_inst *mach = ibld.MACH(inst->dst, inst->src[0], inst->src[1]);
+   ibld.MACH(inst->dst, inst->src[0], inst->src[1]);
-   if (devinfo->ver >= 8) {
+   /* Until Gfx8, integer multiplies read 32-bits from one source,
-      /* Until Gfx8, integer multiplies read 32-bits from one source,
+    * and 16-bits from the other, and relying on the MACH instruction
-       * and 16-bits from the other, and relying on the MACH instruction
+    * to generate the high bits of the result.
-       * to generate the high bits of the result.
+    *
-       *
+    * On Gfx8, the multiply instruction does a full 32x32-bit
-       * On Gfx8, the multiply instruction does a full 32x32-bit
+    * multiply, but in order to do a 64-bit multiply we can simulate
-       * multiply, but in order to do a 64-bit multiply we can simulate
+    * the previous behavior and then use a MACH instruction.
-       * the previous behavior and then use a MACH instruction.
+    */
-       */
+   assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
-      assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
+          mul->src[1].type == BRW_REGISTER_TYPE_UD);
-             mul->src[1].type == BRW_REGISTER_TYPE_UD);
+   mul->src[1].type = BRW_REGISTER_TYPE_UW;
-      mul->src[1].type = BRW_REGISTER_TYPE_UW;
+   mul->src[1].stride *= 2;
      mul->src[1].stride *= 2;
-      if (mul->src[1].file == IMM) {
+   if (mul->src[1].file == IMM) {
-         mul->src[1] = brw_imm_uw(mul->src[1].ud);
+      mul->src[1] = brw_imm_uw(mul->src[1].ud);
      }
   } else if (devinfo->verx10 == 70 &&
              inst->group > 0) {
      /* Among other things the quarter control bits influence which
       * accumulator register is used by the hardware for instructions
       * that access the accumulator implicitly (e.g. MACH).  A
       * second-half instruction would normally map to acc1, which
       * doesn't exist on Gfx7 and up (the hardware does emulate it for
       * floating-point instructions *only* by taking advantage of the
       * extra precision of acc0 not normally used for floating point
       * arithmetic).
       *
       * HSW and up are careful enough not to try to access an
       * accumulator register that doesn't exist, but on earlier Gfx7
       * hardware we need to make sure that the quarter control bits are
       * zero to avoid non-deterministic behaviour and emit an extra MOV
       * to get the result masked correctly according to the current
       * channel enables.
       */
      mach->group = 0;
      mach->force_writemask_all = true;
      mach->dst = ibld.vgrf(inst->dst.type);
      ibld.MOV(inst->dst, mach->dst);
   }
 }
@ -463,13 +424,8 @@ brw_fs_lower_integer_multiplication(fs_visitor &s)
         /* If the instruction is already in a form that does not need lowering,
          * return early.
          */
-         if (s.devinfo->ver >= 7) {
+         if (type_sz(inst->src[1].type) < 4 && type_sz(inst->src[0].type) <= 4)
-            if (type_sz(inst->src[1].type) < 4 && type_sz(inst->src[0].type) <= 4)
+            continue;
               continue;
         } else {
            if (type_sz(inst->src[0].type) < 4 && type_sz(inst->src[1].type) <= 4)
               continue;
         }
         if ((inst->dst.type == BRW_REGISTER_TYPE_Q ||
              inst->dst.type == BRW_REGISTER_TYPE_UQ) &&
--- a/src/intel/compiler/brw_fs_lower_pack.cpp
+++ b/src/intel/compiler/brw_fs_lower_pack.cpp
@ -64,13 +64,6 @@ brw_fs_lower_pack(fs_visitor &s)
               const uint32_t half = _mesa_float_to_half(inst->src[i].f);
               ibld.MOV(subscript(dst, BRW_REGISTER_TYPE_UW, i),
                        brw_imm_uw(half));
            } else if (i == 1 && s.devinfo->ver < 9) {
               /* Pre-Skylake requires DWord aligned destinations */
               fs_reg tmp = ibld.vgrf(BRW_REGISTER_TYPE_UD);
               ibld.F32TO16(subscript(tmp, BRW_REGISTER_TYPE_HF, 0),
                            inst->src[i]);
               ibld.MOV(subscript(dst, BRW_REGISTER_TYPE_UW, 1),
                        subscript(tmp, BRW_REGISTER_TYPE_UW, 0));
            } else {
               ibld.F32TO16(subscript(dst, BRW_REGISTER_TYPE_HF, i),
                            inst->src[i]);
--- a/src/intel/compiler/brw_fs_lower_regioning.cpp
+++ b/src/intel/compiler/brw_fs_lower_regioning.cpp
@ -184,7 +184,6 @@ namespace {
          * support 64-bit types at all.
          */
         if ((!has_64bit || devinfo->verx10 >= 125 ||
              devinfo->platform == INTEL_PLATFORM_CHV ||
              intel_device_info_is_9lp(devinfo)) && type_sz(t) > 4)
            return BRW_REGISTER_TYPE_UD;
         else
@ -192,9 +191,7 @@ namespace {
      case SHADER_OPCODE_BROADCAST:
      case SHADER_OPCODE_MOV_INDIRECT:
-         if (((devinfo->verx10 == 70 ||
+         if (((intel_device_info_is_9lp(devinfo) ||
               devinfo->platform == INTEL_PLATFORM_CHV ||
               intel_device_info_is_9lp(devinfo) ||
               devinfo->verx10 >= 125) && type_sz(inst->src[0].type) > 4) ||
             (devinfo->verx10 >= 125 &&
              brw_reg_type_is_floating_point(inst->src[0].type)))
@ -258,24 +255,6 @@ namespace {
         return false;
      }
      /* Empirical testing shows that Broadwell has a bug affecting half-float
       * MAD instructions when any of its sources has a non-zero offset, such
       * as:
       *
       * mad(8) g18<1>HF -g17<4,4,1>HF g14.8<4,4,1>HF g11<4,4,1>HF { align16 1Q };
       *
       * We used to generate code like this for SIMD8 executions where we
       * used to pack components Y and W of a vector at offset 16B of a SIMD
       * register. The problem doesn't occur if the stride of the source is 0.
       */
      if (devinfo->ver == 8 &&
          inst->opcode == BRW_OPCODE_MAD &&
          inst->src[i].type == BRW_REGISTER_TYPE_HF &&
          reg_offset(inst->src[i]) % REG_SIZE > 0 &&
          inst->src[i].stride != 0) {
         return true;
      }
      const unsigned dst_byte_offset = reg_offset(inst->dst) % (reg_unit(devinfo) * REG_SIZE);
      const unsigned src_byte_offset = reg_offset(inst->src[i]) % (reg_unit(devinfo) * REG_SIZE);
--- a/src/intel/compiler/brw_fs_opt.cpp
+++ b/src/intel/compiler/brw_fs_opt.cpp
@ -190,9 +190,6 @@ load_payload_sources_read_for_size(fs_inst *lp, unsigned size_read)
 bool
 brw_fs_opt_zero_samples(fs_visitor &s)
 {
   /* Implementation supports only SENDs, so applicable to Gfx7+ only. */
   assert(s.devinfo->ver >= 7);
   bool progress = false;
   foreach_block_and_inst(block, fs_inst, send, s.cfg) {
@ -268,9 +265,6 @@ brw_fs_opt_zero_samples(fs_visitor &s)
 bool
 brw_fs_opt_split_sends(fs_visitor &s)
 {
   if (s.devinfo->ver < 9)
      return false;
   bool progress = false;
   foreach_block_and_inst(block, fs_inst, send, s.cfg) {