intel/brw: Lower MEMORY_OPCODE_*_LOGICAL to HDC messages

This is more complicated than the LSC lowering.  We map the
MEMORY_*_LOGICAL opcodes to the older HDC messages: typed and untyped
surface read/write/atomic (whether float or integer), DWord and Byte
scattered messages, OWord block, and A64, BTI, and stateless messages.

- MEMORY_MODE_* is used to select stateless-scratch, typed, or untyped.
- MEMORY_FLAG_TRANSPOSE is used to select block access.
- MEMORY_BINDING_TYPE = FLAT and 64-bit address size selects A64.
- Alignment and data type size select between byte/dword scattered or
  surface messages.

While we may not be able to handle the full generality of message
possibilities, we can handle everything we generate currently.  The plan
here is to assert/validate that we don't generate MEMORY_*_LOGICAL ops
on HDC-based platforms which can't support those particular messages.

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Acked-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
This commit is contained in:
Kenneth Graunke 2024-08-06 15:36:52 -07:00 committed by Marge Bot
parent 3255c9cc49
commit dc4770b005

View file

@ -1567,10 +1567,292 @@ lower_lsc_memory_logical_send(const fs_builder &bld, fs_inst *inst)
inst->src[3] = payload2;
}
/* Builds the single-GRF message header holding the A64 address for an
 * OWord block read/write; defined later in this file.
 */
static brw_reg
emit_a64_oword_block_header(const fs_builder &bld, const brw_reg &addr);
static void
lower_hdc_memory_logical_send(const fs_builder &bld, fs_inst *inst)
{
unreachable("Not implemented yet");
const intel_device_info *devinfo = bld.shader->devinfo;
const brw_compiler *compiler = bld.shader->compiler;
assert(inst->src[MEMORY_LOGICAL_OPCODE].file == IMM);
assert(inst->src[MEMORY_LOGICAL_MODE].file == IMM);
assert(inst->src[MEMORY_LOGICAL_BINDING_TYPE].file == IMM);
assert(inst->src[MEMORY_LOGICAL_COORD_COMPONENTS].file == IMM);
assert(inst->src[MEMORY_LOGICAL_DATA_SIZE].file == IMM);
assert(inst->src[MEMORY_LOGICAL_FLAGS].file == IMM);
/* Get the logical send arguments. */
const enum lsc_opcode op = (lsc_opcode)inst->src[MEMORY_LOGICAL_OPCODE].ud;
const enum memory_logical_mode mode =
(enum memory_logical_mode) inst->src[MEMORY_LOGICAL_MODE].ud;
enum lsc_addr_surface_type binding_type =
(enum lsc_addr_surface_type) inst->src[MEMORY_LOGICAL_BINDING_TYPE].ud;
brw_reg binding = inst->src[MEMORY_LOGICAL_BINDING];
const brw_reg addr = inst->src[MEMORY_LOGICAL_ADDRESS];
const unsigned coord_components =
inst->src[MEMORY_LOGICAL_COORD_COMPONENTS].ud;
const unsigned alignment = inst->src[MEMORY_LOGICAL_ALIGNMENT].ud;
const unsigned components = inst->src[MEMORY_LOGICAL_COMPONENTS].ud;
const enum memory_flags flags =
(enum memory_flags) inst->src[MEMORY_LOGICAL_FLAGS].ud;
const bool block = flags & MEMORY_FLAG_TRANSPOSE;
const bool include_helpers = flags & MEMORY_FLAG_INCLUDE_HELPERS;
const brw_reg data0 = inst->src[MEMORY_LOGICAL_DATA0];
const brw_reg data1 = inst->src[MEMORY_LOGICAL_DATA1];
const bool has_side_effects = inst->has_side_effects();
const bool has_dest = inst->dst.file != BAD_FILE && !inst->dst.is_null();
/* Don't predicate scratch writes on the sample mask. Otherwise,
* FS helper invocations would load undefined values from scratch memory.
* And scratch memory load/stores are produced from operations without
* side-effects, thus they should not have different behavior in the
* helper invocations.
*/
bool allow_sample_mask = has_side_effects && mode != MEMORY_MODE_SCRATCH;
const enum lsc_data_size data_size =
(enum lsc_data_size) inst->src[MEMORY_LOGICAL_DATA_SIZE].ud;
/* unpadded data size */
const uint32_t data_bit_size =
data_size == LSC_DATA_SIZE_D8U32 ? 8 :
data_size == LSC_DATA_SIZE_D16U32 ? 16 :
8 * lsc_data_size_bytes(data_size);
const bool byte_scattered =
data_bit_size < 32 || (alignment != 0 && alignment < 4);
const bool dword_scattered = !byte_scattered && mode == MEMORY_MODE_SCRATCH;
const bool surface_access = !byte_scattered && !dword_scattered && !block;
/* SLM block reads must use the 16B-aligned OWord Block Read messages,
* as the unaligned message doesn't exist for SLM.
*/
const bool oword_aligned = block && mode == MEMORY_MODE_SHARED_LOCAL;
assert(!oword_aligned || (alignment % 16) == 0);
enum lsc_addr_size addr_size = lsc_addr_size_for_type(addr.type);
unsigned addr_size_B = coord_components * lsc_addr_size_bytes(addr_size);
brw_reg header;
fs_builder ubld8 = bld.exec_all().group(8, 0);
fs_builder ubld1 = ubld8.group(1, 0);
if (mode == MEMORY_MODE_SCRATCH) {
header = ubld8.vgrf(BRW_TYPE_UD);
ubld8.emit(SHADER_OPCODE_SCRATCH_HEADER, header, brw_ud8_grf(0, 0));
} else if (block) {
if (addr_size == LSC_ADDR_SIZE_A64) {
header = emit_a64_oword_block_header(bld, addr);
} else {
header = ubld8.vgrf(BRW_TYPE_UD);
ubld8.MOV(header, brw_imm_ud(0));
if (oword_aligned)
ubld1.SHR(component(header, 2), addr, brw_imm_ud(4));
else
ubld1.MOV(component(header, 2), addr);
}
}
/* If we're a fragment shader, we have to predicate with the sample mask to
* avoid helper invocations to avoid helper invocations in instructions
* with side effects, unless they are explicitly required.
*
* There are also special cases when we actually want to run on helpers
* (ray queries).
*/
if (bld.shader->stage == MESA_SHADER_FRAGMENT) {
if (include_helpers)
emit_predicate_on_vector_mask(bld, inst);
else if (allow_sample_mask &&
(header.file == BAD_FILE || !surface_access))
brw_emit_predicate_on_sample_mask(bld, inst);
}
brw_reg payload, payload2;
unsigned mlen, ex_mlen = 0;
if (!block) {
brw_reg data[11];
unsigned num_sources = 0;
if (header.file != BAD_FILE)
data[num_sources++] = header;
for (unsigned i = 0; i < coord_components; i++)
data[num_sources++] = offset(addr, inst->exec_size, i);
if (data0.file != BAD_FILE) {
for (unsigned i = 0; i < components; i++)
data[num_sources++] = offset(data0, inst->exec_size, i);
if (data1.file != BAD_FILE) {
for (unsigned i = 0; i < components; i++)
data[num_sources++] = offset(data1, inst->exec_size, i);
}
}
assert(num_sources <= ARRAY_SIZE(data));
unsigned payload_size_UDs = (header.file != BAD_FILE ? 1 : 0) +
(addr_size_B / 4) +
(lsc_op_num_data_values(op) * components *
lsc_data_size_bytes(data_size) / 4);
payload = bld.vgrf(BRW_TYPE_UD, payload_size_UDs);
fs_inst *load_payload =
emit_load_payload_with_padding(bld, payload, data, num_sources,
header.file != BAD_FILE ? 1 : 0,
REG_SIZE);
mlen = load_payload->size_written / REG_SIZE;
} else {
assert(data1.file == BAD_FILE);
payload = header;
mlen = 1;
if (data0.file != BAD_FILE) {
payload2 = bld.move_to_vgrf(data0, components);
ex_mlen = components * sizeof(uint32_t) / REG_SIZE;
}
}
if (mode == MEMORY_MODE_SHARED_LOCAL) {
binding_type = LSC_ADDR_SURFTYPE_BTI;
binding = brw_imm_ud(GFX7_BTI_SLM);
} else if (mode == MEMORY_MODE_SCRATCH) {
binding_type = LSC_ADDR_SURFTYPE_BTI;
binding = brw_imm_ud(GFX8_BTI_STATELESS_NON_COHERENT);
}
uint32_t sfid, desc;
if (mode == MEMORY_MODE_TYPED) {
assert(addr_size == LSC_ADDR_SIZE_A32);
assert(!block);
sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
if (lsc_opcode_is_atomic(op)) {
desc = brw_dp_typed_atomic_desc(devinfo, inst->exec_size, inst->group,
lsc_op_to_legacy_atomic(op),
has_dest);
} else {
desc = brw_dp_typed_surface_rw_desc(devinfo, inst->exec_size,
inst->group, components, !has_dest);
}
} else if (addr_size == LSC_ADDR_SIZE_A64) {
assert(binding_type == LSC_ADDR_SURFTYPE_FLAT);
assert(!dword_scattered);
sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
if (lsc_opcode_is_atomic(op)) {
unsigned aop = lsc_op_to_legacy_atomic(op);
if (lsc_opcode_is_atomic_float(op)) {
desc = brw_dp_a64_untyped_atomic_float_desc(devinfo, inst->exec_size,
data_bit_size, aop,
has_dest);
} else {
desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size,
data_bit_size, aop,
has_dest);
}
} else if (block) {
desc = brw_dp_a64_oword_block_rw_desc(devinfo, oword_aligned,
components, !has_dest);
} else if (byte_scattered) {
desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,
data_bit_size, !has_dest);
} else {
desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,
components, !has_dest);
}
} else {
assert(binding_type != LSC_ADDR_SURFTYPE_FLAT);
sfid = surface_access ? HSW_SFID_DATAPORT_DATA_CACHE_1
: GFX7_SFID_DATAPORT_DATA_CACHE;
if (lsc_opcode_is_atomic(op)) {
unsigned aop = lsc_op_to_legacy_atomic(op);
if (lsc_opcode_is_atomic_float(op)) {
desc = brw_dp_untyped_atomic_float_desc(devinfo, inst->exec_size,
aop, has_dest);
} else {
desc = brw_dp_untyped_atomic_desc(devinfo, inst->exec_size,
aop, has_dest);
}
} else if (block) {
desc = brw_dp_oword_block_rw_desc(devinfo, oword_aligned,
components, !has_dest);
} else if (byte_scattered) {
desc = brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
data_bit_size, !has_dest);
} else if (dword_scattered) {
desc = brw_dp_dword_scattered_rw_desc(devinfo, inst->exec_size,
!has_dest);
} else {
desc = brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
components, !has_dest);
}
}
assert(sfid);
/* Update the original instruction. */
inst->opcode = SHADER_OPCODE_SEND;
inst->sfid = sfid;
inst->mlen = mlen;
inst->ex_mlen = ex_mlen;
inst->header_size = header.file != BAD_FILE ? 1 : 0;
inst->send_has_side_effects = has_side_effects;
inst->send_is_volatile = !has_side_effects;
if (block) {
assert(inst->force_writemask_all);
inst->exec_size = components > 8 ? 16 : 8;
}
inst->resize_sources(4);
/* Set up descriptors */
switch (binding_type) {
case LSC_ADDR_SURFTYPE_FLAT:
inst->src[0] = brw_imm_ud(0);
inst->src[1] = brw_imm_ud(0);
break;
case LSC_ADDR_SURFTYPE_BSS:
inst->send_ex_bso = compiler->extended_bindless_surface_offset;
/* fall-through */
case LSC_ADDR_SURFTYPE_SS:
desc |= GFX9_BTI_BINDLESS;
/* We assume that the driver provided the handle in the top 20 bits so
* we can use the surface handle directly as the extended descriptor.
*/
inst->src[0] = brw_imm_ud(0);
inst->src[1] = binding;
break;
case LSC_ADDR_SURFTYPE_BTI:
if (binding.file == IMM) {
desc |= binding.ud & 0xff;
inst->src[0] = brw_imm_ud(0);
inst->src[1] = brw_imm_ud(0);
} else {
brw_reg tmp = ubld1.vgrf(BRW_TYPE_UD);
ubld1.AND(tmp, binding, brw_imm_ud(0xff));
inst->src[0] = component(tmp, 0);
inst->src[1] = brw_imm_ud(0);
}
break;
default:
unreachable("Unknown surface type");
}
inst->desc = desc;
/* Finally, the payloads */
inst->src[2] = payload;
inst->src[3] = payload2;
}
static void