anv: add support for simple internal compute shaders

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Ivan Briano <ivan.briano@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23074>
Lionel Landwerlin 2023-05-16 13:13:41 +03:00
parent dbbcd5c32c
commit 7ca5c84804
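For context, a rough usage sketch of the simple-shader helpers this patch fills in (illustrative only, not part of the patch: the init entry point and the exact dispatch signature are assumed from the hunks below, and internal_cs_bin / num_invocations are made-up names):

   /* Dispatch a small internal compute kernel from a command buffer. */
   struct anv_simple_shader state = {
      .cmd_buffer = cmd_buffer,        /* command buffer being recorded */
      .kernel     = internal_cs_bin,   /* hypothetical internal anv_shader_bin */
   };

   genX(emit_simpler_shader_init_compute)(&state);

   /* Allocate push-constant space and fill it with the kernel's parameters
    * through the CPU mapping (see the allocation hunk below). */
   struct anv_state push = genX(simple_shader_alloc_push)(&state, 16);
   /* ... write 16 bytes of parameters to push.map ... */

   /* One invocation per "thread"; the walkers emitted below round this up
    * to whole SIMD-width thread groups. */
   genX(emit_simple_shader_dispatch)(&state, num_invocations, push);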


@@ -305,7 +305,14 @@ genX(emit_simpler_shader_init_fragment)(struct anv_simple_shader *state)
 static void
 genX(emit_simpler_shader_init_compute)(struct anv_simple_shader *state)
 {
-   unreachable("TODO");
+   assert(state->cmd_buffer->state.current_pipeline == GPGPU);
+
+#if GFX_VERx10 >= 125
+   struct anv_shader_bin *cs_bin = state->kernel;
+   const struct brw_cs_prog_data *prog_data =
+      (const struct brw_cs_prog_data *) cs_bin->prog_data;
+   genX(cmd_buffer_ensure_cfe_state)(state->cmd_buffer, prog_data->base.total_scratch);
+#endif
 }
 
 static void
@@ -328,7 +335,12 @@ genX(simple_shader_alloc_push)(struct anv_simple_shader *state, uint32_t size)
                                                 size,
                                                 ANV_UBO_ALIGNMENT);
    } else {
-      unreachable("TODO");
+#if GFX_VERx10 >= 125
+      return anv_state_stream_alloc(&state->cmd_buffer->general_state_stream,
+                                    size, 64);
+#else
+      return anv_cmd_buffer_alloc_dynamic_state(state->cmd_buffer, size, 64);
+#endif
    }
 }
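
Both branches hand back an anv_state whose map field is a CPU pointer into the allocation, so the caller can write the kernel's parameters directly. A hedged sketch, with an invented parameter layout (fill_params, dst_gpu_addr and size_B are not from the patch):

   /* Hypothetical parameter block for an internal fill kernel. */
   struct fill_params {
      uint64_t dst_addr;
      uint32_t value;
      uint32_t dword_count;
   };

   struct anv_state push =
      genX(simple_shader_alloc_push)(&state, sizeof(struct fill_params));
   struct fill_params *p = push.map;    /* CPU mapping of the allocation */
   p->dst_addr    = dst_gpu_addr;       /* example values */
   p->value       = 0xdeadbeef;
   p->dword_count = size_B / 4;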
@@ -341,7 +353,15 @@ genX(simple_shader_push_state_address)(struct anv_simple_shader *state,
          &state->cmd_buffer->device->dynamic_state_pool,
          push_state);
    } else {
-      unreachable("TODO");
+#if GFX_VERx10 >= 125
+      return anv_state_pool_state_address(
+         &state->cmd_buffer->device->general_state_pool,
+         push_state);
+#else
+      return anv_state_pool_state_address(
+         &state->cmd_buffer->device->dynamic_state_pool,
+         push_state);
+#endif
    }
 }
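
One plausible use of getting a full anv_address back (speculative, not stated by the patch) is to let the command streamer patch a push parameter at execution time instead of the CPU, for example with an MI_STORE_DATA_IMM into the allocation:

   struct anv_state push =
      genX(simple_shader_alloc_push)(&state, 4 /* one dword, illustrative */);
   struct anv_address push_addr =
      genX(simple_shader_push_state_address)(&state, push);

   anv_batch_emit(batch, GENX(MI_STORE_DATA_IMM), sdi) {
      sdi.Address       = push_addr;
      sdi.ImmediateData = 0x1234;   /* example value */
   }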
@@ -438,7 +458,137 @@ genX(emit_simple_shader_dispatch)(struct anv_simple_shader *state,
          prim.InstanceCount = 1;
       }
    } else {
-      unreachable("TODO");
+      const struct intel_device_info *devinfo = device->info;
+      const struct brw_cs_prog_data *prog_data =
+         (const struct brw_cs_prog_data *) state->kernel->prog_data;
+      const struct brw_cs_dispatch_info dispatch =
+         brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
+
+#if GFX_VERx10 >= 125
+      anv_batch_emit(batch, GENX(COMPUTE_WALKER), cw) {
+         cw.SIMDSize = dispatch.simd_size / 16;
+         cw.IndirectDataStartAddress = push_state.offset;
+         cw.IndirectDataLength = push_state.alloc_size;
+         cw.LocalXMaximum = prog_data->local_size[0] - 1;
+         cw.LocalYMaximum = prog_data->local_size[1] - 1;
+         cw.LocalZMaximum = prog_data->local_size[2] - 1;
+         cw.ThreadGroupIDXDimension = DIV_ROUND_UP(num_threads,
+                                                   dispatch.simd_size);
+         cw.ThreadGroupIDYDimension = 1;
+         cw.ThreadGroupIDZDimension = 1;
+         cw.ExecutionMask = dispatch.right_mask;
+         cw.PostSync.MOCS = anv_mocs(device, NULL, 0);
+
+         cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
+            .KernelStartPointer = state->kernel->kernel.offset +
+                                  brw_cs_prog_data_prog_offset(prog_data,
+                                                               dispatch.simd_size),
+            .SamplerStatePointer = 0,
+            .BindingTablePointer = 0,
+            .BindingTableEntryCount = 0,
+            .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
+            .SharedLocalMemorySize = encode_slm_size(GFX_VER,
+                                                     prog_data->base.total_shared),
+            .NumberOfBarriers = prog_data->uses_barrier,
+         };
+      }
+#else
+      const uint32_t vfe_curbe_allocation =
+         ALIGN(prog_data->push.per_thread.regs * dispatch.threads +
+               prog_data->push.cross_thread.regs, 2);
+
+      /* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
+       *
+       *    "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
+       *    the only bits that are changed are scoreboard related: Scoreboard
+       *    Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For
+       *    these scoreboard related states, a MEDIA_STATE_FLUSH is
+       *    sufficient."
+       */
+      genX(emit_apply_pipe_flushes)(batch, device, GPGPU, ANV_PIPE_CS_STALL_BIT,
+                                    &state->cmd_buffer->state.pending_query_bits);
+
+      anv_batch_emit(batch, GENX(MEDIA_VFE_STATE), vfe) {
+         vfe.StackSize = 0;
+         vfe.MaximumNumberofThreads =
+            devinfo->max_cs_threads * devinfo->subslice_total - 1;
+         vfe.NumberofURBEntries = 2;
+#if GFX_VER < 11
+         vfe.ResetGatewayTimer = true;
+#endif
+         vfe.URBEntryAllocationSize = 2;
+         vfe.CURBEAllocationSize = vfe_curbe_allocation;
+
+         if (prog_data->base.total_scratch) {
+            /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
+             * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
+             */
+            vfe.PerThreadScratchSpace =
+               ffs(prog_data->base.total_scratch) - 11;
+            vfe.ScratchSpaceBasePointer =
+               (struct anv_address) {
+                  .bo = anv_scratch_pool_alloc(device,
+                                               &device->scratch_pool,
+                                               MESA_SHADER_COMPUTE,
+                                               prog_data->base.total_scratch),
+                  .offset = 0,
+               };
+         }
+      }
+
+      struct anv_state iface_desc_state =
+         anv_cmd_buffer_alloc_dynamic_state(state->cmd_buffer,
+                                            GENX(INTERFACE_DESCRIPTOR_DATA_length) * 4,
+                                            64);
+      struct GENX(INTERFACE_DESCRIPTOR_DATA) iface_desc = {
+         .KernelStartPointer = state->kernel->kernel.offset +
+                               brw_cs_prog_data_prog_offset(prog_data,
+                                                            dispatch.simd_size),
+         .SamplerCount = 0,
+         .BindingTableEntryCount = 0,
+         .BarrierEnable = prog_data->uses_barrier,
+         .SharedLocalMemorySize = encode_slm_size(GFX_VER,
+                                                  prog_data->base.total_shared),
+         .ConstantURBEntryReadOffset = 0,
+         .ConstantURBEntryReadLength = prog_data->push.per_thread.regs,
+         .CrossThreadConstantDataReadLength = prog_data->push.cross_thread.regs,
+#if GFX_VER >= 12
+         /* TODO: Check if we are missing workarounds and enable mid-thread
+          * preemption.
+          *
+          * We still have issues with mid-thread preemption (it was already
+          * disabled by the kernel on gfx11, due to missing workarounds). It's
+          * possible that we are just missing some workarounds, and could
+          * enable it later, but for now let's disable it to fix a GPU hang in
+          * compute in Car Chase (and possibly more).
+          */
+         .ThreadPreemptionDisable = true,
+#endif
+         .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
+      };
+      GENX(INTERFACE_DESCRIPTOR_DATA_pack)(batch, iface_desc_state.map, &iface_desc);
+
+      anv_batch_emit(batch, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
+         mid.InterfaceDescriptorTotalLength = iface_desc_state.alloc_size;
+         mid.InterfaceDescriptorDataStartAddress = iface_desc_state.offset;
+      }
+
+      anv_batch_emit(batch, GENX(MEDIA_CURBE_LOAD), curbe) {
+         curbe.CURBEDataStartAddress = push_state.offset;
+         curbe.CURBETotalDataLength = push_state.alloc_size;
+      }
+
+      anv_batch_emit(batch, GENX(GPGPU_WALKER), ggw) {
+         ggw.SIMDSize = dispatch.simd_size / 16;
+         ggw.ThreadDepthCounterMaximum = 0;
+         ggw.ThreadHeightCounterMaximum = 0;
+         ggw.ThreadWidthCounterMaximum = dispatch.threads - 1;
+         ggw.ThreadGroupIDXDimension = DIV_ROUND_UP(num_threads,
+                                                    dispatch.simd_size);
+         ggw.ThreadGroupIDYDimension = 1;
+         ggw.ThreadGroupIDZDimension = 1;
+         ggw.RightExecutionMask = dispatch.right_mask;
+         ggw.BottomExecutionMask = 0xffffffff;
+      }
+#endif
    }
 }
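
To make the walker arithmetic above concrete, here is a small self-contained sketch of the group-count and SIMD-size encodings both paths use (illustrative; the macro mirrors the DIV_ROUND_UP used in the diff):

   #include <stdint.h>

   #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

   /* num_threads invocations are laid out along X only, one SIMD-width
    * slice per thread group, so Y and Z stay at 1:
    *   num_threads = 100, SIMD16 -> 7 groups along X
    *   num_threads = 256, SIMD32 -> 8 groups along X
    * The SIMDSize field itself is encoded as simd_size / 16
    * (SIMD8 -> 0, SIMD16 -> 1, SIMD32 -> 2).
    */
   static uint32_t
   thread_group_x(uint32_t num_threads, uint32_t simd_size)
   {
      return DIV_ROUND_UP(num_threads, simd_size);
   }

The same kind of arithmetic applies to the legacy scratch encoding in MEDIA_VFE_STATE: for example, total_scratch = 2048 gives ffs(2048) - 11 = 12 - 11 = 1, the "2k" encoding from the Broadwell comment.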