turnip: Add support for compute shaders.

Since compute shares the FS state with graphics, we have to re-upload the
pipeline state when switching between compute dispatch and graphics draws.
We could potentially expose graphics and compute as separate queues and
then we wouldn't need pipeline state management, but the closed driver
exposes a single queue and consistency with them is probably good.

So far I'm emitting texture/ibo state as IBs that we jump to.  This is
kind of silly when we could just emit it directly in our CS, but that's a
refactor we can do later.

Reviewed-by: Jonathan Marek <jonathan@marek.ca>
This commit is contained in:
Eric Anholt 2019-11-26 20:37:19 -08:00
parent ccf8230547
commit c3efeac4c6
3 changed files with 332 additions and 28 deletions

View file

@ -1809,7 +1809,8 @@ tu_CmdBindPipeline(VkCommandBuffer commandBuffer,
cmd->state.dirty |= TU_CMD_DIRTY_PIPELINE;
break;
case VK_PIPELINE_BIND_POINT_COMPUTE:
tu_finishme("binding compute pipeline");
cmd->state.compute_pipeline = pipeline;
cmd->state.dirty |= TU_CMD_DIRTY_COMPUTE_PIPELINE;
break;
default:
unreachable("unrecognized pipeline bind point");
@ -2557,13 +2558,17 @@ tu6_emit_ibo(struct tu_device *device, struct tu_cs *draw_state,
/* emit texture state: */
tu_cs_emit_pkt7(&cs, CP_LOAD_STATE6, 3);
tu_cs_emit(&cs, CP_LOAD_STATE6_0_DST_OFF(0) |
CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
CP_LOAD_STATE6_0_STATE_TYPE(type == MESA_SHADER_COMPUTE ?
ST6_IBO : ST6_SHADER) |
CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
CP_LOAD_STATE6_0_STATE_BLOCK(SB6_IBO) |
CP_LOAD_STATE6_0_STATE_BLOCK(type == MESA_SHADER_COMPUTE ?
SB6_CS_SHADER : SB6_IBO) |
CP_LOAD_STATE6_0_NUM_UNIT(link->image_mapping.num_ibo));
tu_cs_emit_qw(&cs, ibo_addr); /* SRC_ADDR_LO/HI */
tu_cs_emit_pkt4(&cs, REG_A6XX_SP_IBO_LO, 2);
tu_cs_emit_pkt4(&cs,
type == MESA_SHADER_COMPUTE ?
REG_A6XX_SP_IBO_LO : REG_A6XX_SP_CS_IBO_LO, 2);
tu_cs_emit_qw(&cs, ibo_addr); /* SRC_ADDR_LO/HI */
return tu_cs_end_sub_stream(draw_state, &cs);
@ -2806,7 +2811,11 @@ tu6_bind_draw_states(struct tu_cmd_buffer *cmd,
}
}
}
cmd->state.dirty = 0;
/* Fragment shader state overwrites compute shader state, so flag the
* compute pipeline for re-emit.
*/
cmd->state.dirty = TU_CMD_DIRTY_COMPUTE_PIPELINE;
}
static void
@ -2989,9 +2998,156 @@ struct tu_dispatch_info
};
static void
/* NOTE(review): the next line is removed-line residue from this diff view
 * (the old tu_dispatch() signature); the definition that follows it is
 * tu_emit_compute_driver_params(). */
tu_dispatch(struct tu_cmd_buffer *cmd_buffer,
/* Uploads the compute "driver params" (workgroup counts and local size) into
 * the CS const file, if the compiled shader reserved const space for them. */
tu_emit_compute_driver_params(struct tu_cs *cs, struct tu_pipeline *pipeline,
                              const struct tu_dispatch_info *info)
{
   gl_shader_stage type = MESA_SHADER_COMPUTE;
   const struct tu_program_descriptor_linkage *link =
      &pipeline->program.link[type];
   const struct ir3_const_state *const_state = &link->const_state;
   /* Compiler-chosen dword offset of the driver params in the const file. */
   uint32_t offset_dwords = const_state->offsets.driver_param;

   /* If the shader doesn't read the driver params, the linker trims constlen
    * below their offset and there is nothing to upload. */
   if (link->constlen <= offset_dwords)
      return;

   if (!info->indirect) {
      uint32_t driver_params[] = {
         info->blocks[0],
         info->blocks[1],
         info->blocks[2],
         pipeline->compute.local_size[0],
         pipeline->compute.local_size[1],
         pipeline->compute.local_size[2],
      };

      /* Clamp to the const space the shader actually has allocated. */
      uint32_t num_consts = MIN2(const_state->num_driver_params,
                                 link->constlen - offset_dwords);
      /* CP_LOAD_STATE6 counts in vec4 units, so pad up to 4 dwords. */
      uint32_t align_size = align(num_consts, 4);

      /* push constants */
      tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + align_size);
      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset_dwords / 4) |
                 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
                 CP_LOAD_STATE6_0_NUM_UNIT(align_size / 4));
      tu_cs_emit(cs, 0);
      tu_cs_emit(cs, 0);
      uint32_t i;
      for (i = 0; i < num_consts; i++)
         tu_cs_emit(cs, driver_params[i]);
      /* Zero-pad out to the vec4 boundary declared in NUM_UNIT. */
      for (; i < align_size; i++)
         tu_cs_emit(cs, 0);
   } else {
      tu_finishme("Indirect driver params");
   }
}
/* Records one compute dispatch (direct or indirect) into the command
 * buffer's CS: emits any dirty compute state, then the NDRANGE registers and
 * the CP_EXEC_CS packet. */
static void
tu_dispatch(struct tu_cmd_buffer *cmd,
            const struct tu_dispatch_info *info)
{
   struct tu_cs *cs = &cmd->cs;
   struct tu_pipeline *pipeline = cmd->state.compute_pipeline;
   struct tu_descriptor_state *descriptors_state =
      &cmd->descriptors[VK_PIPELINE_BIND_POINT_COMPUTE];

   /* Reserve worst-case space up front so the emits below cannot fail. */
   VkResult result = tu_cs_reserve_space(cmd->device, cs, 256);
   if (result != VK_SUCCESS) {
      cmd->record_result = result;
      return;
   }

   /* Re-emit compute pipeline state if a graphics draw clobbered it (FS and
    * CS share state on this HW, per the commit message). */
   if (cmd->state.dirty & TU_CMD_DIRTY_COMPUTE_PIPELINE)
      tu_cs_emit_ib(cs, &pipeline->program.state_ib);

   struct tu_cs_entry ib;

   ib = tu6_emit_consts(cmd, pipeline, descriptors_state, MESA_SHADER_COMPUTE);
   if (ib.size)
      tu_cs_emit_ib(cs, &ib);

   tu_emit_compute_driver_params(cs, pipeline, info);

   bool needs_border;
   ib = tu6_emit_textures(cmd->device, &cmd->draw_state, pipeline,
                          descriptors_state, MESA_SHADER_COMPUTE,
                          &needs_border);
   if (ib.size)
      tu_cs_emit_ib(cs, &ib);

   if (needs_border)
      tu6_emit_border_color(cmd, cs);

   ib = tu6_emit_ibo(cmd->device, &cmd->draw_state, pipeline,
                     descriptors_state, MESA_SHADER_COMPUTE);
   if (ib.size)
      tu_cs_emit_ib(cs, &ib);

   /* track BOs so the kernel keeps descriptor-referenced buffers resident */
   if (cmd->state.dirty & TU_CMD_DIRTY_DESCRIPTOR_SETS) {
      unsigned i;
      for_each_bit(i, descriptors_state->valid) {
         struct tu_descriptor_set *set = descriptors_state->sets[i];
         for (unsigned j = 0; j < set->layout->buffer_count; ++j)
            if (set->descriptors[j]) {
               tu_bo_list_add(&cmd->bo_list, set->descriptors[j],
                              MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);
            }
      }
   }

   /* Compute shader state overwrites fragment shader state, so we flag the
    * graphics pipeline for re-emit.
    */
   cmd->state.dirty = TU_CMD_DIRTY_PIPELINE;

   tu_cs_emit_pkt7(cs, CP_SET_MARKER, 1);
   /* 0x8: presumably the compute marker mode (RM6_COMPUTE in freedreno) —
    * confirm against adreno_pm4.xml. */
   tu_cs_emit(cs, A6XX_CP_SET_MARKER_0_MODE(0x8));

   const uint32_t *local_size = pipeline->compute.local_size;
   const uint32_t *num_groups = info->blocks;
   tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_NDRANGE_0, 7);
   /* LOCALSIZE fields are encoded minus one. */
   tu_cs_emit(cs,
              A6XX_HLSQ_CS_NDRANGE_0_KERNELDIM(3) |
              A6XX_HLSQ_CS_NDRANGE_0_LOCALSIZEX(local_size[0] - 1) |
              A6XX_HLSQ_CS_NDRANGE_0_LOCALSIZEY(local_size[1] - 1) |
              A6XX_HLSQ_CS_NDRANGE_0_LOCALSIZEZ(local_size[2] - 1));
   /* GLOBALSIZE = workgroup size * workgroup count, per axis. */
   tu_cs_emit(cs, A6XX_HLSQ_CS_NDRANGE_1_GLOBALSIZE_X(local_size[0] * num_groups[0]));
   tu_cs_emit(cs, 0);            /* HLSQ_CS_NDRANGE_2_GLOBALOFF_X */
   tu_cs_emit(cs, A6XX_HLSQ_CS_NDRANGE_3_GLOBALSIZE_Y(local_size[1] * num_groups[1]));
   tu_cs_emit(cs, 0);            /* HLSQ_CS_NDRANGE_4_GLOBALOFF_Y */
   tu_cs_emit(cs, A6XX_HLSQ_CS_NDRANGE_5_GLOBALSIZE_Z(local_size[2] * num_groups[2]));
   tu_cs_emit(cs, 0);            /* HLSQ_CS_NDRANGE_6_GLOBALOFF_Z */

   tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_KERNEL_GROUP_X, 3);
   tu_cs_emit(cs, 1);            /* HLSQ_CS_KERNEL_GROUP_X */
   tu_cs_emit(cs, 1);            /* HLSQ_CS_KERNEL_GROUP_Y */
   tu_cs_emit(cs, 1);            /* HLSQ_CS_KERNEL_GROUP_Z */

   if (info->indirect) {
      uint64_t iova = tu_buffer_iova(info->indirect) + info->indirect_offset;

      /* NOTE(review): BO_WRITE for a dispatch-indirect source buffer looks
       * broader than needed — BO_READ alone may suffice; verify against the
       * msm submit interface. */
      tu_bo_list_add(&cmd->bo_list, info->indirect->bo,
                     MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE);

      tu_cs_emit_pkt7(cs, CP_EXEC_CS_INDIRECT, 4);
      tu_cs_emit(cs, 0x00000000);
      tu_cs_emit_qw(cs, iova);
      /* A5XX-named fields reused here; local sizes again encoded minus one. */
      tu_cs_emit(cs,
                 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEX(local_size[0] - 1) |
                 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEY(local_size[1] - 1) |
                 A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEZ(local_size[2] - 1));
   } else {
      tu_cs_emit_pkt7(cs, CP_EXEC_CS, 4);
      tu_cs_emit(cs, 0x00000000);
      tu_cs_emit(cs, CP_EXEC_CS_1_NGROUPS_X(info->blocks[0]));
      tu_cs_emit(cs, CP_EXEC_CS_2_NGROUPS_Y(info->blocks[1]));
      tu_cs_emit(cs, CP_EXEC_CS_3_NGROUPS_Z(info->blocks[2]));
   }

   tu_cs_emit_wfi(cs);

   tu6_emit_cache_flush(cmd, cs);
}
void

View file

@ -476,6 +476,52 @@ tu6_emit_fs_config(struct tu_cs *cs, const struct ir3_shader_variant *fs)
tu_cs_emit(cs, fs->image_mapping.num_ibo);
}
/* Emits the per-variant compute shader configuration registers (constlen,
 * texture/sampler/IBO counts, thread sizing, and system-value regids). */
static void
tu6_emit_cs_config(struct tu_cs *cs, const struct ir3_shader_variant *v)
{
   tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_UPDATE_CNTL, 1);
   tu_cs_emit(cs, 0xff);

   /* HLSQ constlen is in vec4 units, so round the dword count up. */
   unsigned constlen = align(v->constlen, 4);
   tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_CNTL, 1);
   tu_cs_emit(cs, A6XX_HLSQ_CS_CNTL_CONSTLEN(constlen) |
              A6XX_HLSQ_CS_CNTL_ENABLED);

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CONFIG, 2);
   /* Fixed: the original OR'd A6XX_SP_CS_CONFIG_NIBO(...) in twice; OR is
    * idempotent so the value was unchanged, but drop the duplicate term. */
   tu_cs_emit(cs, A6XX_SP_CS_CONFIG_ENABLED |
              A6XX_SP_CS_CONFIG_NIBO(v->image_mapping.num_ibo) |
              A6XX_SP_CS_CONFIG_NTEX(v->num_samp) |
              A6XX_SP_CS_CONFIG_NSAMP(v->num_samp));
   tu_cs_emit(cs, v->instrlen);

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CTRL_REG0, 1);
   tu_cs_emit(cs, A6XX_SP_CS_CTRL_REG0_THREADSIZE(FOUR_QUADS) |
              A6XX_SP_CS_CTRL_REG0_FULLREGFOOTPRINT(v->info.max_reg + 1) |
              A6XX_SP_CS_CTRL_REG0_MERGEDREGS |
              A6XX_SP_CS_CTRL_REG0_BRANCHSTACK(v->branchstack) |
              COND(v->need_pixlod, A6XX_SP_CS_CTRL_REG0_PIXLODENABLE));

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1);
   tu_cs_emit(cs, 0x41);

   /* Wire the workgroup-id and local-invocation-id system values to the
    * registers the compiler assigned them (r63.x if unused). */
   uint32_t local_invocation_id =
      ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID);
   uint32_t work_group_id =
      ir3_find_sysval_regid(v, SYSTEM_VALUE_WORK_GROUP_ID);

   tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_CNTL_0, 2);
   tu_cs_emit(cs,
              A6XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) |
              A6XX_HLSQ_CS_CNTL_0_UNK0(regid(63, 0)) |
              A6XX_HLSQ_CS_CNTL_0_UNK1(regid(63, 0)) |
              A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
   tu_cs_emit(cs, 0x2fc);        /* HLSQ_CS_UNKNOWN_B998 */

   tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_IBO_COUNT, 1);
   tu_cs_emit(cs, v->image_mapping.num_ibo);
}
static void
tu6_emit_vs_system_values(struct tu_cs *cs,
const struct ir3_shader_variant *vs)
@ -1441,13 +1487,12 @@ tu6_emit_blend_constants(struct tu_cs *cs, const float constants[4])
}
static VkResult
tu_pipeline_builder_create_pipeline(struct tu_pipeline_builder *builder,
struct tu_pipeline **out_pipeline)
tu_pipeline_create(struct tu_device *dev,
const VkAllocationCallbacks *pAllocator,
struct tu_pipeline **out_pipeline)
{
struct tu_device *dev = builder->device;
struct tu_pipeline *pipeline =
vk_zalloc2(&dev->alloc, builder->alloc, sizeof(*pipeline), 8,
vk_zalloc2(&dev->alloc, pAllocator, sizeof(*pipeline), 8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (!pipeline)
return VK_ERROR_OUT_OF_HOST_MEMORY;
@ -1457,7 +1502,7 @@ tu_pipeline_builder_create_pipeline(struct tu_pipeline_builder *builder,
/* reserve the space now such that tu_cs_begin_sub_stream never fails */
VkResult result = tu_cs_reserve_space(dev, &pipeline->cs, 2048);
if (result != VK_SUCCESS) {
vk_free2(&dev->alloc, builder->alloc, pipeline);
vk_free2(&dev->alloc, pAllocator, pipeline);
return result;
}
@ -1813,7 +1858,8 @@ static VkResult
tu_pipeline_builder_build(struct tu_pipeline_builder *builder,
struct tu_pipeline **pipeline)
{
VkResult result = tu_pipeline_builder_create_pipeline(builder, pipeline);
VkResult result = tu_pipeline_create(builder->device, builder->alloc,
pipeline);
if (result != VK_SUCCESS)
return result;
@ -1949,38 +1995,133 @@ tu_CreateGraphicsPipelines(VkDevice device,
return final_result;
}
/* Emits the complete compute program state: shader config registers, the
 * shader object itself, and its immediate constants. */
static void
tu6_emit_compute_program(struct tu_cs *cs,
                         struct tu_shader *shader,
                         const struct tu_bo *binary_bo)
{
   const struct ir3_shader_variant *variant = &shader->variants[0];

   tu6_emit_cs_config(cs, variant);

   /* Offset 0: a compute pipeline contains exactly one shader, so its
    * binary sits at the start of binary_bo. */
   tu6_emit_shader_object(cs, MESA_SHADER_COMPUTE, variant, binary_bo, 0);

   tu6_emit_immediates(cs, variant, CP_LOAD_STATE6_FRAG, SB6_CS_SHADER);
}
static VkResult
/* NOTE(review): the next line is removed-line residue from this diff view
 * (the old tu_compute_pipeline_create() signature); the definition that
 * follows it is tu_compute_upload_shader(). */
tu_compute_pipeline_create(VkDevice _device,
/* Allocates a GPU BO sized for the compiled compute shader, maps it, and
 * copies the binary in.  The BO lives in pipeline->program.binary_bo and is
 * owned by the pipeline thereafter. */
tu_compute_upload_shader(VkDevice device,
                         struct tu_pipeline *pipeline,
                         struct tu_shader *shader)
{
   TU_FROM_HANDLE(tu_device, dev, device);
   struct tu_bo *bo = &pipeline->program.binary_bo;
   struct ir3_shader_variant *v = &shader->variants[0];

   uint32_t shader_size = sizeof(uint32_t) * v->info.sizedwords;

   VkResult result =
      tu_bo_init_new(dev, bo, shader_size);
   if (result != VK_SUCCESS)
      return result;

   /* On map failure the BO is left allocated; presumably the caller's
    * pipeline teardown frees it — verify tu_pipeline_finish() does so. */
   result = tu_bo_map(dev, bo);
   if (result != VK_SUCCESS)
      return result;

   memcpy(bo->map, shader->binary, shader_size);

   return VK_SUCCESS;
}
/* Creates a VkPipeline wrapping a single compute shader: compiles the stage,
 * snapshots its descriptor linkage, uploads the binary, and records the
 * program state IB. */
static VkResult
tu_compute_pipeline_create(VkDevice device,
                           VkPipelineCache _cache,
                           const VkComputePipelineCreateInfo *pCreateInfo,
                           const VkAllocationCallbacks *pAllocator,
                           VkPipeline *pPipeline)
{
   TU_FROM_HANDLE(tu_device, dev, device);
   const VkPipelineShaderStageCreateInfo *stage_info = &pCreateInfo->stage;
   VkResult result;

   struct tu_pipeline *pipeline;

   result = tu_pipeline_create(dev, pAllocator, &pipeline);
   if (result != VK_SUCCESS)
      return result;

   struct tu_shader_compile_options options;
   tu_shader_compile_options_init(&options, NULL);

   struct tu_shader *shader =
      tu_shader_create(dev, MESA_SHADER_COMPUTE, stage_info, pAllocator);
   if (!shader) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto fail;
   }

   result = tu_shader_compile(dev, shader, NULL, &options, pAllocator);
   if (result != VK_SUCCESS)
      goto fail;  /* was "return result;", which leaked shader + pipeline */

   /* Snapshot the linkage info that command-stream emission needs at draw
    * time, so it doesn't depend on the tu_shader staying alive. */
   struct tu_program_descriptor_linkage *link = &pipeline->program.link[MESA_SHADER_COMPUTE];
   struct ir3_shader_variant *v = &shader->variants[0];

   link->ubo_state = v->shader->ubo_state;
   link->const_state = v->shader->const_state;
   link->constlen = v->constlen;
   link->texture_map = shader->texture_map;
   link->sampler_map = shader->sampler_map;
   link->ubo_map = shader->ubo_map;
   link->ssbo_map = shader->ssbo_map;
   link->image_mapping = v->image_mapping;

   result = tu_compute_upload_shader(device, pipeline, shader);
   if (result != VK_SUCCESS)
      goto fail;  /* was "return result;", which leaked shader + pipeline */

   for (int i = 0; i < 3; i++)
      pipeline->compute.local_size[i] = v->shader->nir->info.cs.local_size[i];

   struct tu_cs prog_cs;
   tu_cs_begin_sub_stream(dev, &pipeline->cs, 512, &prog_cs);
   tu6_emit_compute_program(&prog_cs, shader, &pipeline->program.binary_bo);
   pipeline->program.state_ib = tu_cs_end_sub_stream(&pipeline->cs, &prog_cs);

   /* NOTE(review): shader is never destroyed on the success path; if nothing
    * in the pipeline references it after emission (the binary and linkage
    * have been copied out above), it leaks — confirm and free it here. */
   *pPipeline = tu_pipeline_to_handle(pipeline);
   return VK_SUCCESS;

fail:
   /* shader is NULL if tu_shader_create() itself failed. */
   if (shader)
      tu_shader_destroy(dev, shader, pAllocator);

   tu_pipeline_finish(pipeline, dev, pAllocator);
   vk_free2(&dev->alloc, pAllocator, pipeline);

   return result;
}
/* NOTE(review): this diff view interleaves the removed (old) and added (new)
 * versions of tu_CreateComputePipelines without +/- markers; the lines are
 * annotated below but left byte-identical. */
VkResult
/* old signature (removed): */
tu_CreateComputePipelines(VkDevice _device,
/* new signature (added): */
tu_CreateComputePipelines(VkDevice device,
                          VkPipelineCache pipelineCache,
                          uint32_t count,
                          const VkComputePipelineCreateInfo *pCreateInfos,
                          const VkAllocationCallbacks *pAllocator,
                          VkPipeline *pPipelines)
{
   /* Old body (removed): stub that recorded any failure and nulled the
    * handle (surrounding context lost in this diff view). */
   VkResult result = VK_SUCCESS;
   /* New body (added): remembers the last failure across all creations. */
   VkResult final_result = VK_SUCCESS;
   unsigned i = 0;
   for (; i < count; i++) {
      VkResult r;
      r = tu_compute_pipeline_create(_device, pipelineCache, &pCreateInfos[i],
                                     pAllocator, &pPipelines[i]);
      if (r != VK_SUCCESS) {
         result = r;
      }
      pPipelines[i] = VK_NULL_HANDLE;
   /* New loop (added): creates every pipeline even after a failure, per the
    * Vulkan contract for batched pipeline creation. */
   for (uint32_t i = 0; i < count; i++) {
      VkResult result = tu_compute_pipeline_create(device, pipelineCache,
                                                   &pCreateInfos[i],
                                                   pAllocator, &pPipelines[i]);
      if (result != VK_SUCCESS)
         final_result = result;
   }
   /* old return (removed): */
   return result;
   /* new return (added): */
   return final_result;
}
void

View file

@ -825,8 +825,9 @@ struct tu_tiling_config
enum tu_cmd_dirty_bits
{
TU_CMD_DIRTY_PIPELINE = 1 << 0,
TU_CMD_DIRTY_VERTEX_BUFFERS = 1 << 1,
TU_CMD_DIRTY_DESCRIPTOR_SETS = 1 << 2,
TU_CMD_DIRTY_COMPUTE_PIPELINE = 1 << 1,
TU_CMD_DIRTY_VERTEX_BUFFERS = 1 << 2,
TU_CMD_DIRTY_DESCRIPTOR_SETS = 1 << 3,
TU_CMD_DIRTY_DYNAMIC_LINE_WIDTH = 1 << 16,
TU_CMD_DIRTY_DYNAMIC_STENCIL_COMPARE_MASK = 1 << 17,
@ -839,6 +840,7 @@ struct tu_cmd_state
uint32_t dirty;
struct tu_pipeline *pipeline;
struct tu_pipeline *compute_pipeline;
/* Vertex buffers */
struct
@ -1167,6 +1169,11 @@ struct tu_pipeline
{
struct tu_cs_entry state_ib;
} blend;
struct
{
uint32_t local_size[3];
} compute;
};
void