ac: Add task shader ring information.

Similarly to tessellation rings information, move the task rings info to ac_gpu_info. Signed-off-by: Timur Kristóf <timur.kristof@gmail.com> Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16737>
2025-12-25 04:20:08 +01:00 · 2022-05-31 13:20:23 +02:00 · 2022-05-31 13:20:23 +02:00 · ac5ab8d227
commit ac5ab8d227
parent 086e499b47
6 changed files with 89 additions and 30 deletions
--- a/src/amd/common/ac_gpu_info.c
+++ b/src/amd/common/ac_gpu_info.c
@ -1858,3 +1858,42 @@ void ac_get_hs_info(struct radeon_info *info,
   hs->tess_offchip_ring_offset = align(hs->tess_factor_ring_size, 64 * 1024);
   hs->tess_offchip_ring_size = hs->max_offchip_buffers * hs->tess_offchip_block_dw_size * 4;
 }
+
+static uint16_t get_task_num_entries(enum radeon_family fam)
+{
+   /* Number of task shader ring entries. Needs to be a power of two.
+    * Use a low number on smaller chips so we don't waste space,
+    * but keep it high on bigger chips so it doesn't inhibit parallelism.
+    *
+    * This number is compiled into task/mesh shaders as a constant.
+    * In order to ensure this works fine with the shader cache, we must
+    * base this decision on the chip family, not the number of CUs in
+    * the current GPU. (So, the cache remains consistent for all
+    * chips in the same family.)
+    *
+    * Returns 256 for VANGOGH, NAVI24 and REMBRANDT, and 1024 for every
+    * other family. NAVI21/22/23 are listed explicitly for readability
+    * but share the default branch.
+    */
+   switch (fam) {
+   case CHIP_VANGOGH:
+   case CHIP_NAVI24:
+   case CHIP_REMBRANDT:
+      return 256;
+   case CHIP_NAVI21:
+   case CHIP_NAVI22:
+   case CHIP_NAVI23:
+   default:
+      return 1024;
+   }
+}
+
+void ac_get_task_info(struct radeon_info *info,
+                      struct ac_task_info *task_info)
+{
+   const uint16_t num_entries = get_task_num_entries(info->family);
+   const uint32_t draw_ring_bytes = num_entries * AC_TASK_DRAW_ENTRY_BYTES;
+   const uint32_t payload_ring_bytes = num_entries * AC_TASK_PAYLOAD_ENTRY_BYTES;
+
+   /* Ensure that the addresses of each ring are 256 byte aligned.
+    *
+    * Fills in task_info from the chip family in info:
+    * - the draw ring starts at the first 256 byte boundary after the
+    *   control buffer (the control buffer is taken to start at offset 0),
+    * - the payload ring starts at the first 256 byte boundary after the
+    *   end of the draw ring,
+    * - the BO size is the end of the payload ring; no padding is added
+    *   after it.
+    */
+   task_info->num_entries = num_entries;
+   task_info->draw_ring_offset = ALIGN(AC_TASK_CTRLBUF_BYTES, 256);
+   task_info->payload_ring_offset = ALIGN(task_info->draw_ring_offset + draw_ring_bytes, 256);
+   task_info->bo_size_bytes = task_info->payload_ring_offset + payload_ring_bytes;
+}
--- a/src/amd/common/ac_gpu_info.h
+++ b/src/amd/common/ac_gpu_info.h
@ -274,6 +274,50 @@ struct ac_hs_info {
 void ac_get_hs_info(struct radeon_info *info,
                    struct ac_hs_info *hs);

+/* Task rings BO layout information.
+ * This BO is shared between GFX and ACE queues so that the ACE and GFX
+ * firmware can cooperate on task->mesh dispatches and is also used to
+ * store the task payload which is passed to mesh shaders.
+ *
+ * The driver only needs to create this BO once,
+ * and it will always be able to accommodate the maximum needed
+ * task payload size.
+ *
+ * The following memory layout is used:
+ * 1. Control buffer: 9 DWORDs, 256 byte aligned
+ *    Used by the firmware to maintain the current state.
+ * (padding)
+ * 2. Draw ring: 4 DWORDs per entry, 256 byte aligned
+ *    Task shaders store the mesh dispatch size here.
+ * (padding)
+ * 3. Payload ring: 16K bytes per entry, 256 byte aligned.
+ *    This is where task payload is stored by task shaders and
+ *    read by mesh shaders.
+ *
+ * Fields (filled in by ac_get_task_info):
+ * - draw_ring_offset: byte offset of the draw ring within the BO.
+ * - payload_ring_offset: byte offset of the payload ring within the BO.
+ * - bo_size_bytes: total size of the BO in bytes.
+ * - num_entries: number of entries in the draw ring and in the
+ *   payload ring (both rings have the same entry count).
+ */
+struct ac_task_info {
+   uint32_t draw_ring_offset;
+   uint32_t payload_ring_offset;
+   uint32_t bo_size_bytes;
+   uint16_t num_entries;
+};
+
+/* Size of each payload entry in the task payload ring.
+ * Spec requires minimum 16K bytes.
+ */
+#define AC_TASK_PAYLOAD_ENTRY_BYTES 16384
+
+/* Size of each draw entry in the task draw ring.
+ * 4 DWORDs per entry.
+ */
+#define AC_TASK_DRAW_ENTRY_BYTES 16
+
+/* Size of the task control buffer. 9 DWORDs. */
+#define AC_TASK_CTRLBUF_BYTES 36
+
+void ac_get_task_info(struct radeon_info *info,
+                      struct ac_task_info *task_info);
+
 #ifdef __cplusplus
 }
 #endif
--- a/src/amd/vulkan/radv_constants.h
+++ b/src/amd/vulkan/radv_constants.h
@ -91,11 +91,6 @@
 */
 #define RADV_MAX_MEMORY_ALLOCATION_SIZE 0xFFFFFFFCull

-/* Size of each payload entry in the task payload ring.
- * Spec requires minimum 16K bytes.
- */
-#define RADV_TASK_PAYLOAD_ENTRY_BYTES 16384
-
 /* Number of invocations in each subgroup. */
 #define RADV_SUBGROUP_SIZE 64

--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@ -834,24 +834,7 @@ radv_physical_device_try_create(struct radv_instance *instance, drmDevicePtr drm
      ac_get_gs_table_depth(device->rad_info.gfx_level, device->rad_info.family);

   ac_get_hs_info(&device->rad_info, &device->hs);
-
-   /* Number of task shader ring entries. Needs to be a power of two.
-    * Use a low number on smaller chips so we don't waste space,
-    * but keep it high on bigger chips so it doesn't inhibit parallelism.
-    */
-   switch (device->rad_info.family) {
-   case CHIP_VANGOGH:
-   case CHIP_NAVI24:
-   case CHIP_REMBRANDT:
-      device->task_num_entries = 256;
-      break;
-   case CHIP_NAVI21:
-   case CHIP_NAVI22:
-   case CHIP_NAVI23:
-   default:
-      device->task_num_entries = 1024;
-      break;
-   }
+   ac_get_task_info(&device->rad_info, &device->task_info);

   *device_out = device;

--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@ -332,9 +332,7 @@ struct radv_physical_device {
   uint32_t gs_table_depth;

   struct ac_hs_info hs;
-
-   /* Number of entries in the task shader ring buffers. */
-   uint32_t task_num_entries;
+   struct ac_task_info task_info;
 };

 struct radv_instance {
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@ -1094,12 +1094,12 @@ radv_lower_io_to_mem(struct radv_device *device, struct radv_pipeline_stage *sta
      return true;
   } else if (nir->info.stage == MESA_SHADER_TASK) {
      ac_nir_apply_first_task_to_task_shader(nir);
-      ac_nir_lower_task_outputs_to_mem(nir, RADV_TASK_PAYLOAD_ENTRY_BYTES,
-                                       device->physical_device->task_num_entries);
+      ac_nir_lower_task_outputs_to_mem(nir, AC_TASK_PAYLOAD_ENTRY_BYTES,
+                                       device->physical_device->task_info.num_entries);
      return true;
   } else if (nir->info.stage == MESA_SHADER_MESH) {
-      ac_nir_lower_mesh_inputs_to_mem(nir, RADV_TASK_PAYLOAD_ENTRY_BYTES,
-                                      device->physical_device->task_num_entries);
+      ac_nir_lower_mesh_inputs_to_mem(nir, AC_TASK_PAYLOAD_ENTRY_BYTES,
+                                      device->physical_device->task_info.num_entries);
      return true;
   }