ir3, turnip: Support VK_*_compute_shader_derivatives on a7xx

Quad derivative groups are supported since a7xx using the tiling mode
bit. Linear derivative groups may also work on a6xx but I haven't tested
it yet.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31130>
This commit is contained in:
Connor Abbott 2024-09-11 09:51:36 -04:00 committed by Marge Bot
parent 624d83bfd1
commit 022fb8e4c7
3 changed files with 58 additions and 4 deletions

View file

@@ -510,7 +510,7 @@ Khronos extensions that are not part of any Vulkan version:
VK_KHR_acceleration_structure DONE (anv/gfx12.5+, lvp, radv/gfx10.3+)
VK_KHR_android_surface not started
VK_KHR_calibrated_timestamps DONE (anv, nvk, radv, tu/a750+)
VK_KHR_compute_shader_derivatives DONE (anv, nvk, radv)
VK_KHR_compute_shader_derivatives DONE (anv, nvk, radv, tu/a7xx+)
VK_KHR_cooperative_matrix DONE (anv, radv/gfx11+)
VK_KHR_deferred_host_operations DONE (anv, hasvk, lvp, radv)
VK_KHR_display DONE (anv, nvk, pvr, radv, tu, v3dv)

View file

@@ -602,16 +602,58 @@ lower_subgroup_id(nir_builder *b, nir_instr *instr, void *_shader)
* LocalInvocationIndex here. This means that whenever we do this lowering we
* have to force linear dispatch to make sure that the relation between
* SubgroupId/SubgroupLocalInvocationId and LocalInvocationIndex is what we
* expect.
* expect, unless the shader forces us to do the quad layout in which case we
* have to use the tiled layout.
*/
shader->cs.force_linear_dispatch = true;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
if (intr->intrinsic == nir_intrinsic_load_subgroup_id &&
shader->nir->info.derivative_group == DERIVATIVE_GROUP_QUADS) {
/* We have to manually figure out which subgroup we're in using the
* tiling. The tiling is 4x4, unless one of the dimensions is not a
* multiple of 4 in which case it drops to 2.
*/
nir_def *local_size = nir_load_workgroup_size(b);
nir_def *local_size_x = nir_channel(b, local_size, 0);
nir_def *local_size_y = nir_channel(b, local_size, 1);
/* Calculate the shift from invocation to tile index for x and y */
nir_def *x_shift = nir_bcsel(b,
nir_ieq_imm(b,
nir_iand_imm(b, local_size_x, 3),
0),
nir_imm_int(b, 2), nir_imm_int(b, 1));
nir_def *y_shift = nir_bcsel(b,
nir_ieq_imm(b,
nir_iand_imm(b, local_size_y, 3),
0),
nir_imm_int(b, 2), nir_imm_int(b, 1));
nir_def *id = nir_load_local_invocation_id(b);
nir_def *id_x = nir_channel(b, id, 0);
nir_def *id_y = nir_channel(b, id, 1);
/* Calculate which tile we're in */
nir_def *tile_id =
nir_iadd(b, nir_imul24(b, nir_ishr(b, id_y, y_shift),
nir_ishr(b, local_size_x, x_shift)),
nir_ishr(b, id_x, x_shift));
/* Finally calculate the subgroup id */
return nir_ishr(b, tile_id, nir_isub(b,
nir_load_subgroup_id_shift_ir3(b),
nir_iadd(b, x_shift, y_shift)));
}
/* Just use getfiberid if we have to use tiling */
if (intr->intrinsic == nir_intrinsic_load_subgroup_invocation &&
shader->nir->info.derivative_group == DERIVATIVE_GROUP_QUADS) {
return NULL;
}
if (intr->intrinsic == nir_intrinsic_load_subgroup_invocation) {
shader->cs.force_linear_dispatch = true;
return nir_iand(
b, nir_load_local_invocation_index(b),
nir_iadd_imm(b, nir_load_subgroup_size(b), -1));
} else if (intr->intrinsic == nir_intrinsic_load_subgroup_id) {
shader->cs.force_linear_dispatch = true;
return nir_ishr(b, nir_load_local_invocation_index(b),
nir_load_subgroup_id_shift_ir3(b));
} else {
@@ -771,6 +813,9 @@ ir3_nir_post_finalize(struct ir3_shader *shader)
bool progress = false;
NIR_PASS(progress, s, ir3_nir_lower_subgroup_id_cs, shader);
if (s->info.derivative_group == DERIVATIVE_GROUP_LINEAR)
shader->cs.force_linear_dispatch = true;
/* ir3_nir_lower_subgroup_id_cs creates extra compute intrinsics which
* we need to lower again.
*/

View file

@@ -148,6 +148,7 @@ get_device_extensions(const struct tu_physical_device *device,
.KHR_bind_memory2 = true,
.KHR_buffer_device_address = true,
.KHR_calibrated_timestamps = device->info->a7xx.has_persistent_counter,
.KHR_compute_shader_derivatives = device->info->chip >= 7,
.KHR_copy_commands2 = true,
.KHR_create_renderpass2 = true,
.KHR_dedicated_allocation = true,
@@ -308,6 +309,7 @@ get_device_extensions(const struct tu_physical_device *device,
.GOOGLE_hlsl_functionality1 = true,
.GOOGLE_user_type = true,
.IMG_filter_cubic = device->info->a6xx.has_tex_filter_cubic,
.NV_compute_shader_derivatives = device->info->chip >= 7,
.VALVE_mutable_descriptor_type = true,
} };
@@ -455,6 +457,10 @@ tu_get_features(struct tu_physical_device *pdevice,
features->shaderIntegerDotProduct = true;
features->maintenance4 = true;
/* VK_KHR_compute_shader_derivatives */
features->computeDerivativeGroupQuads = pdevice->info->chip >= 7;
features->computeDerivativeGroupLinear = pdevice->info->chip >= 7;
/* VK_KHR_index_type_uint8 */
features->indexTypeUint8 = true;
@@ -1023,6 +1029,9 @@ tu_get_properties(struct tu_physical_device *pdevice,
tu_get_physical_device_properties_1_2(pdevice, props);
tu_get_physical_device_properties_1_3(pdevice, props);
/* VK_KHR_compute_shader_derivatives */
props->meshAndTaskShaderDerivatives = false;
/* VK_KHR_push_descriptor */
props->maxPushDescriptors = MAX_PUSH_DESCRIPTORS;