turnip: Add support for devices not supporting double thread size.

On these devices the actual thread size for compute shaders seems to be controlled by REG_A6XX_HLSQ_FS_CNTL_0 rather than the CS-related register.

Signed-off-by: Amber Amber <amber@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20991>
2026-02-15 04:30:29 +01:00 · 2023-04-10 13:42:25 +02:00 · 2023-04-10 13:42:25 +02:00 · f78bce1b59
commit f78bce1b59
parent 9d2fd39f84
4 changed files with 29 additions and 6 deletions
--- a/src/freedreno/common/freedreno_dev_info.h
+++ b/src/freedreno/common/freedreno_dev_info.h
@@ -181,6 +181,8 @@ struct fd_dev_info {

         uint32_t vs_max_inputs_count;

+         bool supports_double_threadsize;
+
         struct {
            uint32_t PC_POWER_CNTL;
            uint32_t TPL1_DBG_ECO_CNTL;
--- a/src/freedreno/common/freedreno_devices.py
+++ b/src/freedreno/common/freedreno_devices.py
@@ -249,6 +249,7 @@ a6xx_gen1 = dict(
        concurrent_resolve = False,
        indirect_draw_wfm_quirk = True,
        depth_bounds_require_depth_test_quirk = True,
+        supports_double_threadsize = True,
    )

 # a605, a608, a610, 612
@@ -259,6 +260,7 @@ a6xx_gen1_low = {**a6xx_gen1, **dict(
        sysmem_per_ccu_cache_size = 8 * 1024,
        gmem_ccu_color_cache_fraction = CCUColorCacheFraction.HALF.value,
        vs_max_inputs_count = 16,
+        supports_double_threadsize = False,
 )}

 # a640, a680:
@@ -272,6 +274,7 @@ a6xx_gen2 = dict(
        depth_bounds_require_depth_test_quirk = True, # TODO: check if true
        has_dp2acc = False, # TODO: check if true
        has_8bpp_ubwc = False,
+        supports_double_threadsize = True,
    )

 # a650:
@@ -294,6 +297,7 @@ a6xx_gen3 = dict(
        enable_lrz_fast_clear = True,
        lrz_track_quirk = True,
        has_per_view_viewport = True,
+        supports_double_threadsize = True,
    )

 # a635, a660:
@@ -320,6 +324,7 @@ a6xx_gen4 = dict(
        enable_lrz_fast_clear = True,
        has_lrz_dir_tracking = True,
        has_per_view_viewport = True,
+        supports_double_threadsize = True,
    )

 add_gpus([
--- a/src/freedreno/vulkan/tu_device.cc
+++ b/src/freedreno/vulkan/tu_device.cc
@@ -853,7 +853,7 @@ tu_get_physical_device_properties_1_1(struct tu_physical_device *pdevice,
   p->deviceNodeMask = 0;
   p->deviceLUIDValid = false;

-   p->subgroupSize = 128;
+   p->subgroupSize = pdevice->info->a6xx.supports_double_threadsize ? 128 : 64;
   p->subgroupSupportedStages = VK_SHADER_STAGE_COMPUTE_BIT;
   p->subgroupSupportedOperations = VK_SUBGROUP_FEATURE_BASIC_BIT |
                                    VK_SUBGROUP_FEATURE_VOTE_BIT |
@@ -974,7 +974,8 @@ tu_get_physical_device_properties_1_3(struct tu_physical_device *pdevice,
 {
   /* TODO move threadsize_base and max_waves to fd_dev_info and use them here */
   p->minSubgroupSize = 64; /* threadsize_base */
-   p->maxSubgroupSize = 128; /* threadsize_base * 2 */
+   p->maxSubgroupSize =
+      pdevice->info->a6xx.supports_double_threadsize ? 128 : 64;
   p->maxComputeWorkgroupSubgroups = 16; /* max_waves */
   p->requiredSubgroupSizeStages = VK_SHADER_STAGE_ALL;

@@ -1095,7 +1096,7 @@ tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
      .maxFragmentCombinedOutputResources = MAX_RTS + max_descriptor_set_size * 2,
      .maxComputeSharedMemorySize = pdevice->info->cs_shared_mem_size,
      .maxComputeWorkGroupCount = { 65535, 65535, 65535 },
-      .maxComputeWorkGroupInvocations = 2048,
+      .maxComputeWorkGroupInvocations = pdevice->info->a6xx.supports_double_threadsize ? 2048 : 1024,
      .maxComputeWorkGroupSize = { 1024, 1024, 1024 },
      .subPixelPrecisionBits = 8,
      .subTexelPrecisionBits = 8,
--- a/src/freedreno/vulkan/tu_pipeline.cc
+++ b/src/freedreno/vulkan/tu_pipeline.cc
@@ -504,6 +504,8 @@ tu6_emit_xs(struct tu_cs *cs,
      ));
      break;
   case MESA_SHADER_COMPUTE:
+      thrsz = cs->device->physical_device->info->a6xx
+            .supports_double_threadsize ? thrsz : THREAD128;
      tu_cs_emit_regs(cs, A6XX_SP_CS_CTRL_REG0(
               .halfregfootprint = xs->info.max_half_reg + 1,
               .fullregfootprint = xs->info.max_reg + 1,
@@ -702,7 +704,14 @@ tu6_emit_cs_config(struct tu_cs *cs,
   uint32_t work_group_id =
      ir3_find_sysval_regid(v, SYSTEM_VALUE_WORKGROUP_ID);

+   /*
+    * Devices that do not support double threadsize take the threadsize from
+    * A6XX_HLSQ_FS_CNTL_0_THREADSIZE instead of A6XX_HLSQ_CS_CNTL_1_THREADSIZE;
+    * on those devices the latter is always set to THREAD128.
+    */
   enum a6xx_threadsize thrsz = v->info.double_threadsize ? THREAD128 : THREAD64;
+   enum a6xx_threadsize thrsz_cs = cs->device->physical_device->info->a6xx
+      .supports_double_threadsize ? thrsz : THREAD128;
   tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_CNTL_0, 2);
   tu_cs_emit(cs,
              A6XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) |
@@ -710,7 +719,11 @@ tu6_emit_cs_config(struct tu_cs *cs,
              A6XX_HLSQ_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
              A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
   tu_cs_emit(cs, A6XX_HLSQ_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
-                  A6XX_HLSQ_CS_CNTL_1_THREADSIZE(thrsz));
+                  A6XX_HLSQ_CS_CNTL_1_THREADSIZE(thrsz_cs));
+   if (!cs->device->physical_device->info->a6xx.supports_double_threadsize) {
+      tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_FS_CNTL_0, 1);
+      tu_cs_emit(cs, A6XX_HLSQ_FS_CNTL_0_THREADSIZE(thrsz));
+   }

   if (cs->device->physical_device->info->a6xx.has_lpac) {
      tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CNTL_0, 2);
@@ -2294,8 +2307,10 @@ tu_shader_key_init(struct tu_shader_key *key,
                   struct tu_device *dev)
 {
   enum ir3_wavesize_option api_wavesize, real_wavesize;
-
-   if (stage_info) {
+   if (!dev->physical_device->info->a6xx.supports_double_threadsize) {
+      api_wavesize = IR3_SINGLE_ONLY;
+      real_wavesize = IR3_SINGLE_ONLY;
+   } else if (stage_info) {
      if (stage_info->flags &
          VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT) {
         api_wavesize = real_wavesize = IR3_SINGLE_OR_DOUBLE;