From f78bce1b59cd47741b88a349ebe22050f9d3726b Mon Sep 17 00:00:00 2001
From: Amber <amber@igalia.com>
Date: Mon, 10 Apr 2023 13:42:25 +0200
Subject: [PATCH] turnip: Add support for devices not supporting double thread
 size.

On these devices the actual thread size for compute shaders seems to be
controlled by REG_A6XX_HLSQ_FS_CNTL_0 rather than the CS-related
register.

Signed-off-by: Amber Amber <amber@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20991>
---
 src/freedreno/common/freedreno_dev_info.h |  2 ++
 src/freedreno/common/freedreno_devices.py |  5 +++++
 src/freedreno/vulkan/tu_device.cc         |  7 ++++---
 src/freedreno/vulkan/tu_pipeline.cc       | 21 ++++++++++++++++++---
 4 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/src/freedreno/common/freedreno_dev_info.h b/src/freedreno/common/freedreno_dev_info.h
index ba8cc16d749..121a5e6f78e 100644
--- a/src/freedreno/common/freedreno_dev_info.h
+++ b/src/freedreno/common/freedreno_dev_info.h
@@ -181,6 +181,8 @@ struct fd_dev_info {
 
          uint32_t vs_max_inputs_count;
 
+         bool supports_double_threadsize;
+
          struct {
             uint32_t PC_POWER_CNTL;
             uint32_t TPL1_DBG_ECO_CNTL;
diff --git a/src/freedreno/common/freedreno_devices.py b/src/freedreno/common/freedreno_devices.py
index ae8db20ae8a..1a8a91640e1 100644
--- a/src/freedreno/common/freedreno_devices.py
+++ b/src/freedreno/common/freedreno_devices.py
@@ -249,6 +249,7 @@ a6xx_gen1 = dict(
         concurrent_resolve = False,
         indirect_draw_wfm_quirk = True,
         depth_bounds_require_depth_test_quirk = True,
+        supports_double_threadsize = True,
     )
 
 # a605, a608, a610, 612
@@ -259,6 +260,7 @@ a6xx_gen1_low = {**a6xx_gen1, **dict(
         sysmem_per_ccu_cache_size = 8 * 1024,
         gmem_ccu_color_cache_fraction = CCUColorCacheFraction.HALF.value,
         vs_max_inputs_count = 16,
+        supports_double_threadsize = False,
 )}
 
 # a640, a680:
@@ -272,6 +274,7 @@ a6xx_gen2 = dict(
         depth_bounds_require_depth_test_quirk = True, # TODO: check if true
         has_dp2acc = False, # TODO: check if true
         has_8bpp_ubwc = False,
+        supports_double_threadsize = True,
     )
 
 # a650:
@@ -294,6 +297,7 @@ a6xx_gen3 = dict(
         enable_lrz_fast_clear = True,
         lrz_track_quirk = True,
         has_per_view_viewport = True,
+        supports_double_threadsize = True,
     )
 
 # a635, a660:
@@ -320,6 +324,7 @@ a6xx_gen4 = dict(
         enable_lrz_fast_clear = True,
         has_lrz_dir_tracking = True,
         has_per_view_viewport = True,
+        supports_double_threadsize = True,
     )
 
 add_gpus([
diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc
index 0022f5ec6b0..eebac3234cc 100644
--- a/src/freedreno/vulkan/tu_device.cc
+++ b/src/freedreno/vulkan/tu_device.cc
@@ -853,7 +853,7 @@ tu_get_physical_device_properties_1_1(struct tu_physical_device *pdevice,
    p->deviceNodeMask = 0;
    p->deviceLUIDValid = false;
 
-   p->subgroupSize = 128;
+   p->subgroupSize = pdevice->info->a6xx.supports_double_threadsize ? 128 : 64;
    p->subgroupSupportedStages = VK_SHADER_STAGE_COMPUTE_BIT;
    p->subgroupSupportedOperations = VK_SUBGROUP_FEATURE_BASIC_BIT |
                                     VK_SUBGROUP_FEATURE_VOTE_BIT |
@@ -974,7 +974,8 @@ tu_get_physical_device_properties_1_3(struct tu_physical_device *pdevice,
 {
    /* TODO move threadsize_base and max_waves to fd_dev_info and use them here */
    p->minSubgroupSize = 64; /* threadsize_base */
-   p->maxSubgroupSize = 128; /* threadsize_base * 2 */
+   p->maxSubgroupSize =
+      pdevice->info->a6xx.supports_double_threadsize ? 128 : 64;
    p->maxComputeWorkgroupSubgroups = 16; /* max_waves */
    p->requiredSubgroupSizeStages = VK_SHADER_STAGE_ALL;
 
@@ -1095,7 +1096,7 @@ tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
       .maxFragmentCombinedOutputResources = MAX_RTS + max_descriptor_set_size * 2,
       .maxComputeSharedMemorySize = pdevice->info->cs_shared_mem_size,
       .maxComputeWorkGroupCount = { 65535, 65535, 65535 },
-      .maxComputeWorkGroupInvocations = 2048,
+      .maxComputeWorkGroupInvocations = pdevice->info->a6xx.supports_double_threadsize ? 2048 : 1024,
       .maxComputeWorkGroupSize = { 1024, 1024, 1024 },
       .subPixelPrecisionBits = 8,
       .subTexelPrecisionBits = 8,
diff --git a/src/freedreno/vulkan/tu_pipeline.cc b/src/freedreno/vulkan/tu_pipeline.cc
index cf065b684a9..1b24997e151 100644
--- a/src/freedreno/vulkan/tu_pipeline.cc
+++ b/src/freedreno/vulkan/tu_pipeline.cc
@@ -504,6 +504,8 @@ tu6_emit_xs(struct tu_cs *cs,
       ));
       break;
    case MESA_SHADER_COMPUTE:
+      thrsz = cs->device->physical_device->info->a6xx
+            .supports_double_threadsize ? thrsz : THREAD128;
       tu_cs_emit_regs(cs, A6XX_SP_CS_CTRL_REG0(
                .halfregfootprint = xs->info.max_half_reg + 1,
                .fullregfootprint = xs->info.max_reg + 1,
@@ -702,7 +704,14 @@ tu6_emit_cs_config(struct tu_cs *cs,
    uint32_t work_group_id =
       ir3_find_sysval_regid(v, SYSTEM_VALUE_WORKGROUP_ID);
 
+   /*
+    * Devices that do not support double threadsize take the threadsize from
+    * A6XX_HLSQ_FS_CNTL_0_THREADSIZE instead of A6XX_HLSQ_CS_CNTL_1_THREADSIZE,
+    * so on those devices the latter is always set to THREAD128.
+    */
    enum a6xx_threadsize thrsz = v->info.double_threadsize ? THREAD128 : THREAD64;
+   enum a6xx_threadsize thrsz_cs = cs->device->physical_device->info->a6xx
+      .supports_double_threadsize ? thrsz : THREAD128;
    tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_CNTL_0, 2);
    tu_cs_emit(cs,
               A6XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) |
@@ -710,7 +719,11 @@ tu6_emit_cs_config(struct tu_cs *cs,
               A6XX_HLSQ_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
               A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
    tu_cs_emit(cs, A6XX_HLSQ_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
-                  A6XX_HLSQ_CS_CNTL_1_THREADSIZE(thrsz));
+                  A6XX_HLSQ_CS_CNTL_1_THREADSIZE(thrsz_cs));
+   if (!cs->device->physical_device->info->a6xx.supports_double_threadsize) {
+      tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_FS_CNTL_0, 1);
+      tu_cs_emit(cs, A6XX_HLSQ_FS_CNTL_0_THREADSIZE(thrsz));
+   }
 
    if (cs->device->physical_device->info->a6xx.has_lpac) {
       tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CNTL_0, 2);
@@ -2294,8 +2307,10 @@ tu_shader_key_init(struct tu_shader_key *key,
                    struct tu_device *dev)
 {
    enum ir3_wavesize_option api_wavesize, real_wavesize;
-
-   if (stage_info) {
+   if (!dev->physical_device->info->a6xx.supports_double_threadsize) {
+      api_wavesize = IR3_SINGLE_ONLY;
+      real_wavesize = IR3_SINGLE_ONLY;
+   } else if (stage_info) {
       if (stage_info->flags &
           VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT) {
          api_wavesize = real_wavesize = IR3_SINGLE_OR_DOUBLE;