From f78bce1b59cd47741b88a349ebe22050f9d3726b Mon Sep 17 00:00:00 2001 From: Amber Date: Mon, 10 Apr 2023 13:42:25 +0200 Subject: [PATCH] turnip: Add support for devices not supporting double thread size. On these devices the actual thread size for compute shaders seems to be controlled by REG_A6XX_HLSQ_FS_CNTL_0 rather than the CS-related register. Signed-off-by: Amber Amber Part-of: --- src/freedreno/common/freedreno_dev_info.h | 2 ++ src/freedreno/common/freedreno_devices.py | 5 +++++ src/freedreno/vulkan/tu_device.cc | 7 ++++--- src/freedreno/vulkan/tu_pipeline.cc | 21 ++++++++++++++++++--- 4 files changed, 29 insertions(+), 6 deletions(-) diff --git a/src/freedreno/common/freedreno_dev_info.h b/src/freedreno/common/freedreno_dev_info.h index ba8cc16d749..121a5e6f78e 100644 --- a/src/freedreno/common/freedreno_dev_info.h +++ b/src/freedreno/common/freedreno_dev_info.h @@ -181,6 +181,8 @@ struct fd_dev_info { uint32_t vs_max_inputs_count; + bool supports_double_threadsize; + struct { uint32_t PC_POWER_CNTL; uint32_t TPL1_DBG_ECO_CNTL; diff --git a/src/freedreno/common/freedreno_devices.py b/src/freedreno/common/freedreno_devices.py index ae8db20ae8a..1a8a91640e1 100644 --- a/src/freedreno/common/freedreno_devices.py +++ b/src/freedreno/common/freedreno_devices.py @@ -249,6 +249,7 @@ a6xx_gen1 = dict( concurrent_resolve = False, indirect_draw_wfm_quirk = True, depth_bounds_require_depth_test_quirk = True, + supports_double_threadsize = True, ) # a605, a608, a610, 612 @@ -259,6 +260,7 @@ a6xx_gen1_low = {**a6xx_gen1, **dict( sysmem_per_ccu_cache_size = 8 * 1024, gmem_ccu_color_cache_fraction = CCUColorCacheFraction.HALF.value, vs_max_inputs_count = 16, + supports_double_threadsize = False, )} # a640, a680: @@ -272,6 +274,7 @@ a6xx_gen2 = dict( depth_bounds_require_depth_test_quirk = True, # TODO: check if true has_dp2acc = False, # TODO: check if true has_8bpp_ubwc = False, + supports_double_threadsize = True, ) # a650: @@ -294,6 +297,7 @@ a6xx_gen3 = dict( enable_lrz_fast_clear = True, lrz_track_quirk = True, has_per_view_viewport = True, + supports_double_threadsize = True, ) # a635, a660: @@ -320,6 +324,7 @@ a6xx_gen4 = dict( enable_lrz_fast_clear = True, has_lrz_dir_tracking = True, has_per_view_viewport = True, + supports_double_threadsize = True, ) add_gpus([ diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc index 0022f5ec6b0..eebac3234cc 100644 --- a/src/freedreno/vulkan/tu_device.cc +++ b/src/freedreno/vulkan/tu_device.cc @@ -853,7 +853,7 @@ tu_get_physical_device_properties_1_1(struct tu_physical_device *pdevice, p->deviceNodeMask = 0; p->deviceLUIDValid = false; - p->subgroupSize = 128; + p->subgroupSize = pdevice->info->a6xx.supports_double_threadsize ? 128 : 64; p->subgroupSupportedStages = VK_SHADER_STAGE_COMPUTE_BIT; p->subgroupSupportedOperations = VK_SUBGROUP_FEATURE_BASIC_BIT | VK_SUBGROUP_FEATURE_VOTE_BIT | @@ -974,7 +974,8 @@ tu_get_physical_device_properties_1_3(struct tu_physical_device *pdevice, { /* TODO move threadsize_base and max_waves to fd_dev_info and use them here */ p->minSubgroupSize = 64; /* threadsize_base */ - p->maxSubgroupSize = 128; /* threadsize_base * 2 */ + p->maxSubgroupSize = + pdevice->info->a6xx.supports_double_threadsize ? 128 : 64; p->maxComputeWorkgroupSubgroups = 16; /* max_waves */ p->requiredSubgroupSizeStages = VK_SHADER_STAGE_ALL; @@ -1095,7 +1096,7 @@ tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice, .maxFragmentCombinedOutputResources = MAX_RTS + max_descriptor_set_size * 2, .maxComputeSharedMemorySize = pdevice->info->cs_shared_mem_size, .maxComputeWorkGroupCount = { 65535, 65535, 65535 }, - .maxComputeWorkGroupInvocations = 2048, + .maxComputeWorkGroupInvocations = pdevice->info->a6xx.supports_double_threadsize ? 2048 : 1024, .maxComputeWorkGroupSize = { 1024, 1024, 1024 }, .subPixelPrecisionBits = 8, .subTexelPrecisionBits = 8, diff --git a/src/freedreno/vulkan/tu_pipeline.cc b/src/freedreno/vulkan/tu_pipeline.cc index cf065b684a9..1b24997e151 100644 --- a/src/freedreno/vulkan/tu_pipeline.cc +++ b/src/freedreno/vulkan/tu_pipeline.cc @@ -504,6 +504,8 @@ tu6_emit_xs(struct tu_cs *cs, )); break; case MESA_SHADER_COMPUTE: + thrsz = cs->device->physical_device->info->a6xx + .supports_double_threadsize ? thrsz : THREAD128; tu_cs_emit_regs(cs, A6XX_SP_CS_CTRL_REG0( .halfregfootprint = xs->info.max_half_reg + 1, .fullregfootprint = xs->info.max_reg + 1, @@ -702,7 +704,14 @@ tu6_emit_cs_config(struct tu_cs *cs, uint32_t work_group_id = ir3_find_sysval_regid(v, SYSTEM_VALUE_WORKGROUP_ID); + /* + * Devices that do not support double threadsize take the threadsize from + * A6XX_HLSQ_FS_CNTL_0_THREADSIZE instead of A6XX_HLSQ_CS_CNTL_1_THREADSIZE + * which is always set to THREAD128. + */ enum a6xx_threadsize thrsz = v->info.double_threadsize ? THREAD128 : THREAD64; + enum a6xx_threadsize thrsz_cs = cs->device->physical_device->info->a6xx + .supports_double_threadsize ? thrsz : THREAD128; tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_CNTL_0, 2); tu_cs_emit(cs, A6XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) | @@ -710,7 +719,11 @@ tu6_emit_cs_config(struct tu_cs *cs, A6XX_HLSQ_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) | A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id)); tu_cs_emit(cs, A6XX_HLSQ_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) | - A6XX_HLSQ_CS_CNTL_1_THREADSIZE(thrsz)); + A6XX_HLSQ_CS_CNTL_1_THREADSIZE(thrsz_cs)); + if (!cs->device->physical_device->info->a6xx.supports_double_threadsize) { + tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_FS_CNTL_0, 1); + tu_cs_emit(cs, A6XX_HLSQ_FS_CNTL_0_THREADSIZE(thrsz)); + } if (cs->device->physical_device->info->a6xx.has_lpac) { tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CNTL_0, 2); @@ -2294,8 +2307,10 @@ tu_shader_key_init(struct tu_shader_key *key, struct tu_device *dev) { enum ir3_wavesize_option api_wavesize, real_wavesize; - - if (stage_info) { + if (!dev->physical_device->info->a6xx.supports_double_threadsize) { + api_wavesize = IR3_SINGLE_ONLY; + real_wavesize = IR3_SINGLE_ONLY; + } else if (stage_info) { if (stage_info->flags & VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT) { api_wavesize = real_wavesize = IR3_SINGLE_OR_DOUBLE;