mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-08 04:48:08 +02:00
turnip: Add support for devices not supporting double thread size.
On these devices, the actual thread size for compute shaders seems to be controlled by REG_A6XX_HLSQ_FS_CNTL_0 rather than by the CS-related register.

Signed-off-by: Amber Amber <amber@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20991>
This commit is contained in:
parent
9d2fd39f84
commit
f78bce1b59
4 changed files with 29 additions and 6 deletions
|
|
@ -181,6 +181,8 @@ struct fd_dev_info {
|
||||||
|
|
||||||
uint32_t vs_max_inputs_count;
|
uint32_t vs_max_inputs_count;
|
||||||
|
|
||||||
|
bool supports_double_threadsize;
|
||||||
|
|
||||||
struct {
|
struct {
|
||||||
uint32_t PC_POWER_CNTL;
|
uint32_t PC_POWER_CNTL;
|
||||||
uint32_t TPL1_DBG_ECO_CNTL;
|
uint32_t TPL1_DBG_ECO_CNTL;
|
||||||
|
|
|
||||||
|
|
@ -249,6 +249,7 @@ a6xx_gen1 = dict(
|
||||||
concurrent_resolve = False,
|
concurrent_resolve = False,
|
||||||
indirect_draw_wfm_quirk = True,
|
indirect_draw_wfm_quirk = True,
|
||||||
depth_bounds_require_depth_test_quirk = True,
|
depth_bounds_require_depth_test_quirk = True,
|
||||||
|
supports_double_threadsize = True,
|
||||||
)
|
)
|
||||||
|
|
||||||
# a605, a608, a610, 612
|
# a605, a608, a610, 612
|
||||||
|
|
@ -259,6 +260,7 @@ a6xx_gen1_low = {**a6xx_gen1, **dict(
|
||||||
sysmem_per_ccu_cache_size = 8 * 1024,
|
sysmem_per_ccu_cache_size = 8 * 1024,
|
||||||
gmem_ccu_color_cache_fraction = CCUColorCacheFraction.HALF.value,
|
gmem_ccu_color_cache_fraction = CCUColorCacheFraction.HALF.value,
|
||||||
vs_max_inputs_count = 16,
|
vs_max_inputs_count = 16,
|
||||||
|
supports_double_threadsize = False,
|
||||||
)}
|
)}
|
||||||
|
|
||||||
# a640, a680:
|
# a640, a680:
|
||||||
|
|
@ -272,6 +274,7 @@ a6xx_gen2 = dict(
|
||||||
depth_bounds_require_depth_test_quirk = True, # TODO: check if true
|
depth_bounds_require_depth_test_quirk = True, # TODO: check if true
|
||||||
has_dp2acc = False, # TODO: check if true
|
has_dp2acc = False, # TODO: check if true
|
||||||
has_8bpp_ubwc = False,
|
has_8bpp_ubwc = False,
|
||||||
|
supports_double_threadsize = True,
|
||||||
)
|
)
|
||||||
|
|
||||||
# a650:
|
# a650:
|
||||||
|
|
@ -294,6 +297,7 @@ a6xx_gen3 = dict(
|
||||||
enable_lrz_fast_clear = True,
|
enable_lrz_fast_clear = True,
|
||||||
lrz_track_quirk = True,
|
lrz_track_quirk = True,
|
||||||
has_per_view_viewport = True,
|
has_per_view_viewport = True,
|
||||||
|
supports_double_threadsize = True,
|
||||||
)
|
)
|
||||||
|
|
||||||
# a635, a660:
|
# a635, a660:
|
||||||
|
|
@ -320,6 +324,7 @@ a6xx_gen4 = dict(
|
||||||
enable_lrz_fast_clear = True,
|
enable_lrz_fast_clear = True,
|
||||||
has_lrz_dir_tracking = True,
|
has_lrz_dir_tracking = True,
|
||||||
has_per_view_viewport = True,
|
has_per_view_viewport = True,
|
||||||
|
supports_double_threadsize = True,
|
||||||
)
|
)
|
||||||
|
|
||||||
add_gpus([
|
add_gpus([
|
||||||
|
|
|
||||||
|
|
@ -853,7 +853,7 @@ tu_get_physical_device_properties_1_1(struct tu_physical_device *pdevice,
|
||||||
p->deviceNodeMask = 0;
|
p->deviceNodeMask = 0;
|
||||||
p->deviceLUIDValid = false;
|
p->deviceLUIDValid = false;
|
||||||
|
|
||||||
p->subgroupSize = 128;
|
p->subgroupSize = pdevice->info->a6xx.supports_double_threadsize ? 128 : 64;
|
||||||
p->subgroupSupportedStages = VK_SHADER_STAGE_COMPUTE_BIT;
|
p->subgroupSupportedStages = VK_SHADER_STAGE_COMPUTE_BIT;
|
||||||
p->subgroupSupportedOperations = VK_SUBGROUP_FEATURE_BASIC_BIT |
|
p->subgroupSupportedOperations = VK_SUBGROUP_FEATURE_BASIC_BIT |
|
||||||
VK_SUBGROUP_FEATURE_VOTE_BIT |
|
VK_SUBGROUP_FEATURE_VOTE_BIT |
|
||||||
|
|
@ -974,7 +974,8 @@ tu_get_physical_device_properties_1_3(struct tu_physical_device *pdevice,
|
||||||
{
|
{
|
||||||
/* TODO move threadsize_base and max_waves to fd_dev_info and use them here */
|
/* TODO move threadsize_base and max_waves to fd_dev_info and use them here */
|
||||||
p->minSubgroupSize = 64; /* threadsize_base */
|
p->minSubgroupSize = 64; /* threadsize_base */
|
||||||
p->maxSubgroupSize = 128; /* threadsize_base * 2 */
|
p->maxSubgroupSize =
|
||||||
|
pdevice->info->a6xx.supports_double_threadsize ? 128 : 64;
|
||||||
p->maxComputeWorkgroupSubgroups = 16; /* max_waves */
|
p->maxComputeWorkgroupSubgroups = 16; /* max_waves */
|
||||||
p->requiredSubgroupSizeStages = VK_SHADER_STAGE_ALL;
|
p->requiredSubgroupSizeStages = VK_SHADER_STAGE_ALL;
|
||||||
|
|
||||||
|
|
@ -1095,7 +1096,7 @@ tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
|
||||||
.maxFragmentCombinedOutputResources = MAX_RTS + max_descriptor_set_size * 2,
|
.maxFragmentCombinedOutputResources = MAX_RTS + max_descriptor_set_size * 2,
|
||||||
.maxComputeSharedMemorySize = pdevice->info->cs_shared_mem_size,
|
.maxComputeSharedMemorySize = pdevice->info->cs_shared_mem_size,
|
||||||
.maxComputeWorkGroupCount = { 65535, 65535, 65535 },
|
.maxComputeWorkGroupCount = { 65535, 65535, 65535 },
|
||||||
.maxComputeWorkGroupInvocations = 2048,
|
.maxComputeWorkGroupInvocations = pdevice->info->a6xx.supports_double_threadsize ? 2048 : 1024,
|
||||||
.maxComputeWorkGroupSize = { 1024, 1024, 1024 },
|
.maxComputeWorkGroupSize = { 1024, 1024, 1024 },
|
||||||
.subPixelPrecisionBits = 8,
|
.subPixelPrecisionBits = 8,
|
||||||
.subTexelPrecisionBits = 8,
|
.subTexelPrecisionBits = 8,
|
||||||
|
|
|
||||||
|
|
@ -504,6 +504,8 @@ tu6_emit_xs(struct tu_cs *cs,
|
||||||
));
|
));
|
||||||
break;
|
break;
|
||||||
case MESA_SHADER_COMPUTE:
|
case MESA_SHADER_COMPUTE:
|
||||||
|
thrsz = cs->device->physical_device->info->a6xx
|
||||||
|
.supports_double_threadsize ? thrsz : THREAD128;
|
||||||
tu_cs_emit_regs(cs, A6XX_SP_CS_CTRL_REG0(
|
tu_cs_emit_regs(cs, A6XX_SP_CS_CTRL_REG0(
|
||||||
.halfregfootprint = xs->info.max_half_reg + 1,
|
.halfregfootprint = xs->info.max_half_reg + 1,
|
||||||
.fullregfootprint = xs->info.max_reg + 1,
|
.fullregfootprint = xs->info.max_reg + 1,
|
||||||
|
|
@ -702,7 +704,14 @@ tu6_emit_cs_config(struct tu_cs *cs,
|
||||||
uint32_t work_group_id =
|
uint32_t work_group_id =
|
||||||
ir3_find_sysval_regid(v, SYSTEM_VALUE_WORKGROUP_ID);
|
ir3_find_sysval_regid(v, SYSTEM_VALUE_WORKGROUP_ID);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Devices that do not support double threadsize take the threadsize from
|
||||||
|
* A6XX_HLSQ_FS_CNTL_0_THREADSIZE instead of A6XX_HLSQ_CS_CNTL_1_THREADSIZE
|
||||||
|
* which is always set to THREAD128.
|
||||||
|
*/
|
||||||
enum a6xx_threadsize thrsz = v->info.double_threadsize ? THREAD128 : THREAD64;
|
enum a6xx_threadsize thrsz = v->info.double_threadsize ? THREAD128 : THREAD64;
|
||||||
|
enum a6xx_threadsize thrsz_cs = cs->device->physical_device->info->a6xx
|
||||||
|
.supports_double_threadsize ? thrsz : THREAD128;
|
||||||
tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_CNTL_0, 2);
|
tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_CNTL_0, 2);
|
||||||
tu_cs_emit(cs,
|
tu_cs_emit(cs,
|
||||||
A6XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) |
|
A6XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) |
|
||||||
|
|
@ -710,7 +719,11 @@ tu6_emit_cs_config(struct tu_cs *cs,
|
||||||
A6XX_HLSQ_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
|
A6XX_HLSQ_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
|
||||||
A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
|
A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
|
||||||
tu_cs_emit(cs, A6XX_HLSQ_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
|
tu_cs_emit(cs, A6XX_HLSQ_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
|
||||||
A6XX_HLSQ_CS_CNTL_1_THREADSIZE(thrsz));
|
A6XX_HLSQ_CS_CNTL_1_THREADSIZE(thrsz_cs));
|
||||||
|
if (!cs->device->physical_device->info->a6xx.supports_double_threadsize) {
|
||||||
|
tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_FS_CNTL_0, 1);
|
||||||
|
tu_cs_emit(cs, A6XX_HLSQ_FS_CNTL_0_THREADSIZE(thrsz));
|
||||||
|
}
|
||||||
|
|
||||||
if (cs->device->physical_device->info->a6xx.has_lpac) {
|
if (cs->device->physical_device->info->a6xx.has_lpac) {
|
||||||
tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CNTL_0, 2);
|
tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CNTL_0, 2);
|
||||||
|
|
@ -2294,8 +2307,10 @@ tu_shader_key_init(struct tu_shader_key *key,
|
||||||
struct tu_device *dev)
|
struct tu_device *dev)
|
||||||
{
|
{
|
||||||
enum ir3_wavesize_option api_wavesize, real_wavesize;
|
enum ir3_wavesize_option api_wavesize, real_wavesize;
|
||||||
|
if (!dev->physical_device->info->a6xx.supports_double_threadsize) {
|
||||||
if (stage_info) {
|
api_wavesize = IR3_SINGLE_ONLY;
|
||||||
|
real_wavesize = IR3_SINGLE_ONLY;
|
||||||
|
} else if (stage_info) {
|
||||||
if (stage_info->flags &
|
if (stage_info->flags &
|
||||||
VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT) {
|
VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT) {
|
||||||
api_wavesize = real_wavesize = IR3_SINGLE_OR_DOUBLE;
|
api_wavesize = real_wavesize = IR3_SINGLE_OR_DOUBLE;
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue