mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-02-15 04:30:29 +01:00
turnip: Add support for devices not supporting double thread size.
On these devices the actual thread size for compute shaders seems to be controlled by REG_A6XX_HLSQ_FS_CNTL_0 rather than the CS-related register.

Signed-off-by: Amber Amber <amber@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20991>
This commit is contained in:
parent
9d2fd39f84
commit
f78bce1b59
4 changed files with 29 additions and 6 deletions
|
|
@ -181,6 +181,8 @@ struct fd_dev_info {
|
|||
|
||||
uint32_t vs_max_inputs_count;
|
||||
|
||||
bool supports_double_threadsize;
|
||||
|
||||
struct {
|
||||
uint32_t PC_POWER_CNTL;
|
||||
uint32_t TPL1_DBG_ECO_CNTL;
|
||||
|
|
|
|||
|
|
@ -249,6 +249,7 @@ a6xx_gen1 = dict(
|
|||
concurrent_resolve = False,
|
||||
indirect_draw_wfm_quirk = True,
|
||||
depth_bounds_require_depth_test_quirk = True,
|
||||
supports_double_threadsize = True,
|
||||
)
|
||||
|
||||
# a605, a608, a610, a612
|
||||
|
|
@ -259,6 +260,7 @@ a6xx_gen1_low = {**a6xx_gen1, **dict(
|
|||
sysmem_per_ccu_cache_size = 8 * 1024,
|
||||
gmem_ccu_color_cache_fraction = CCUColorCacheFraction.HALF.value,
|
||||
vs_max_inputs_count = 16,
|
||||
supports_double_threadsize = False,
|
||||
)}
|
||||
|
||||
# a640, a680:
|
||||
|
|
@ -272,6 +274,7 @@ a6xx_gen2 = dict(
|
|||
depth_bounds_require_depth_test_quirk = True, # TODO: check if true
|
||||
has_dp2acc = False, # TODO: check if true
|
||||
has_8bpp_ubwc = False,
|
||||
supports_double_threadsize = True,
|
||||
)
|
||||
|
||||
# a650:
|
||||
|
|
@ -294,6 +297,7 @@ a6xx_gen3 = dict(
|
|||
enable_lrz_fast_clear = True,
|
||||
lrz_track_quirk = True,
|
||||
has_per_view_viewport = True,
|
||||
supports_double_threadsize = True,
|
||||
)
|
||||
|
||||
# a635, a660:
|
||||
|
|
@ -320,6 +324,7 @@ a6xx_gen4 = dict(
|
|||
enable_lrz_fast_clear = True,
|
||||
has_lrz_dir_tracking = True,
|
||||
has_per_view_viewport = True,
|
||||
supports_double_threadsize = True,
|
||||
)
|
||||
|
||||
add_gpus([
|
||||
|
|
|
|||
|
|
@ -853,7 +853,7 @@ tu_get_physical_device_properties_1_1(struct tu_physical_device *pdevice,
|
|||
p->deviceNodeMask = 0;
|
||||
p->deviceLUIDValid = false;
|
||||
|
||||
p->subgroupSize = 128;
|
||||
p->subgroupSize = pdevice->info->a6xx.supports_double_threadsize ? 128 : 64;
|
||||
p->subgroupSupportedStages = VK_SHADER_STAGE_COMPUTE_BIT;
|
||||
p->subgroupSupportedOperations = VK_SUBGROUP_FEATURE_BASIC_BIT |
|
||||
VK_SUBGROUP_FEATURE_VOTE_BIT |
|
||||
|
|
@ -974,7 +974,8 @@ tu_get_physical_device_properties_1_3(struct tu_physical_device *pdevice,
|
|||
{
|
||||
/* TODO move threadsize_base and max_waves to fd_dev_info and use them here */
|
||||
p->minSubgroupSize = 64; /* threadsize_base */
|
||||
p->maxSubgroupSize = 128; /* threadsize_base * 2 */
|
||||
p->maxSubgroupSize =
|
||||
pdevice->info->a6xx.supports_double_threadsize ? 128 : 64;
|
||||
p->maxComputeWorkgroupSubgroups = 16; /* max_waves */
|
||||
p->requiredSubgroupSizeStages = VK_SHADER_STAGE_ALL;
|
||||
|
||||
|
|
@ -1095,7 +1096,7 @@ tu_GetPhysicalDeviceProperties2(VkPhysicalDevice physicalDevice,
|
|||
.maxFragmentCombinedOutputResources = MAX_RTS + max_descriptor_set_size * 2,
|
||||
.maxComputeSharedMemorySize = pdevice->info->cs_shared_mem_size,
|
||||
.maxComputeWorkGroupCount = { 65535, 65535, 65535 },
|
||||
.maxComputeWorkGroupInvocations = 2048,
|
||||
.maxComputeWorkGroupInvocations = pdevice->info->a6xx.supports_double_threadsize ? 2048 : 1024,
|
||||
.maxComputeWorkGroupSize = { 1024, 1024, 1024 },
|
||||
.subPixelPrecisionBits = 8,
|
||||
.subTexelPrecisionBits = 8,
|
||||
|
|
|
|||
|
|
@ -504,6 +504,8 @@ tu6_emit_xs(struct tu_cs *cs,
|
|||
));
|
||||
break;
|
||||
case MESA_SHADER_COMPUTE:
|
||||
thrsz = cs->device->physical_device->info->a6xx
|
||||
.supports_double_threadsize ? thrsz : THREAD128;
|
||||
tu_cs_emit_regs(cs, A6XX_SP_CS_CTRL_REG0(
|
||||
.halfregfootprint = xs->info.max_half_reg + 1,
|
||||
.fullregfootprint = xs->info.max_reg + 1,
|
||||
|
|
@ -702,7 +704,14 @@ tu6_emit_cs_config(struct tu_cs *cs,
|
|||
uint32_t work_group_id =
|
||||
ir3_find_sysval_regid(v, SYSTEM_VALUE_WORKGROUP_ID);
|
||||
|
||||
/*
|
||||
* Devices that do not support double threadsize take the threadsize from
|
||||
* A6XX_HLSQ_FS_CNTL_0_THREADSIZE instead of A6XX_HLSQ_CS_CNTL_1_THREADSIZE
|
||||
* which is always set to THREAD128.
|
||||
*/
|
||||
enum a6xx_threadsize thrsz = v->info.double_threadsize ? THREAD128 : THREAD64;
|
||||
enum a6xx_threadsize thrsz_cs = cs->device->physical_device->info->a6xx
|
||||
.supports_double_threadsize ? thrsz : THREAD128;
|
||||
tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_CNTL_0, 2);
|
||||
tu_cs_emit(cs,
|
||||
A6XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) |
|
||||
|
|
@ -710,7 +719,11 @@ tu6_emit_cs_config(struct tu_cs *cs,
|
|||
A6XX_HLSQ_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
|
||||
A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
|
||||
tu_cs_emit(cs, A6XX_HLSQ_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
|
||||
A6XX_HLSQ_CS_CNTL_1_THREADSIZE(thrsz));
|
||||
A6XX_HLSQ_CS_CNTL_1_THREADSIZE(thrsz_cs));
|
||||
if (!cs->device->physical_device->info->a6xx.supports_double_threadsize) {
|
||||
tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_FS_CNTL_0, 1);
|
||||
tu_cs_emit(cs, A6XX_HLSQ_FS_CNTL_0_THREADSIZE(thrsz));
|
||||
}
|
||||
|
||||
if (cs->device->physical_device->info->a6xx.has_lpac) {
|
||||
tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CNTL_0, 2);
|
||||
|
|
@ -2294,8 +2307,10 @@ tu_shader_key_init(struct tu_shader_key *key,
|
|||
struct tu_device *dev)
|
||||
{
|
||||
enum ir3_wavesize_option api_wavesize, real_wavesize;
|
||||
|
||||
if (stage_info) {
|
||||
if (!dev->physical_device->info->a6xx.supports_double_threadsize) {
|
||||
api_wavesize = IR3_SINGLE_ONLY;
|
||||
real_wavesize = IR3_SINGLE_ONLY;
|
||||
} else if (stage_info) {
|
||||
if (stage_info->flags &
|
||||
VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT) {
|
||||
api_wavesize = real_wavesize = IR3_SINGLE_OR_DOUBLE;
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue