ir3, turnip: Support VK_*_compute_shader_derivatives on a7xx

Quad derivative groups are supported since a7xx using the tiling mode
bit. Linear derivative groups may also work on a6xx but I haven't tested
it yet.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31130>
This commit is contained in:
Connor Abbott 2024-09-11 09:51:36 -04:00 committed by Marge Bot
parent 624d83bfd1
commit 022fb8e4c7
3 changed files with 58 additions and 4 deletions

View file

@@ -510,7 +510,7 @@ Khronos extensions that are not part of any Vulkan version:
VK_KHR_acceleration_structure DONE (anv/gfx12.5+, lvp, radv/gfx10.3+)
VK_KHR_android_surface not started
VK_KHR_calibrated_timestamps DONE (anv, nvk, radv, tu/a750+)
VK_KHR_compute_shader_derivatives DONE (anv, nvk, radv)
VK_KHR_compute_shader_derivatives DONE (anv, nvk, radv, tu/a7xx+)
VK_KHR_cooperative_matrix DONE (anv, radv/gfx11+)
VK_KHR_deferred_host_operations DONE (anv, hasvk, lvp, radv)
VK_KHR_display DONE (anv, nvk, pvr, radv, tu, v3dv)

View file

@@ -602,16 +602,58 @@ lower_subgroup_id(nir_builder *b, nir_instr *instr, void *_shader)
* LocalInvocationIndex here. This means that whenever we do this lowering we
* have to force linear dispatch to make sure that the relation between
* SubgroupId/SubgroupLocalInvocationId and LocalInvocationIndex is what we
* expect.
* expect, unless the shader forces us to do the quad layout in which case we
* have to use the tiled layout.
*/
shader->cs.force_linear_dispatch = true;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
if (intr->intrinsic == nir_intrinsic_load_subgroup_id &&
shader->nir->info.derivative_group == DERIVATIVE_GROUP_QUADS) {
/* We have to manually figure out which subgroup we're in using the
* tiling. The tiling is 4x4, unless one of the dimensions is not a
* multiple of 4 in which case it drops to 2.
*/
nir_def *local_size = nir_load_workgroup_size(b);
nir_def *local_size_x = nir_channel(b, local_size, 0);
nir_def *local_size_y = nir_channel(b, local_size, 1);
/* Calculate the shift from invocation to tile index for x and y */
nir_def *x_shift = nir_bcsel(b,
nir_ieq_imm(b,
nir_iand_imm(b, local_size_x, 3),
0),
nir_imm_int(b, 2), nir_imm_int(b, 1));
nir_def *y_shift = nir_bcsel(b,
nir_ieq_imm(b,
nir_iand_imm(b, local_size_y, 3),
0),
nir_imm_int(b, 2), nir_imm_int(b, 1));
nir_def *id = nir_load_local_invocation_id(b);
nir_def *id_x = nir_channel(b, id, 0);
nir_def *id_y = nir_channel(b, id, 1);
/* Calculate which tile we're in */
nir_def *tile_id =
nir_iadd(b, nir_imul24(b, nir_ishr(b, id_y, y_shift),
nir_ishr(b, local_size_x, x_shift)),
nir_ishr(b, id_x, x_shift));
/* Finally calculate the subgroup id */
return nir_ishr(b, tile_id, nir_isub(b,
nir_load_subgroup_id_shift_ir3(b),
nir_iadd(b, x_shift, y_shift)));
}
/* Just use getfiberid if we have to use tiling */
if (intr->intrinsic == nir_intrinsic_load_subgroup_invocation &&
shader->nir->info.derivative_group == DERIVATIVE_GROUP_QUADS) {
return NULL;
}
if (intr->intrinsic == nir_intrinsic_load_subgroup_invocation) {
shader->cs.force_linear_dispatch = true;
return nir_iand(
b, nir_load_local_invocation_index(b),
nir_iadd_imm(b, nir_load_subgroup_size(b), -1));
} else if (intr->intrinsic == nir_intrinsic_load_subgroup_id) {
shader->cs.force_linear_dispatch = true;
return nir_ishr(b, nir_load_local_invocation_index(b),
nir_load_subgroup_id_shift_ir3(b));
} else {
@@ -771,6 +813,9 @@ ir3_nir_post_finalize(struct ir3_shader *shader)
bool progress = false;
NIR_PASS(progress, s, ir3_nir_lower_subgroup_id_cs, shader);
if (s->info.derivative_group == DERIVATIVE_GROUP_LINEAR)
shader->cs.force_linear_dispatch = true;
/* ir3_nir_lower_subgroup_id_cs creates extra compute intrinsics which
* we need to lower again.
*/

View file

@@ -148,6 +148,7 @@ get_device_extensions(const struct tu_physical_device *device,
.KHR_bind_memory2 = true,
.KHR_buffer_device_address = true,
.KHR_calibrated_timestamps = device->info->a7xx.has_persistent_counter,
.KHR_compute_shader_derivatives = device->info->chip >= 7,
.KHR_copy_commands2 = true,
.KHR_create_renderpass2 = true,
.KHR_dedicated_allocation = true,
@@ -308,6 +309,7 @@ get_device_extensions(const struct tu_physical_device *device,
.GOOGLE_hlsl_functionality1 = true,
.GOOGLE_user_type = true,
.IMG_filter_cubic = device->info->a6xx.has_tex_filter_cubic,
.NV_compute_shader_derivatives = device->info->chip >= 7,
.VALVE_mutable_descriptor_type = true,
} };
@@ -455,6 +457,10 @@ tu_get_features(struct tu_physical_device *pdevice,
features->shaderIntegerDotProduct = true;
features->maintenance4 = true;
/* VK_KHR_compute_shader_derivatives */
features->computeDerivativeGroupQuads = pdevice->info->chip >= 7;
features->computeDerivativeGroupLinear = pdevice->info->chip >= 7;
/* VK_KHR_index_type_uint8 */
features->indexTypeUint8 = true;
@@ -1023,6 +1029,9 @@ tu_get_properties(struct tu_physical_device *pdevice,
tu_get_physical_device_properties_1_2(pdevice, props);
tu_get_physical_device_properties_1_3(pdevice, props);
/* VK_KHR_compute_shader_derivatives */
props->meshAndTaskShaderDerivatives = false;
/* VK_KHR_push_descriptor */
props->maxPushDescriptors = MAX_PUSH_DESCRIPTORS;