diff --git a/docs/features.txt b/docs/features.txt
index 7e19b8a906b..f074230ae21 100644
--- a/docs/features.txt
+++ b/docs/features.txt
@@ -510,7 +510,7 @@ Khronos extensions that are not part of any Vulkan version:
   VK_KHR_acceleration_structure                         DONE (anv/gfx12.5+, lvp, radv/gfx10.3+)
   VK_KHR_android_surface                                not started
   VK_KHR_calibrated_timestamps                          DONE (anv, nvk, radv, tu/a750+)
-  VK_KHR_compute_shader_derivatives                     DONE (anv, nvk, radv)
+  VK_KHR_compute_shader_derivatives                     DONE (anv, nvk, radv, tu/a7xx+)
   VK_KHR_cooperative_matrix                             DONE (anv, radv/gfx11+)
   VK_KHR_deferred_host_operations                       DONE (anv, hasvk, lvp, radv)
   VK_KHR_display                                        DONE (anv, nvk, pvr, radv, tu, v3dv)
diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c
index 615bfbff213..899b6e9a7ed 100644
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -602,16 +602,58 @@ lower_subgroup_id(nir_builder *b, nir_instr *instr, void *_shader)
     * LocalInvocationIndex here. This means that whenever we do this lowering we
     * have to force linear dispatch to make sure that the relation between
     * SubgroupId/SubgroupLocalInvocationId and LocalInvocationIndex is what we
-    * expect.
+    * expect, unless the shader forces us to do the quad layout in which case we
+    * have to use the tiled layout.
     */
-   shader->cs.force_linear_dispatch = true;
-
    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+   if (intr->intrinsic == nir_intrinsic_load_subgroup_id &&
+       shader->nir->info.derivative_group == DERIVATIVE_GROUP_QUADS) {
+      /* We have to manually figure out which subgroup we're in using the
+       * tiling. The tiling is 4x4, unless one of the dimensions is not a
+       * multiple of 4 in which case it drops to 2.
+       */
+      nir_def *local_size = nir_load_workgroup_size(b);
+      nir_def *local_size_x = nir_channel(b, local_size, 0);
+      nir_def *local_size_y = nir_channel(b, local_size, 1);
+      /* Calculate the shift from invocation to tile index for x and y */
+      nir_def *x_shift = nir_bcsel(b,
+                                   nir_ieq_imm(b,
+                                               nir_iand_imm(b, local_size_x, 3),
+                                               0),
+                                   nir_imm_int(b, 2), nir_imm_int(b, 1));
+      nir_def *y_shift = nir_bcsel(b,
+                                   nir_ieq_imm(b,
+                                               nir_iand_imm(b, local_size_y, 3),
+                                               0),
+                                   nir_imm_int(b, 2), nir_imm_int(b, 1));
+      nir_def *id = nir_load_local_invocation_id(b);
+      nir_def *id_x = nir_channel(b, id, 0);
+      nir_def *id_y = nir_channel(b, id, 1);
+      /* Calculate which tile we're in */
+      nir_def *tile_id =
+         nir_iadd(b, nir_imul24(b, nir_ishr(b, id_y, y_shift),
+                                nir_ishr(b, local_size_x, x_shift)),
+                  nir_ishr(b, id_x, x_shift));
+      /* Finally calculate the subgroup id */
+      return nir_ishr(b, tile_id, nir_isub(b,
+                                           nir_load_subgroup_id_shift_ir3(b),
+                                           nir_iadd(b, x_shift, y_shift)));
+   }
+
+   /* Just use getfiberid if we have to use tiling */
+   if (intr->intrinsic == nir_intrinsic_load_subgroup_invocation &&
+       shader->nir->info.derivative_group == DERIVATIVE_GROUP_QUADS) {
+      return NULL;
+   }
+
    if (intr->intrinsic == nir_intrinsic_load_subgroup_invocation) {
+      shader->cs.force_linear_dispatch = true;
       return nir_iand(
          b, nir_load_local_invocation_index(b),
          nir_iadd_imm(b, nir_load_subgroup_size(b), -1));
    } else if (intr->intrinsic == nir_intrinsic_load_subgroup_id) {
+      shader->cs.force_linear_dispatch = true;
       return nir_ishr(b, nir_load_local_invocation_index(b),
                       nir_load_subgroup_id_shift_ir3(b));
    } else {
@@ -771,6 +813,9 @@ ir3_nir_post_finalize(struct ir3_shader *shader)
       bool progress = false;
       NIR_PASS(progress, s, ir3_nir_lower_subgroup_id_cs, shader);
 
+      if (s->info.derivative_group == DERIVATIVE_GROUP_LINEAR)
+         shader->cs.force_linear_dispatch = true;
+
       /* ir3_nir_lower_subgroup_id_cs creates extra compute intrinsics which
        * we need to lower again.
        */
diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc
index db88bf97f2a..5545a36ad38 100644
--- a/src/freedreno/vulkan/tu_device.cc
+++ b/src/freedreno/vulkan/tu_device.cc
@@ -148,6 +148,7 @@ get_device_extensions(const struct tu_physical_device *device,
       .KHR_bind_memory2 = true,
       .KHR_buffer_device_address = true,
       .KHR_calibrated_timestamps = device->info->a7xx.has_persistent_counter,
+      .KHR_compute_shader_derivatives = device->info->chip >= 7,
       .KHR_copy_commands2 = true,
       .KHR_create_renderpass2 = true,
       .KHR_dedicated_allocation = true,
@@ -308,6 +309,7 @@ get_device_extensions(const struct tu_physical_device *device,
       .GOOGLE_hlsl_functionality1 = true,
       .GOOGLE_user_type = true,
       .IMG_filter_cubic = device->info->a6xx.has_tex_filter_cubic,
+      .NV_compute_shader_derivatives = device->info->chip >= 7,
       .VALVE_mutable_descriptor_type = true,
    }
 };
@@ -455,6 +457,10 @@ tu_get_features(struct tu_physical_device *pdevice,
    features->shaderIntegerDotProduct = true;
    features->maintenance4 = true;
 
+   /* VK_KHR_compute_shader_derivatives */
+   features->computeDerivativeGroupQuads = pdevice->info->chip >= 7;
+   features->computeDerivativeGroupLinear = pdevice->info->chip >= 7;
+
    /* VK_KHR_index_type_uint8 */
    features->indexTypeUint8 = true;
 
@@ -1023,6 +1029,9 @@ tu_get_properties(struct tu_physical_device *pdevice,
    tu_get_physical_device_properties_1_2(pdevice, props);
    tu_get_physical_device_properties_1_3(pdevice, props);
 
+   /* VK_KHR_compute_shader_derivatives */
+   props->meshAndTaskShaderDerivatives = false;
+
    /* VK_KHR_push_descriptor */
    props->maxPushDescriptors = MAX_PUSH_DESCRIPTORS;
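Note: the quad-tiling subgroup-id computation added to lower_subgroup_id() above is easy to sanity-check on the host. The C sketch below mirrors the NIR math, under the assumption that nir_load_subgroup_id_shift_ir3 resolves to log2 of the subgroup size; the helper name quad_subgroup_id and the standalone harness are made up for illustration and are not part of the patch.

#include <stdint.h>
#include <stdio.h>

/* Mirrors the lowered NIR: tiles are 4x4, unless a workgroup dimension is
 * not a multiple of 4, in which case that side of the tile drops to 2. */
static uint32_t
quad_subgroup_id(uint32_t local_size_x, uint32_t local_size_y,
                 uint32_t id_x, uint32_t id_y, uint32_t subgroup_id_shift)
{
   uint32_t x_shift = (local_size_x & 3) == 0 ? 2 : 1;
   uint32_t y_shift = (local_size_y & 3) == 0 ? 2 : 1;

   /* Row-major index of the tile containing this invocation. */
   uint32_t tile_id = (id_y >> y_shift) * (local_size_x >> x_shift) +
                      (id_x >> x_shift);

   /* A tile holds 1 << (x_shift + y_shift) invocations, so one subgroup
    * covers subgroup_size >> (x_shift + y_shift) consecutive tiles. */
   return tile_id >> (subgroup_id_shift - (x_shift + y_shift));
}

int
main(void)
{
   /* 16x8 workgroup with 64-wide subgroups (shift of 6): the eight 4x4
    * tiles split so rows 0-3 map to subgroup 0, rows 4-7 to subgroup 1. */
   for (uint32_t y = 0; y < 8; y++) {
      for (uint32_t x = 0; x < 16; x++)
         printf("%u", quad_subgroup_id(16, 8, x, y, 6));
      printf("\n");
   }
   return 0;
}

This also makes it clear why the pass cannot force linear dispatch in the quads case: the subgroup id is derived from the 2D tile position, which only holds under the tiled layout.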
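For completeness, an application discovers the feature bits set in tu_get_features() through standard feature chaining. A minimal sketch, assuming recent Vulkan headers that define the KHR extension structs and a valid VkPhysicalDevice obtained elsewhere (a real application would first confirm VK_KHR_compute_shader_derivatives is in the device's extension list):

#include <vulkan/vulkan.h>
#include <stdio.h>

static void
check_compute_derivatives(VkPhysicalDevice pdev)
{
   /* Chain the extension's feature struct into the core query and read
    * back the two derivative-group bits the driver fills in. */
   VkPhysicalDeviceComputeShaderDerivativesFeaturesKHR deriv = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COMPUTE_SHADER_DERIVATIVES_FEATURES_KHR,
   };
   VkPhysicalDeviceFeatures2 features = {
      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
      .pNext = &deriv,
   };
   vkGetPhysicalDeviceFeatures2(pdev, &features);

   printf("quads: %u, linear: %u\n",
          deriv.computeDerivativeGroupQuads,
          deriv.computeDerivativeGroupLinear);
}

On a7xx this should report both group modes as supported, matching the chip >= 7 checks in the patch, while meshAndTaskShaderDerivatives stays false.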