diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp index 7141bbf2c93..09536cad5b6 100644 --- a/src/amd/compiler/aco_instruction_selection_setup.cpp +++ b/src/amd/compiler/aco_instruction_selection_setup.cpp @@ -89,6 +89,31 @@ is_block_reachable(nir_function_impl *impl, nir_block *known_reachable, nir_bloc return false; } +bool +only_used_by_readlane_or_phi(nir_dest *dest) +{ + nir_src *src = list_first_entry(&dest->ssa.uses, nir_src, use_link); + + switch (src->parent_instr->type) { + case nir_instr_type_alu: { + nir_alu_instr *alu = nir_instr_as_alu(src->parent_instr); + if (alu->op == nir_op_unpack_64_2x32_split_x || alu->op == nir_op_unpack_64_2x32_split_y) + return only_used_by_readlane_or_phi(&alu->dest.dest); + return false; + } + case nir_instr_type_intrinsic: { + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(src->parent_instr); + return intrin->intrinsic == nir_intrinsic_read_invocation || + intrin->intrinsic == nir_intrinsic_read_first_invocation || + intrin->intrinsic == nir_intrinsic_lane_permute_16_amd; + } + case nir_instr_type_phi: + return only_used_by_readlane_or_phi(&nir_instr_as_phi(src->parent_instr)->dest); + default: + return false; + } +} + /* If one side of a divergent IF ends in a branch and the other doesn't, we * might have to emit the contents of the side without the branch at the merge * block instead. This is so that we can use any SGPR live-out of the side @@ -830,6 +855,16 @@ void init_context(isel_context *ctx, nir_shader *shader) case nir_intrinsic_bvh64_intersect_ray_amd: type = RegType::vgpr; break; + case nir_intrinsic_load_shared: + /* When the result of these loads is only used by cross-lane instructions, + * it is beneficial to use a VGPR destination. This is because this allows + * to put the s_waitcnt further down, which decreases latency. + */ + if (only_used_by_readlane_or_phi(&intrinsic->dest)) { + type = RegType::vgpr; + break; + } + FALLTHROUGH; case nir_intrinsic_shuffle: case nir_intrinsic_quad_broadcast: case nir_intrinsic_quad_swap_horizontal: @@ -844,7 +879,6 @@ void init_context(isel_context *ctx, nir_shader *shader) case nir_intrinsic_load_ssbo: case nir_intrinsic_load_global: case nir_intrinsic_vulkan_resource_index: - case nir_intrinsic_load_shared: case nir_intrinsic_get_ssbo_size: type = nir_dest_is_divergent(intrinsic->dest) ? RegType::vgpr : RegType::sgpr; break;