diff --git a/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp b/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp index 92fee82b7bf..53ca8824385 100644 --- a/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp +++ b/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp @@ -4707,7 +4707,8 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) if (src.regClass() == s1) { bld.copy(Definition(dst), src); - } else if (dst.regClass() == v1 && src.regClass() == v1) { + } else if (dst.type() == RegType::vgpr && src.type() == RegType::vgpr && dst.size() == 1 && + src.size() == 1) { bld.vop3(aco_opcode::v_permlane16_b32, Definition(dst), src, bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa)), bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa))); diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index fc98e474202..c97dd34831a 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -623,7 +623,7 @@ intrinsic("write_invocation_amd", src_comp=[0, 0, 1], dest_comp=0, bit_sizes=src # src = [ mask, addition ] intrinsic("mbcnt_amd", src_comp=[1, 1], dest_comp=1, bit_sizes=[32], flags=[CAN_REORDER, CAN_ELIMINATE]) # Compiled to v_permlane16_b32. src = [ value, lanesel_lo, lanesel_hi ] -intrinsic("lane_permute_16_amd", src_comp=[1, 1, 1], dest_comp=1, bit_sizes=[32], flags=[CAN_ELIMINATE]) +intrinsic("lane_permute_16_amd", src_comp=[1, 1, 1], dest_comp=1, bit_sizes=src0, flags=[CAN_ELIMINATE]) # subgroup shuffle up/down with cluster size 16. # base in [-15, -1]: DPP_ROW_SR # base in [ 1, 15]: DPP_ROW_SL, otherwise invalid.