diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 7386d15e126..16babf60453 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -991,7 +991,8 @@ emit_vop3p_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, T } void -emit_idot_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, bool clamp) +emit_idot_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, bool clamp, + unsigned neg_lo = 0) { Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)}; bool has_sgpr = false; @@ -1005,7 +1006,11 @@ emit_idot_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Te Builder bld(ctx->program, ctx->block); bld.is_precise = instr->exact; - bld.vop3p(op, Definition(dst), src[0], src[1], src[2], 0x0, 0x7).instr->vop3p().clamp = clamp; + VOP3P_instruction& vop3p = + bld.vop3p(op, Definition(dst), src[0], src[1], src[2], 0x0, 0x7).instr->vop3p(); + vop3p.clamp = clamp; + u_foreach_bit (i, neg_lo) + vop3p.neg_lo[i] = true; } void @@ -2459,11 +2464,25 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr) break; } case nir_op_sdot_4x8_iadd: { - emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, false); + if (ctx->options->gfx_level >= GFX11) + emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, false, 0x3); + else + emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, false); break; } case nir_op_sdot_4x8_iadd_sat: { - emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, true); + if (ctx->options->gfx_level >= GFX11) + emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, true, 0x3); + else + emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, true); + break; + } + case nir_op_sudot_4x8_iadd: { + emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, false, 0x1); + break; + } + case nir_op_sudot_4x8_iadd_sat: { + emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, true, 0x1); break; } case nir_op_udot_4x8_uadd: { diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp index d535c47fc79..e655726755a 100644 --- a/src/amd/compiler/aco_instruction_selection_setup.cpp +++ b/src/amd/compiler/aco_instruction_selection_setup.cpp @@ -523,8 +523,10 @@ init_context(isel_context* ctx, nir_shader* shader) case nir_op_sad_u8x4: case nir_op_udot_4x8_uadd: case nir_op_sdot_4x8_iadd: + case nir_op_sudot_4x8_iadd: case nir_op_udot_4x8_uadd_sat: case nir_op_sdot_4x8_iadd_sat: + case nir_op_sudot_4x8_iadd_sat: case nir_op_udot_2x16_uadd: case nir_op_sdot_2x16_iadd: case nir_op_udot_2x16_uadd_sat: diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index bc1fbf2b37e..3c78e7eeefb 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -931,7 +931,8 @@ propagate_constants_vop3p(opt_ctx& ctx, aco_ptr& instr, ssa_info& i /* The accumulation operand of dot product instructions ignores opsel. */ bool cannot_use_opsel = (instr->opcode == aco_opcode::v_dot4_i32_i8 || instr->opcode == aco_opcode::v_dot2_i32_i16 || - instr->opcode == aco_opcode::v_dot4_u32_u8 || instr->opcode == aco_opcode::v_dot2_u32_u16) && + instr->opcode == aco_opcode::v_dot4_i32_iu8 || instr->opcode == aco_opcode::v_dot4_u32_u8 || + instr->opcode == aco_opcode::v_dot2_u32_u16) && i == 2; if (cannot_use_opsel) return;