From f3926c9d4edc8defe3e97a8d2f8c1b41cd6cc2b5 Mon Sep 17 00:00:00 2001 From: Georg Lehmann Date: Wed, 20 Nov 2024 09:40:55 +0100 Subject: [PATCH] aco/isel: use undef Operands for p_create_vector created from nir vecs Foz-DB Navi31: Totals from 27464 (34.59% of 79395) affected shaders: Instrs: 9595601 -> 9535260 (-0.63%); split: -0.63%, +0.00% CodeSize: 47900112 -> 47658648 (-0.50%); split: -0.50%, +0.00% Latency: 43928471 -> 43918448 (-0.02%); split: -0.05%, +0.02% InvThroughput: 4940105 -> 4903447 (-0.74%); split: -0.75%, +0.01% Copies: 667294 -> 604603 (-9.39%); split: -9.39%, +0.00% VALU: 5282264 -> 5219604 (-1.19%); split: -1.19%, +0.00% VOPD: 342 -> 311 (-9.06%) Reviewed-by: Rhys Perry Part-of: --- .../compiler/aco_instruction_selection.cpp | 57 ++++++++++++++----- src/amd/compiler/aco_util.h | 1 + 2 files changed, 44 insertions(+), 14 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 8781c9bf554..1c5c74321f9 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -1258,11 +1258,15 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr) if (instr->def.bit_size >= 32 || dst.type() == RegType::vgpr) { aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, instr->def.num_components, 1)}; - RegClass elem_rc = RegClass::get(RegType::vgpr, instr->def.bit_size / 8u); + RegClass elem_rc = RegClass::get(dst.type(), instr->def.bit_size / 8u); for (unsigned i = 0; i < num; ++i) { if (elems[i].type() == RegType::sgpr && elem_rc.is_subdword()) elems[i] = emit_extract_vector(ctx, elems[i], 0, elem_rc); - vec->operands[i] = Operand{elems[i]}; + + if (nir_src_is_undef(instr->src[i].src)) + vec->operands[i] = Operand{elem_rc}; + else + vec->operands[i] = Operand{elems[i]}; } vec->definitions[0] = Definition(dst); ctx->block->instructions.emplace_back(std::move(vec)); @@ -1273,16 +1277,20 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr) std::array packed; uint32_t const_vals[NIR_MAX_VEC_COMPONENTS] = {}; + bitarray32 undef_mask = UINT32_MAX; for (unsigned i = 0; i < num; i++) { unsigned packed_size = use_s_pack ? 16 : 32; unsigned idx = i * instr->def.bit_size / packed_size; unsigned offset = i * instr->def.bit_size % packed_size; + if (nir_src_is_undef(instr->src[i].src)) + continue; + else + undef_mask[idx] = false; + if (nir_src_is_const(instr->src[i].src)) { const_vals[idx] |= nir_src_as_uint(instr->src[i].src) << offset; continue; } - if (nir_src_is_undef(instr->src[i].src)) - continue; if (offset != packed_size - instr->def.bit_size) elems[i] = @@ -1313,7 +1321,9 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr) packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2], Operand::c32(const_vals[i * 2 + 1])); else - packed[i] = Temp(); /* Both constants, so reset the entry */ + packed[i] = Temp(0, s1); /* Both constants, so reset the entry */ + + undef_mask[i] = undef_mask[i * 2] && undef_mask[i * 2 + 1]; if (same) const_vals[i] = const_vals[i * 2] | (const_vals[i * 2 + 1] << 16); @@ -1326,11 +1336,11 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr) if (const_vals[i] && packed[i].id()) packed[i] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand::c32(const_vals[i]), packed[i]); - else if (!packed[i].id()) + else if (!packed[i].id() && !undef_mask[i]) packed[i] = bld.copy(bld.def(s1), Operand::c32(const_vals[i])); } - if (dst.size() == 1) + if (dst.size() == 1 && packed[0].id()) bld.copy(Definition(dst), packed[0]); else { aco_ptr vec{ @@ -3330,10 +3340,16 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr) ctx, dst, instr->op == nir_op_unpack_32_4x8 || instr->op == nir_op_unpack_64_4x16 ? 4 : 2); break; case nir_op_pack_64_2x32_split: { - Temp src0 = get_alu_src(ctx, instr->src[0]); - Temp src1 = get_alu_src(ctx, instr->src[1]); + Operand src[2]; + RegClass elem_rc = dst.regClass() == s2 ? s1 : v1; + for (unsigned i = 0; i < 2; i++) { + if (nir_src_is_undef(instr->src[i].src)) + src[i] = Operand(elem_rc); + else + src[i] = Operand(get_alu_src(ctx, instr->src[i])); + } - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src[0], src[1]); break; } case nir_op_unpack_64_2x32_split_x: @@ -3363,12 +3379,25 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr) } break; case nir_op_pack_32_2x16_split: { - Temp src0 = get_alu_src(ctx, instr->src[0]); - Temp src1 = get_alu_src(ctx, instr->src[1]); + Operand src0 = Operand(get_alu_src(ctx, instr->src[0])); + Operand src1 = Operand(get_alu_src(ctx, instr->src[1])); if (dst.regClass() == v1) { - src0 = emit_extract_vector(ctx, src0, 0, v2b); - src1 = emit_extract_vector(ctx, src1, 0, v2b); + if (nir_src_is_undef(instr->src[0].src)) + src0 = Operand(v2b); + else + src0 = Operand(emit_extract_vector(ctx, src0.getTemp(), 0, v2b)); + + if (nir_src_is_undef(instr->src[1].src)) + src1 = Operand(v2b); + else + src1 = Operand(emit_extract_vector(ctx, src1.getTemp(), 0, v2b)); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1); + } else if (nir_src_is_undef(instr->src[1].src)) { + bld.copy(Definition(dst), src0); + } else if (nir_src_is_undef(instr->src[0].src)) { + bld.pseudo(aco_opcode::p_insert, Definition(dst), bld.def(s1, scc), src1, Operand::c32(1), + Operand::c32(16)); } else if (ctx->program->gfx_level >= GFX9) { bld.sop2(aco_opcode::s_pack_ll_b32_b16, Definition(dst), src0, src1); } else { diff --git a/src/amd/compiler/aco_util.h b/src/amd/compiler/aco_util.h index 68a6c686408..31ef17cb2ca 100644 --- a/src/amd/compiler/aco_util.h +++ b/src/amd/compiler/aco_util.h @@ -1046,6 +1046,7 @@ template using bitfield_array64 = bitfield_array; using bitarray8 = bitfield_array; +using bitarray32 = bitfield_array; /* * Resizable array optimized for small lengths. If it's smaller than Size, the elements will be