From c903f33b3687b6561d80f247653c071ccd5b8198 Mon Sep 17 00:00:00 2001 From: Gert Wollny Date: Tue, 19 Aug 2025 15:46:12 +0200 Subject: [PATCH] r600/sfn: preloading sources for fp64 ops with common code path These ops require more than three source values that could result in readport conflicts, so check the readport configuration before issuing alu operations and inject copy operations when conflicts are identified. Signed-off-by: Gert Wollny Part-of: --- .../drivers/r600/sfn/sfn_instr_alu.cpp | 90 +++++++++++++++---- 1 file changed, 71 insertions(+), 19 deletions(-) diff --git a/src/gallium/drivers/r600/sfn/sfn_instr_alu.cpp b/src/gallium/drivers/r600/sfn/sfn_instr_alu.cpp index 3f7e6d7bfdb..d62808b277c 100644 --- a/src/gallium/drivers/r600/sfn/sfn_instr_alu.cpp +++ b/src/gallium/drivers/r600/sfn/sfn_instr_alu.cpp @@ -2058,6 +2058,56 @@ emit_alu_fsat64(const nir_alu_instr& alu, Shader& shader) return true; } +using PreloadSources = std::array, 4>; + +void +preload_sources(const nir_alu_instr& alu, + Shader& shader, + PreloadSources& src, + unsigned distinct_slopts_per_instructions, + unsigned nsrc) +{ + auto& value_factory = shader.value_factory(); + + AluReadportReservation rr_all; + int chan_used[4] = {0}; + + for (unsigned i = 0; i < alu.def.num_components; ++i) { + for (unsigned slot = 0; slot < distinct_slopts_per_instructions; slot++) { + unsigned row = slot + i * distinct_slopts_per_instructions; + for (unsigned col = 0; col < nsrc; col++) { + src[row][col] = value_factory.src64(alu.src[col], 0, slot); + if (auto r = src[row][col]->as_register()) + ++chan_used[r->chan()]; + } + + while (1) { + AluReadportReservation rr = rr_all; + unsigned failing_source = nsrc; + for (auto bs = alu_vec_012; bs != alu_vec_unknown; ++bs) { + failing_source = rr_all.schedule_vec_src(src[row], nsrc, bs); + if (failing_source == nsrc) + break; + } + if (failing_source == nsrc) { + rr_all = rr; + break; + } + + int new_chan = 0; + for (int i = 0; i < 4; ++i) { + if (chan_used[i] < 3) { + new_chan = i; + break; + } + } + src[row][failing_source] = + shader.emit_load_to_register(src[row][failing_source], new_chan); + } + } + } +} + static bool emit_alu_op2_64bit(const nir_alu_instr& alu, EAluOp opcode, Shader& shader) { @@ -2067,13 +2117,8 @@ emit_alu_op2_64bit(const nir_alu_instr& alu, EAluOp opcode, Shader& shader) int num_emit0 = opcode == op2_mul_64 ? 3 : 1; - std::array,2> tmp; - for (unsigned k = 0; k < alu.def.num_components; ++k) { - tmp[k][0] = shader.emit_load_to_register(value_factory.src64(alu.src[0], k, 1), 0); - tmp[k][1] = shader.emit_load_to_register(value_factory.src64(alu.src[1], k, 1), 1); - tmp[k][2] = shader.emit_load_to_register(value_factory.src64(alu.src[0], k, 0), 2); - tmp[k][3] = shader.emit_load_to_register(value_factory.src64(alu.src[1], k, 0), 3); - } + PreloadSources tmp; + preload_sources(alu, shader, tmp, 2, 2); assert(num_emit0 == 1 || alu.def.num_components == 1); @@ -2085,8 +2130,8 @@ emit_alu_op2_64bit(const nir_alu_instr& alu, EAluOp opcode, Shader& shader) ir = new AluInstr(opcode, dest, - tmp[k][0], - tmp[k][1], + tmp[2 * k + 1][0], + tmp[2 * k + 1][1], i < 2 ? AluInstr::write : AluInstr::empty); group->add_instruction(ir); } @@ -2096,8 +2141,8 @@ emit_alu_op2_64bit(const nir_alu_instr& alu, EAluOp opcode, Shader& shader) ir = new AluInstr(opcode, dest, - tmp[k][2], - tmp[k][3], + tmp[2 * k][0], + tmp[2 * k][1], i == 1 ? AluInstr::write : AluInstr::empty); group->add_instruction(ir); } @@ -2120,16 +2165,18 @@ emit_alu_op2_64bit_one_dst(const nir_alu_instr& alu, order[1] = 0; } - AluInstr::SrcValues src(4); + PreloadSources tmp; + preload_sources(alu, shader, tmp, 2, 2); auto chan_mask = get_dest_mask(opcode, 2, false); for (unsigned k = 0; k < alu.def.num_components; ++k) { auto dest = value_factory.dest(alu.def, 2 * k, pin_free, chan_mask); - src[0] = value_factory.src64(alu.src[order[0]], k, 1); - src[1] = value_factory.src64(alu.src[order[1]], k, 1); - src[2] = value_factory.src64(alu.src[order[0]], k, 0); - src[3] = value_factory.src64(alu.src[order[1]], k, 0); + + AluInstr::SrcValues src(4); + for (unsigned i = 0; i < 2; ++i) + for (unsigned j = 0; j < 2; ++j) + src[j + 2 * i] = tmp[1 - i + 2 * k][order[j]]; ir = new AluInstr(opcode, dest, src, AluInstr::write, 2); ir->set_alu_flag(alu_64bit_op); @@ -2168,6 +2215,11 @@ emit_alu_fma_64bit(const nir_alu_instr& alu, EAluOp opcode, Shader& shader) auto& value_factory = shader.value_factory(); auto group = new AluGroup(); AluInstr *ir = nullptr; + + std::array, 4> src; + + preload_sources(alu, shader, src, 2, 3); + for (unsigned i = 0; i < 4; ++i) { int chan = i < 3 ? 1 : 0; @@ -2176,9 +2228,9 @@ emit_alu_fma_64bit(const nir_alu_instr& alu, EAluOp opcode, Shader& shader) ir = new AluInstr(opcode, dest, - value_factory.src64(alu.src[0], 0, chan), - value_factory.src64(alu.src[1], 0, chan), - value_factory.src64(alu.src[2], 0, chan), + src[chan][0], + src[chan][1], + src[chan][2], i < 2 ? AluInstr::write : AluInstr::empty); if (!group->add_instruction(ir)) UNREACHABLE("Unable to emit group, likely because of a readport conflict\n");