diff --git a/.gitlab-ci/deqp-radv-navi10-aco-fails.txt b/.gitlab-ci/deqp-radv-navi10-aco-fails.txt index 1f4a94c4d9c..3c810f18c74 100644 --- a/.gitlab-ci/deqp-radv-navi10-aco-fails.txt +++ b/.gitlab-ci/deqp-radv-navi10-aco-fails.txt @@ -1,7 +1,3 @@ -# ACO specific issues. -dEQP-VK.transform_feedback.simple.multistreams_1 -dEQP-VK.transform_feedback.simple.multistreams_3 - dEQP-VK.rasterization.flatshading.line_strip_wide dEQP-VK.rasterization.flatshading.non_strict_line_strip_wide dEQP-VK.rasterization.flatshading.non_strict_lines_wide diff --git a/.gitlab-ci/deqp-radv-navi14-aco-fails.txt b/.gitlab-ci/deqp-radv-navi14-aco-fails.txt index 1f4a94c4d9c..3c810f18c74 100644 --- a/.gitlab-ci/deqp-radv-navi14-aco-fails.txt +++ b/.gitlab-ci/deqp-radv-navi14-aco-fails.txt @@ -1,7 +1,3 @@ -# ACO specific issues. -dEQP-VK.transform_feedback.simple.multistreams_1 -dEQP-VK.transform_feedback.simple.multistreams_3 - dEQP-VK.rasterization.flatshading.line_strip_wide dEQP-VK.rasterization.flatshading.non_strict_line_strip_wide dEQP-VK.rasterization.flatshading.non_strict_lines_wide diff --git a/.gitlab-ci/deqp-radv-raven-aco-fails.txt b/.gitlab-ci/deqp-radv-raven-aco-fails.txt index 1f4a94c4d9c..3c810f18c74 100644 --- a/.gitlab-ci/deqp-radv-raven-aco-fails.txt +++ b/.gitlab-ci/deqp-radv-raven-aco-fails.txt @@ -1,7 +1,3 @@ -# ACO specific issues. -dEQP-VK.transform_feedback.simple.multistreams_1 -dEQP-VK.transform_feedback.simple.multistreams_3 - dEQP-VK.rasterization.flatshading.line_strip_wide dEQP-VK.rasterization.flatshading.non_strict_line_strip_wide dEQP-VK.rasterization.flatshading.non_strict_lines_wide diff --git a/.gitlab-ci/deqp-radv-vega10-aco-fails.txt b/.gitlab-ci/deqp-radv-vega10-aco-fails.txt index 1f4a94c4d9c..3c810f18c74 100644 --- a/.gitlab-ci/deqp-radv-vega10-aco-fails.txt +++ b/.gitlab-ci/deqp-radv-vega10-aco-fails.txt @@ -1,7 +1,3 @@ -# ACO specific issues. 
-dEQP-VK.transform_feedback.simple.multistreams_1 -dEQP-VK.transform_feedback.simple.multistreams_3 - dEQP-VK.rasterization.flatshading.line_strip_wide dEQP-VK.rasterization.flatshading.non_strict_line_strip_wide dEQP-VK.rasterization.flatshading.non_strict_lines_wide diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py index 7fdff9baf83..e10358a1cc3 100644 --- a/src/amd/compiler/aco_builder_h.py +++ b/src/amd/compiler/aco_builder_h.py @@ -552,8 +552,9 @@ formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.prod ("vopc_e64", [Format.VOPC, Format.VOP3A], 'VOP3A_instruction', itertools.product([1, 2], [2])), ("flat", [Format.FLAT], 'FLAT_instruction', [(0, 3), (1, 2)]), ("global", [Format.GLOBAL], 'FLAT_instruction', [(0, 3), (1, 2)])] +formats = [(f if len(f) == 5 else f + ('',)) for f in formats] %>\\ -% for name, formats, struct, shapes in formats: +% for name, formats, struct, shapes, extra_field_setup in formats: % for num_definitions, num_operands in shapes: <% args = ['aco_opcode opcode'] @@ -581,6 +582,7 @@ formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.prod % endfor ${f.get_builder_initialization(num_operands)} % endfor + ${extra_field_setup} return insert(instr); } diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index e8fd15c293a..4d161cf0dbd 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -3539,6 +3539,7 @@ Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo *info, mubuf->barrier = info->barrier; mubuf->can_reorder = info->can_reorder; mubuf->offset = const_offset; + mubuf->swizzled = info->swizzle_component_size != 0; RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4)); Temp val = dst_hint.id() && rc == dst_hint.regClass() ? 
dst_hint : bld.tmp(rc); mubuf->definitions[0] = Definition(val); @@ -3990,7 +3991,8 @@ inline unsigned resolve_excess_vmem_const_offset(Builder &bld, Temp &voffset, un } void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata, - unsigned const_offset = 0u, bool allow_reorder = true, bool slc = false) + unsigned const_offset = 0u, bool allow_reorder = true, bool slc = false, + bool swizzled = false) { assert(vdata.id()); assert(vdata.size() != 3 || ctx->program->chip_class != GFX6); @@ -4003,8 +4005,9 @@ void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, T Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1); Operand soffset_op = soffset.id() ? Operand(soffset) : Operand(0u); Builder::Result r = bld.mubuf(op, Operand(descriptor), voffset_op, soffset_op, Operand(vdata), const_offset, - /* offen */ !voffset_op.isUndefined(), /* idxen*/ false, /* addr64 */ false, - /* disable_wqm */ false, /* glc */ true, /* dlc*/ false, /* slc */ slc); + /* offen */ !voffset_op.isUndefined(), /* swizzled */ swizzled, + /* idxen*/ false, /* addr64 */ false, /* disable_wqm */ false, /* glc */ true, + /* dlc*/ false, /* slc */ slc); static_cast(r.instr)->can_reorder = allow_reorder; } @@ -4026,7 +4029,7 @@ void store_vmem_mubuf(isel_context *ctx, Temp src, Temp descriptor, Temp voffset for (unsigned i = 0; i < write_count; i++) { unsigned const_offset = offsets[i] + base_const_offset; - emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, reorder, slc); + emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, reorder, slc, !allow_combining); } } @@ -4838,7 +4841,7 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) if (use_mubuf) { Instruction *mubuf = bld.mubuf(opcode, Definition(fetch_dst), list, fetch_index, soffset, - fetch_offset, false, true).instr; + fetch_offset, false, false, true).instr; 
static_cast(mubuf)->can_reorder = true; } else { Instruction *mtbuf = bld.mtbuf(opcode, @@ -6889,7 +6892,7 @@ void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) { for (unsigned i = 0; i < write_count; i++) { aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes()); - bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true); + bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true, true); } } @@ -10341,8 +10344,8 @@ static void write_tcs_tess_factors(isel_context *ctx) Temp control_word = bld.copy(bld.def(v1), Operand(0x80000000u)); bld.mubuf(aco_opcode::buffer_store_dword, /* SRSRC */ hs_ring_tess_factor, /* VADDR */ Operand(v1), /* SOFFSET */ tf_base, /* VDATA */ control_word, - /* immediate OFFSET */ 0, /* OFFEN */ false, /* idxen*/ false, /* addr64 */ false, - /* disable_wqm */ false, /* glc */ true); + /* immediate OFFSET */ 0, /* OFFEN */ false, /* swizzled */ false, /* idxen*/ false, + /* addr64 */ false, /* disable_wqm */ false, /* glc */ true); tf_const_offset += 4; begin_divergent_if_else(ctx, &ic_rel_patch_id_is_zero); diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 8adbd567c01..c1735a0d31a 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -1078,7 +1078,8 @@ struct MUBUF_instruction : public Instruction { bool lds : 1; /* Return read-data to LDS instead of VGPRs */ bool disable_wqm : 1; /* Require an exec mask without helper invocations */ bool can_reorder : 1; - uint8_t padding : 2; + bool swizzled:1; + uint8_t padding : 1; barrier_interaction barrier; }; static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 4, "Unexpected padding"); diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py index 89e30d734f6..46246cbea01 100644 --- a/src/amd/compiler/aco_opcodes.py +++ b/src/amd/compiler/aco_opcodes.py @@ -88,6 +88,7 @@ class Format(Enum): elif self == Format.MUBUF: return 
[('unsigned', 'offset', None), ('bool', 'offen', None), + ('bool', 'swizzled', 'false'), ('bool', 'idxen', 'false'), ('bool', 'addr64', 'false'), ('bool', 'disable_wqm', 'false'), diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index ab38821479e..f66521399ca 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -715,8 +715,11 @@ bool check_vop3_operands(opt_ctx& ctx, unsigned num_operands, Operand *operands) return true; } -bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp *base, uint32_t *offset) +bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp *base, uint32_t *offset, bool prevent_overflow) { + if (prevent_overflow) + return false; //TODO + Operand op = instr->operands[op_index]; if (!op.isTemp()) @@ -754,7 +757,7 @@ bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp continue; uint32_t offset2 = 0; - if (parse_base_offset(ctx, add_instr, !i, base, &offset2)) { + if (parse_base_offset(ctx, add_instr, !i, base, &offset2, prevent_overflow)) { *offset += offset2; } else { *base = add_instr->operands[!i].getTemp(); @@ -927,6 +930,15 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) while (info.is_temp()) info = ctx.info[info.temp.id()]; + /* According to AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(), vaddr + * overflow for scratch accesses works only on GFX9+ and saddr overflow + * never works. Since swizzling is the only thing that separates + * scratch accesses and other accesses, and swizzling changes how + * addressing works significantly, this probably applies to swizzled + * MUBUF accesses. 
*/ + bool vaddr_prevent_overflow = mubuf->swizzled && ctx.program->chip_class < GFX9; + bool saddr_prevent_overflow = mubuf->swizzled; + if (mubuf->offen && i == 1 && info.is_constant_or_literal(32) && mubuf->offset + info.val < 4096) { assert(!mubuf->idxen); instr->operands[1] = Operand(v1); @@ -937,12 +949,14 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) instr->operands[2] = Operand((uint32_t) 0); mubuf->offset += info.val; continue; - } else if (mubuf->offen && i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == v1 && mubuf->offset + offset < 4096) { + } else if (mubuf->offen && i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset, vaddr_prevent_overflow) && + base.regClass() == v1 && mubuf->offset + offset < 4096) { assert(!mubuf->idxen); instr->operands[1].setTemp(base); mubuf->offset += offset; continue; - } else if (i == 2 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == s1 && mubuf->offset + offset < 4096) { + } else if (i == 2 && parse_base_offset(ctx, instr.get(), i, &base, &offset, saddr_prevent_overflow) && + base.regClass() == s1 && mubuf->offset + offset < 4096) { instr->operands[i].setTemp(base); mubuf->offset += offset; continue; @@ -957,7 +971,7 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) uint32_t offset; bool has_usable_ds_offset = ctx.program->chip_class >= GFX7; if (has_usable_ds_offset && - i == 0 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && + i == 0 && parse_base_offset(ctx, instr.get(), i, &base, &offset, false) && base.regClass() == instr->operands[i].regClass() && instr->opcode != aco_opcode::ds_swizzle_b32) { if (instr->opcode == aco_opcode::ds_write2_b32 || instr->opcode == aco_opcode::ds_read2_b32 || @@ -993,7 +1007,7 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) (ctx.program->chip_class >= GFX8 && info.val <= 0xFFFFF))) { instr->operands[i] = Operand(info.val); continue; - 
} else if (i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == s1 && offset <= 0xFFFFF && ctx.program->chip_class >= GFX9) { + } else if (i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset, true) && base.regClass() == s1 && offset <= 0xFFFFF && ctx.program->chip_class >= GFX9) { bool soe = smem->operands.size() >= (!smem->definitions.empty() ? 3 : 4); if (soe && (!ctx.info[smem->operands.back().tempId()].is_constant_or_literal(32) || diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp index c00f6466a7d..d472f63ec57 100644 --- a/src/amd/compiler/aco_spill.cpp +++ b/src/amd/compiler/aco_spill.cpp @@ -1566,9 +1566,9 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { split->definitions[i] = bld.def(v1); bld.insert(split); for (unsigned i = 0; i < temp.size(); i++) - bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, split->definitions[i].getTemp(), offset + i * 4, false); + bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, split->definitions[i].getTemp(), offset + i * 4, false, true); } else { - bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, temp, offset, false); + bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, temp, offset, false, true); } } else { ctx.program->config->spilled_sgprs += (*it)->operands[0].size(); @@ -1632,11 +1632,11 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { for (unsigned i = 0; i < def.size(); i++) { Temp tmp = bld.tmp(v1); vec->operands[i] = Operand(tmp); - bld.mubuf(opcode, Definition(tmp), scratch_rsrc, Operand(v1), scratch_offset, offset + i * 4, false); + bld.mubuf(opcode, Definition(tmp), scratch_rsrc, Operand(v1), scratch_offset, offset + i * 4, false, true); } bld.insert(vec); } else { - bld.mubuf(opcode, def, scratch_rsrc, Operand(v1), scratch_offset, offset, false); + bld.mubuf(opcode, def, scratch_rsrc, Operand(v1), scratch_offset, offset, false, true); } } else { uint32_t 
spill_slot = slots[spill_id];