diff --git a/.gitlab-ci/deqp-radv-navi10-aco-fails.txt b/.gitlab-ci/deqp-radv-navi10-aco-fails.txt index 1f4a94c4d9c..3c810f18c74 100644 --- a/.gitlab-ci/deqp-radv-navi10-aco-fails.txt +++ b/.gitlab-ci/deqp-radv-navi10-aco-fails.txt @@ -1,7 +1,3 @@ -# ACO specific issues. -dEQP-VK.transform_feedback.simple.multistreams_1 -dEQP-VK.transform_feedback.simple.multistreams_3 - dEQP-VK.rasterization.flatshading.line_strip_wide dEQP-VK.rasterization.flatshading.non_strict_line_strip_wide dEQP-VK.rasterization.flatshading.non_strict_lines_wide diff --git a/.gitlab-ci/deqp-radv-navi14-aco-fails.txt b/.gitlab-ci/deqp-radv-navi14-aco-fails.txt index 1f4a94c4d9c..3c810f18c74 100644 --- a/.gitlab-ci/deqp-radv-navi14-aco-fails.txt +++ b/.gitlab-ci/deqp-radv-navi14-aco-fails.txt @@ -1,7 +1,3 @@ -# ACO specific issues. -dEQP-VK.transform_feedback.simple.multistreams_1 -dEQP-VK.transform_feedback.simple.multistreams_3 - dEQP-VK.rasterization.flatshading.line_strip_wide dEQP-VK.rasterization.flatshading.non_strict_line_strip_wide dEQP-VK.rasterization.flatshading.non_strict_lines_wide diff --git a/.gitlab-ci/deqp-radv-raven-aco-fails.txt b/.gitlab-ci/deqp-radv-raven-aco-fails.txt index 1f4a94c4d9c..3c810f18c74 100644 --- a/.gitlab-ci/deqp-radv-raven-aco-fails.txt +++ b/.gitlab-ci/deqp-radv-raven-aco-fails.txt @@ -1,7 +1,3 @@ -# ACO specific issues. -dEQP-VK.transform_feedback.simple.multistreams_1 -dEQP-VK.transform_feedback.simple.multistreams_3 - dEQP-VK.rasterization.flatshading.line_strip_wide dEQP-VK.rasterization.flatshading.non_strict_line_strip_wide dEQP-VK.rasterization.flatshading.non_strict_lines_wide diff --git a/.gitlab-ci/deqp-radv-vega10-aco-fails.txt b/.gitlab-ci/deqp-radv-vega10-aco-fails.txt index 1f4a94c4d9c..3c810f18c74 100644 --- a/.gitlab-ci/deqp-radv-vega10-aco-fails.txt +++ b/.gitlab-ci/deqp-radv-vega10-aco-fails.txt @@ -1,7 +1,3 @@ -# ACO specific issues. 
-dEQP-VK.transform_feedback.simple.multistreams_1 -dEQP-VK.transform_feedback.simple.multistreams_3 - dEQP-VK.rasterization.flatshading.line_strip_wide dEQP-VK.rasterization.flatshading.non_strict_line_strip_wide dEQP-VK.rasterization.flatshading.non_strict_lines_wide diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py index 7fdff9baf83..e10358a1cc3 100644 --- a/src/amd/compiler/aco_builder_h.py +++ b/src/amd/compiler/aco_builder_h.py @@ -552,8 +552,9 @@ formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.prod ("vopc_e64", [Format.VOPC, Format.VOP3A], 'VOP3A_instruction', itertools.product([1, 2], [2])), ("flat", [Format.FLAT], 'FLAT_instruction', [(0, 3), (1, 2)]), ("global", [Format.GLOBAL], 'FLAT_instruction', [(0, 3), (1, 2)])] +formats = [(f if len(f) == 5 else f + ('',)) for f in formats] %>\\ -% for name, formats, struct, shapes in formats: +% for name, formats, struct, shapes, extra_field_setup in formats: % for num_definitions, num_operands in shapes: <% args = ['aco_opcode opcode'] @@ -581,6 +582,7 @@ formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.prod % endfor ${f.get_builder_initialization(num_operands)} % endfor + ${extra_field_setup} return insert(instr); } diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index e8fd15c293a..4d161cf0dbd 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -3539,6 +3539,7 @@ Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo *info, mubuf->barrier = info->barrier; mubuf->can_reorder = info->can_reorder; mubuf->offset = const_offset; + mubuf->swizzled = info->swizzle_component_size != 0; RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4)); Temp val = dst_hint.id() && rc == dst_hint.regClass() ? 
dst_hint : bld.tmp(rc); mubuf->definitions[0] = Definition(val); @@ -3990,7 +3991,8 @@ inline unsigned resolve_excess_vmem_const_offset(Builder &bld, Temp &voffset, un } void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata, - unsigned const_offset = 0u, bool allow_reorder = true, bool slc = false) + unsigned const_offset = 0u, bool allow_reorder = true, bool slc = false, + bool swizzled = false) { assert(vdata.id()); assert(vdata.size() != 3 || ctx->program->chip_class != GFX6); @@ -4003,8 +4005,9 @@ void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, T Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1); Operand soffset_op = soffset.id() ? Operand(soffset) : Operand(0u); Builder::Result r = bld.mubuf(op, Operand(descriptor), voffset_op, soffset_op, Operand(vdata), const_offset, - /* offen */ !voffset_op.isUndefined(), /* idxen*/ false, /* addr64 */ false, - /* disable_wqm */ false, /* glc */ true, /* dlc*/ false, /* slc */ slc); + /* offen */ !voffset_op.isUndefined(), /* swizzled */ swizzled, + /* idxen*/ false, /* addr64 */ false, /* disable_wqm */ false, /* glc */ true, + /* dlc*/ false, /* slc */ slc); static_cast(r.instr)->can_reorder = allow_reorder; } @@ -4026,7 +4029,7 @@ void store_vmem_mubuf(isel_context *ctx, Temp src, Temp descriptor, Temp voffset for (unsigned i = 0; i < write_count; i++) { unsigned const_offset = offsets[i] + base_const_offset; - emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, reorder, slc); + emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, reorder, slc, !allow_combining); } } @@ -4838,7 +4841,7 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) if (use_mubuf) { Instruction *mubuf = bld.mubuf(opcode, Definition(fetch_dst), list, fetch_index, soffset, - fetch_offset, false, true).instr; + fetch_offset, false, false, true).instr; 
static_cast(mubuf)->can_reorder = true; } else { Instruction *mtbuf = bld.mtbuf(opcode, @@ -6889,7 +6892,7 @@ void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) { for (unsigned i = 0; i < write_count; i++) { aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes()); - bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true); + bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true, true); } } @@ -10341,8 +10344,8 @@ static void write_tcs_tess_factors(isel_context *ctx) Temp control_word = bld.copy(bld.def(v1), Operand(0x80000000u)); bld.mubuf(aco_opcode::buffer_store_dword, /* SRSRC */ hs_ring_tess_factor, /* VADDR */ Operand(v1), /* SOFFSET */ tf_base, /* VDATA */ control_word, - /* immediate OFFSET */ 0, /* OFFEN */ false, /* idxen*/ false, /* addr64 */ false, - /* disable_wqm */ false, /* glc */ true); + /* immediate OFFSET */ 0, /* OFFEN */ false, /* swizzled */ false, /* idxen*/ false, + /* addr64 */ false, /* disable_wqm */ false, /* glc */ true); tf_const_offset += 4; begin_divergent_if_else(ctx, &ic_rel_patch_id_is_zero); diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h index 8adbd567c01..c1735a0d31a 100644 --- a/src/amd/compiler/aco_ir.h +++ b/src/amd/compiler/aco_ir.h @@ -1078,7 +1078,8 @@ struct MUBUF_instruction : public Instruction { bool lds : 1; /* Return read-data to LDS instead of VGPRs */ bool disable_wqm : 1; /* Require an exec mask without helper invocations */ bool can_reorder : 1; - uint8_t padding : 2; + bool swizzled:1; + uint8_t padding : 1; barrier_interaction barrier; }; static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 4, "Unexpected padding"); diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py index 89e30d734f6..46246cbea01 100644 --- a/src/amd/compiler/aco_opcodes.py +++ b/src/amd/compiler/aco_opcodes.py @@ -88,6 +88,7 @@ class Format(Enum): elif self == Format.MUBUF: return 
[('unsigned', 'offset', None), ('bool', 'offen', None), + ('bool', 'swizzled', 'false'), ('bool', 'idxen', 'false'), ('bool', 'addr64', 'false'), ('bool', 'disable_wqm', 'false'), diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index ab38821479e..f66521399ca 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -715,8 +715,11 @@ bool check_vop3_operands(opt_ctx& ctx, unsigned num_operands, Operand *operands) return true; } -bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp *base, uint32_t *offset) +bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp *base, uint32_t *offset, bool prevent_overflow) { + if (prevent_overflow) + return false; //TODO + Operand op = instr->operands[op_index]; if (!op.isTemp()) @@ -754,7 +757,7 @@ bool parse_base_offset(opt_ctx &ctx, Instruction* instr, unsigned op_index, Temp continue; uint32_t offset2 = 0; - if (parse_base_offset(ctx, add_instr, !i, base, &offset2)) { + if (parse_base_offset(ctx, add_instr, !i, base, &offset2, prevent_overflow)) { *offset += offset2; } else { *base = add_instr->operands[!i].getTemp(); @@ -927,6 +930,15 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) while (info.is_temp()) info = ctx.info[info.temp.id()]; + /* According to AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(), vaddr + * overflow for scratch accesses works only on GFX9+ and saddr overflow + * never works. Since swizzling is the only thing that separates + * scratch accesses and other accesses, and swizzling changes how + * addressing works significantly, this probably applies to swizzled + * MUBUF accesses. 
*/ + bool vaddr_prevent_overflow = mubuf->swizzled && ctx.program->chip_class < GFX9; + bool saddr_prevent_overflow = mubuf->swizzled; + if (mubuf->offen && i == 1 && info.is_constant_or_literal(32) && mubuf->offset + info.val < 4096) { assert(!mubuf->idxen); instr->operands[1] = Operand(v1); @@ -937,12 +949,14 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) instr->operands[2] = Operand((uint32_t) 0); mubuf->offset += info.val; continue; - } else if (mubuf->offen && i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == v1 && mubuf->offset + offset < 4096) { + } else if (mubuf->offen && i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset, vaddr_prevent_overflow) && + base.regClass() == v1 && mubuf->offset + offset < 4096) { assert(!mubuf->idxen); instr->operands[1].setTemp(base); mubuf->offset += offset; continue; - } else if (i == 2 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == s1 && mubuf->offset + offset < 4096) { + } else if (i == 2 && parse_base_offset(ctx, instr.get(), i, &base, &offset, saddr_prevent_overflow) && + base.regClass() == s1 && mubuf->offset + offset < 4096) { instr->operands[i].setTemp(base); mubuf->offset += offset; continue; @@ -957,7 +971,7 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) uint32_t offset; bool has_usable_ds_offset = ctx.program->chip_class >= GFX7; if (has_usable_ds_offset && - i == 0 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && + i == 0 && parse_base_offset(ctx, instr.get(), i, &base, &offset, false) && base.regClass() == instr->operands[i].regClass() && instr->opcode != aco_opcode::ds_swizzle_b32) { if (instr->opcode == aco_opcode::ds_write2_b32 || instr->opcode == aco_opcode::ds_read2_b32 || @@ -993,7 +1007,7 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) (ctx.program->chip_class >= GFX8 && info.val <= 0xFFFFF))) { instr->operands[i] = Operand(info.val); continue; - 
} else if (i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset) && base.regClass() == s1 && offset <= 0xFFFFF && ctx.program->chip_class >= GFX9) { + } else if (i == 1 && parse_base_offset(ctx, instr.get(), i, &base, &offset, true) && base.regClass() == s1 && offset <= 0xFFFFF && ctx.program->chip_class >= GFX9) { bool soe = smem->operands.size() >= (!smem->definitions.empty() ? 3 : 4); if (soe && (!ctx.info[smem->operands.back().tempId()].is_constant_or_literal(32) || diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp index c00f6466a7d..d472f63ec57 100644 --- a/src/amd/compiler/aco_spill.cpp +++ b/src/amd/compiler/aco_spill.cpp @@ -1566,9 +1566,9 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { split->definitions[i] = bld.def(v1); bld.insert(split); for (unsigned i = 0; i < temp.size(); i++) - bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, split->definitions[i].getTemp(), offset + i * 4, false); + bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, split->definitions[i].getTemp(), offset + i * 4, false, true); } else { - bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, temp, offset, false); + bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, temp, offset, false, true); } } else { ctx.program->config->spilled_sgprs += (*it)->operands[0].size(); @@ -1632,11 +1632,11 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { for (unsigned i = 0; i < def.size(); i++) { Temp tmp = bld.tmp(v1); vec->operands[i] = Operand(tmp); - bld.mubuf(opcode, Definition(tmp), scratch_rsrc, Operand(v1), scratch_offset, offset + i * 4, false); + bld.mubuf(opcode, Definition(tmp), scratch_rsrc, Operand(v1), scratch_offset, offset + i * 4, false, true); } bld.insert(vec); } else { - bld.mubuf(opcode, def, scratch_rsrc, Operand(v1), scratch_offset, offset, false); + bld.mubuf(opcode, def, scratch_rsrc, Operand(v1), scratch_offset, offset, false, true); } } else { uint32_t 
spill_slot = slots[spill_id];