VERSION: bump for 26.0.5

docs: add release notes for 26.0.5
ir3/ra: fix copy-paste error
2026-05-09 02:28:10 +02:00 · 2026-04-15 16:20:41 +02:00 · 2026-04-15 16:20:41 +02:00 · 2026-04-14 15:27:47 +02:00 · 2026-04-14 15:27:47 +02:00 · 2026-04-14 15:27:47 +02:00
529 changed files with 50927 additions and 5656 deletions
--- a/.ci-farms-disabled/lima
+++ b/.ci-farms-disabled/lima
--- a/.clang-format
+++ b/.clang-format
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -241,6 +241,7 @@ include:
    # changed, else we'll just use the already-built containers
    - if: *is-merge-attempt
      changes: &image_tags_path
        - .gitlab-ci.yml
        - .gitlab-ci/image-tags.yml
      when: on_success
    # Skip everything for pre-merge and merge pipelines which don't change
--- a/.gitlab-ci/build/gitlab-ci.yml
+++ b/.gitlab-ci/build/gitlab-ci.yml
@@ -774,7 +774,7 @@ debian-riscv64:
 # While s390 is dead, s390x is very much alive, and one of the last major
 # big-endian platforms, so it provides useful coverage.
 # In case of issues with this job, contact @ajax
-debian-s390x:
+.debian-s390x:
  extends:
    - .meson-cross
    - .use-debian/s390x_build
@@ -789,7 +789,7 @@ debian-s390x:
    DRI_LOADERS:
      -D glvnd=disabled
-debian-ppc64el:
+.debian-ppc64el:
  extends:
    - .meson-cross
    - .use-debian/ppc64el_build
--- a/.gitlab-ci/run-shader-db.sh
+++ b/.gitlab-ci/run-shader-db.sh
@@ -14,7 +14,7 @@ export LD_LIBRARY_PATH=$LIBDIR
 cd /usr/local/shader-db
-for driver in freedreno intel lima v3d vc4; do
+for driver in freedreno lima v3d vc4; do
    section_start shader-db-${driver} "Running shader-db for $driver"
    env LD_PRELOAD="$LIBDIR/lib${driver}_noop_drm_shim.so" \
        ./run -j"${FDO_CI_CONCURRENT:-4}" ./shaders \
--- a/.pick_status.json
+++ b/.pick_status.json
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-26.0.0-devel
+26.0.5
--- a/bin/gen_release_notes.py
+++ b/bin/gen_release_notes.py
@@ -385,5 +385,5 @@ async def main() -> None:
 if __name__ == "__main__":
-    loop = asyncio.get_event_loop()
+    loop = asyncio.new_event_loop()
    loop.run_until_complete(main())
--- a/bin/pick-ui.py
+++ b/bin/pick-ui.py
@@ -27,7 +27,9 @@ from pick.ui import UI, PALETTE
 if __name__ == "__main__":
    u = UI()
-    evl = urwid.AsyncioEventLoop(loop=asyncio.new_event_loop())
+    asyncio_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(asyncio_loop)
    evl = urwid.AsyncioEventLoop(loop=asyncio_loop)
    loop = urwid.MainLoop(u.render(), PALETTE, event_loop=evl, handle_mouse=False)
    u.mainloop = loop
    loop.run()
--- a/bin/pick/core.py
+++ b/bin/pick/core.py
@@ -52,7 +52,7 @@ IS_FIX = re.compile(r'^\s*fixes:\s*([a-f0-9]{6,40})', flags=re.MULTILINE | re.IG
 IS_CC = re.compile(r'^\s*cc:\s*["\']?([0-9]{2}\.[0-9])?["\']?\s*["\']?([0-9]{2}\.[0-9])?["\']?\s*\<?mesa-stable',
                   flags=re.MULTILINE | re.IGNORECASE)
 IS_REVERT = re.compile(r'This reverts commit ([0-9a-f]{40})')
-IS_BACKPORT = re.compile(r'^\s*backport-to:\s*(\d{2}\.\d),?\s*(\d{2}\.\d)?',
+IS_BACKPORT = re.compile(r'^\s*backport-to:\s*(?:(\d{2}\.\d),?\s*(\d{2}\.\d)?|(\*))',
                         flags=re.MULTILINE | re.IGNORECASE)
 # XXX: hack
@@ -295,7 +295,7 @@ async def resolve_nomination(commit: 'Commit', version: str) -> 'Commit':
    if backport_to := IS_BACKPORT.findall(commit_message):
        for match in backport_to:
-            if any(Version(version) >= Version(backport_version)
+            if any(backport_version == '*' or Version(version) >= Version(backport_version)
                   for backport_version in match if backport_version != ''):
                commit.nominated = True
                commit.nomination_type = NominationType.BACKPORT
--- a/bin/pick/core_test.py
+++ b/bin/pick/core_test.py
@@ -263,7 +263,7 @@ class TestRE:
            """)
            backport_to = core.IS_BACKPORT.findall(message)
-            assert backport_to == [('19.2', '')]
+            assert backport_to == [('19.2', '', '')]
        def test_multiple_release_space(self):
            """Tests commit with more than one branch specified"""
@@ -278,7 +278,7 @@ class TestRE:
            """)
            backport_to = core.IS_BACKPORT.findall(message)
-            assert backport_to == [('19.1', '19.2')]
+            assert backport_to == [('19.1', '19.2', '')]
        def test_multiple_release_comma(self):
            """Tests commit with more than one branch specified"""
@@ -293,7 +293,7 @@ class TestRE:
            """)
            backport_to = core.IS_BACKPORT.findall(message)
-            assert backport_to == [('19.1', '19.2')]
+            assert backport_to == [('19.1', '19.2', '')]
        def test_multiple_release_lines(self):
            """Tests commit with more than one branch specified in mulitple tags"""
@@ -305,7 +305,7 @@ class TestRE:
            """)
            backport_to = core.IS_BACKPORT.findall(message)
-            assert backport_to == [('19.0', ''), ('19.1', '19.2')]
+            assert backport_to == [('19.0', '', ''), ('19.1', '19.2', '')]
 class TestResolveNomination:
@@ -405,6 +405,17 @@ class TestResolveNomination:
        assert c.nominated
        assert c.nomination_type is core.NominationType.BACKPORT
    @pytest.mark.asyncio
    async def test_backport_all_is_nominated(self):
        s = self.FakeSubprocess(b'Backport-to: *')
        c = core.Commit('abcdef1234567890', 'a commit')
        with mock.patch('bin.pick.core.asyncio.create_subprocess_exec', s.mock):
            await core.resolve_nomination(c, '0.0')
        assert c.nominated
        assert c.nomination_type is core.NominationType.BACKPORT
    @pytest.mark.asyncio
    async def test_backport_is_nominated_after(self):
        s = self.FakeSubprocess(b'Backport-to: 16.2')
--- a/bin/pick/requirements.txt
+++ b/bin/pick/requirements.txt
@@ -1,3 +1,3 @@
-attrs==23.1.0
+attrs==25.4.0
-packaging==25.0
+packaging==26.0
-urwid==2.1.2
+urwid==3.0.3
--- a/bin/pick/ui.py
+++ b/bin/pick/ui.py
@@ -224,6 +224,7 @@ class UI:
            if commit.nominated and commit.resolution is core.Resolution.UNRESOLVED:
                b = urwid.AttrMap(CommitWidget(self, commit), None, focus_map='reversed')
                self.commit_list.append(b)
        self.mainloop.draw_screen()
        self.save()
    async def feedback(self, text: str) -> None:
@@ -236,6 +237,7 @@ class UI:
            if c.base_widget is commit:
                del self.commit_list[i]
                break
        self.mainloop.draw_screen()
    def save(self):
        core.save(itertools.chain(self.new_commits, self.previous_commits))
@@ -246,6 +248,7 @@ class UI:
        def reset_cb(_) -> None:
            self.mainloop.widget = o
            self.mainloop.draw_screen()
        async def apply_cb(edit: urwid.Edit) -> None:
            text: str = edit.get_edit_text()
@@ -263,6 +266,7 @@ class UI:
                raise RuntimeError(f"Couldn't find {sha}")
            await commit.apply(self)
            self.mainloop.draw_screen()
        q = urwid.Edit("Commit sha\n")
        ok_btn = urwid.Button('Ok')
@@ -279,12 +283,14 @@ class UI:
        self.mainloop.widget = urwid.Overlay(
            urwid.Filler(box), o, 'center', ('relative', 50), 'middle', ('relative', 50)
        )
        self.mainloop.draw_screen()
    def chp_failed(self, commit: 'CommitWidget', err: str) -> None:
        o = self.mainloop.widget
        def reset_cb(_) -> None:
            self.mainloop.widget = o
            self.mainloop.draw_screen()
        t = urwid.Text(textwrap.dedent(f"""
            Failed to apply {commit.commit.sha} {commit.commit.description} with the following error:
@@ -313,3 +319,4 @@ class UI:
        self.mainloop.widget = urwid.Overlay(
            urwid.Filler(box), o, 'center', ('relative', 50), 'middle', ('relative', 50)
        )
        self.mainloop.draw_screen()
--- a/docs/relnotes.rst
+++ b/docs/relnotes.rst
@@ -3,6 +3,12 @@ Release Notes
 The release notes summarize what's new or changed in each Mesa release.
 -  :doc:`26.0.5 release notes <relnotes/26.0.5>`
 -  :doc:`26.0.4 release notes <relnotes/26.0.4>`
 -  :doc:`26.0.3 release notes <relnotes/26.0.3>`
 -  :doc:`26.0.2 release notes <relnotes/26.0.2>`
 -  :doc:`26.0.1 release notes <relnotes/26.0.1>`
 -  :doc:`26.0.0 release notes <relnotes/26.0.0>`
 -  :doc:`25.3.3 release notes <relnotes/25.3.3>`
 -  :doc:`25.3.2 release notes <relnotes/25.3.2>`
 -  :doc:`25.2.8 release notes <relnotes/25.2.8>`
@@ -473,6 +479,12 @@ The release notes summarize what's new or changed in each Mesa release.
   :maxdepth: 1
   :hidden:
   26.0.5 <relnotes/26.0.5>
   26.0.4 <relnotes/26.0.4>
   26.0.3 <relnotes/26.0.3>
   26.0.2 <relnotes/26.0.2>
   26.0.1 <relnotes/26.0.1>
   26.0.0 <relnotes/26.0.0>
   25.3.3 <relnotes/25.3.3>
   25.3.2 <relnotes/25.3.2>
   25.2.8 <relnotes/25.2.8>
--- a/docs/relnotes/26.0.0.rst
+++ b/docs/relnotes/26.0.0.rst
--- a/docs/relnotes/26.0.1.rst
+++ b/docs/relnotes/26.0.1.rst
@@ -0,0 +1,247 @@
 Mesa 26.0.1 Release Notes / 2026-02-25
 ======================================
 Mesa 26.0.1 is a bug fix release which fixes bugs found since the 26.0.0 release.
 Mesa 26.0.1 implements the OpenGL 4.6 API, but the version reported by
 glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
 glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
 Some drivers don't support all the features required in OpenGL 4.6. OpenGL
 4.6 is **only** available if requested at context creation.
 Compatibility contexts may report a lower version depending on each driver.
 Mesa 26.0.1 implements the Vulkan 1.4 API, but the version reported by
 the apiVersion property of the VkPhysicalDeviceProperties struct
 depends on the particular driver being used.
 SHA checksums
 -------------
 ::
    SHA256: bb5104f9f9a46c9b5175c24e601e0ef1ab44ce2d0fdbe81548b59adc8b385dcc  mesa-26.0.1.tar.xz
    SHA512: d47072257035acfa8a5594c0cda831b4e5178169dea8a06c6657268a441e32271f8798486e837cea23f35ce3f0b4b9520a4ea4ed26b0e1267b02da4c649bc9f9  mesa-26.0.1.tar.xz
 New features
 ------------
 - None
 Bug fixes
 ---------
 - Missing Haswell case after a097a3d214eda7fb7b9ff63176754b7260e09e03 leads to bogus assert in intel_perf_mdapi.c
 - Question: Does building Lavapipe on Windows require building "microsoft-experimental" as well?
 - [ANV]: Regression in dxvk Greedfall
 - [ANV][BMG] Building Mesa with Clang causes Missing Skin Textures in UE games - Tekken 8
 - [ANV][DG2][Regression]: Flickering water "boxes" in Civilization VII
 - [RADV] Killer7 has a blue tint with RDNA3/4
 - [bisected] Xe3 regression with piglit tess/barrier-patch.shader_test after cmod prop change
 - [radeonsi] Regression: GL_FEEDBACK returns 0.0 for X-coordinates (Legacy GL)
 - anv, bisected: Genshin Impact wrong shadows, flickering grass
 - turnip: llama.cpp: Running test-backend-ops results in segmentation fault
 - venus crashes in vn_CreateDevice() with latest mesa/main [bisected]
 Changes
 -------
 Aitor Camacho (7):
 - wsi/metal: Expose additional color spaces if instance extension enabled
 - kk: Fill pipelineUUID
 - kk: Fix shader uint32_t value serialization
 - kk: Correctly release pipeline handles at shader destroy
 - kk: Fix compute pipeline cache
 - kk: Move gfx pipeline data to the info struct within kk_shader
 - kk: Fix graphics pipeline serialization
 Alyssa Rosenzweig (1):
 - brw: drop buggy SLM optimization
 Anna Maniscalco (1):
 - freedreno/common: set has_astc_hdr true for a7xx targets
 Benjamin Otte (1):
 - lavapipe: Fix features for nonsubsampled ycbcr formats
 Daniel Schürmann (1):
 - nir/clone: Fix cloning indirect call instructions
 Danylo Piliaiev (1):
 - ir3: Align TCS per-patch output to 64 bytes to prevent stale reads
 Emma Anholt (1):
 - ir3/ra: Fix DOUBLE_ONLY limit pressure computation.
 Eric Engestrom (5):
 - docs: add sha sum for 26.0.0
 - .pick_status.json: Update to 03d2cc2b2ae5341409ee1fab74e98134a6df0511
 - bin/gen_release_notes: fix support for python 3.14
 - pick-ui: add \`Backport-to: \*` as a synonym to \`Cc: mesa-stable`
 - .pick_status.json: Mark 7dd7731ac710b0c7213f6bb466b33f62eca80604 as denominated
 Faith Ekstrand (6):
 - pan/clear: Stop packing undefined bits in colors
 - nir/gather_info: Add support for panfrost tile load/store intrinsics
 - panvk: Create both Z/S descriptors, even for separate Z/S
 - panvk/preload: Stop assuming 32 registers
 - panvk/jm: Refactor BeginRendering()
 - panvk: Also load output attachments with LOAD_OP_NONE+STORE_OP_NONE
 Frank Binns (2):
 - pvr/ci: move some timing out tests from fails to skips
 - pvr: Fix alloc callbacks usage when freeing frame buffers
 Ian Romanick (8):
 - spirv: Use STACK_ARRAY instead of NIR_VLA
 - nir: Use STACK_ARRAY instead of NIR_VLA
 - brw: Call nir_opt_algebraic_late in brw_nir_create_raygen_trampoline
 - brw: Call nir_opt_algebraic_late later in brw_postprocess_nir_opts
 - elk: Call nir_opt_algebraic_late in elk_postprocess_nir
 - brw/cmod: Don't propagate from CMP to ADD if there is a write between
 - elk/cmod: Don't propagate from CMP to possible Inf + (-Inf)
 - elk/cmod: Don't propagate from CMP to ADD if there is a write between
 Janne Grunau (3):
 - asahi: Use GPU for buffer copies in resource_copy_region()
 - asahi: Implement clear_buffer using libagx_fill*
 - hk: Use aligned vector fill in hk_CmdFillBuffer if possible
 Jarred Davies (2):
 - pvr: Fix allocating the required scratch buffer space for tile buffers
 - pvr: Add missing support for tile buffers to SPM EOT programs
 Jesse Natalie (1):
 - meson: Include DirectX-Headers dependency for all VK Windows builds
 Jianxun Zhang (1):
 - anv: Limit modifier disabling workaround to specific GTK versions
 José Roberto de Souza (1):
 - intel/perf: Add HSW verx10 to intel_perf_query_result_write_mdapi()
 Juston Li (1):
 - anv: set missing protected bit for protected depth/stencil surfaces
 Konstantin Seurer (2):
 - radv: Fix setting the viewport for depth stencil FS resolves
 - vulkan/cmd_queue: Fixup stride for multi draws
 Lars-Ivar Hesselberg Simonsen (2):
 - panvk: Fix dcd_flags1 dirty bit
 - pan/genxml/v13: Fix HSR Prepass typo
 Leon Perianu (1):
 - pvr: fix format table properties duplicate
 Lionel Landwerlin (8):
 - anv: flush render caches on first pipeline select
 - anv: fix nested command buffer relocations
 - anv: add missing constant cache invalidation for descriptor buffers
 - isl: fix 32bit math with 4GB buffer size
 - anv: apply the same ccs disabling for Xe3 as for Xe2
 - anv: disable ccs modifier reporting when ccs modifiers are disabled
 - anv: dirty descriptors after blorp operations
 - anv: remove snprintf for aux op transition
 Mary Guillemard (1):
 - hk: Fix crash in hk_handle_passthrough_gs
 Matt Turner (4):
 - brw/cse: fix \`operands_match` corrupting non-IMM register data
 - brw/cse: use copies in \`operands_match` instead of in-place modification
 - elk/cse: fix \`operands_match` corrupting non-IMM register data
 - elk/cse: use copies in \`operands_match` instead of in-place modification
 Mike Blumenkrantz (2):
 - zink: fix broken compiler assert
 - zink: only do pre-sync transfer barrier after a renderpass
 Natalie Vock (3):
 - radv/rt: Only use ds_bvh_stack_rtn if the stack base is possible to encode
 - radv: Initialize nir_lower_io_to_scalar progress variable
 - radv/nir: Correctly handle workgroup sizes not aligned to 32
 Nick Hamilton (5):
 - pvr: Fix incorrect subpass merging optimisation
 - pvr: Rename pvr_render_input_attachment
 - pvr: Add missing support for preserve attachments
 - pvr: Update CI fails list after render pass fixes
 - pvr: Add support for fragment pass through shader
 Olivia Lee (1):
 - hk: fix passthrough GS key invalidation
 Pavel Ondračka (2):
 - r300: align macro-tiled stride-addressed textures in X
 - mesa: implement FRAMEBUFFER_RENDERABLE internalformat query
 Rhys Perry (3):
 - aco: fix gfx6-8 store_scratch() with function calls
 - aco: reset all vgpr_used_by_vmem\_ in resolve_all_gfx11
 - aco: resolve hazards before calls
 Robert Mader (1):
 - lavapipe: enable dmabuf import for planar drm formats
 Ryan Zhang (1):
 - panvk: guard against NULL pointers to avoid crash
 Samuel Pitoiset (5):
 - ac,radv,radeonsi: use correct swizzle/pitch for depth-only images with SDMA
 - radv: fix potential corruption after FMASK decompression on GFX6-8
 - radv/meta: fix depth/stencil resolves with different regions
 - ac/nir: fix writemask for dual source blending on GFX11+
 - radv: fix potential GPU hangs with secondaries on transfer queue
 Tapani Pälli (1):
 - util: bring back fix to avoid strict aliasing bugs in xxhash
 Timothy Arceri (2):
 - mesa: add _mesa_lookup_state_param_idx() helper
 - st/glsl_to_nir: make sure the variant has the correct locations set
 Wei Hao (1):
 - radeonsi: fix threaded shader compilation finishing after context is destroyed
 Yiwei Zhang (2):
 - venus: workaround a gcc-15 dead store elimination (DSE) bug
 - venus: sync protocol for strict aliasing compliance
--- a/docs/relnotes/26.0.2.rst
+++ b/docs/relnotes/26.0.2.rst
@@ -0,0 +1,239 @@
 Mesa 26.0.2 Release Notes / 2026-03-12
 ======================================
 Mesa 26.0.2 is a bug fix release which fixes bugs found since the 26.0.1 release.
 Mesa 26.0.2 implements the OpenGL 4.6 API, but the version reported by
 glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
 glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
 Some drivers don't support all the features required in OpenGL 4.6. OpenGL
 4.6 is **only** available if requested at context creation.
 Compatibility contexts may report a lower version depending on each driver.
 Mesa 26.0.2 implements the Vulkan 1.4 API, but the version reported by
 the apiVersion property of the VkPhysicalDeviceProperties struct
 depends on the particular driver being used.
 SHA checksums
 -------------
 ::
    SHA256: 973f535221be211c6363842b4cce9ef8e9b3e1d5ea86c5450ca86060163c7346  mesa-26.0.2.tar.xz
    SHA512: 0a7b9fc9b09e40345cc22d246dc1656900d74754c093882f6a39623af17fddc5f4a0c7938207c784ccf7306c5ed497be6a02c36f95c6548e01a2faa085e04c35  mesa-26.0.2.tar.xz
 New features
 ------------
 - None
 Bug fixes
 ---------
 - 26.0.1 fails to build: \`create_context.c: error: 'struct glx_screen' has no member named 'frontend_screen'`
 - A770: Counter-Strike 2 visual glitches (regression)
 - Bisected regression: Assertion texObj->pt == view->texture failed.
 - Kodi regression with panthor >= 1.7 after updating to Linux 7.0-rc1
 - MDK2 HD (opengl) has most elements rendered as black
 - Mesa 25.3 amdgpu memory issue
 - OpenGL 4.1 VRAM Memory Leak with setting uniform variables
 - Panfrost Bifrost compiler assertion failure: wrong vectorization in bi_alu_src_index (Mesa 26.0.0)
 - RADV: RDNA4 visual corruption in DX11 (DXVK) – Mafia III character model glitches, AMDVLK renders correctly (9070XT)
 - [radeonsi] Regression: GL_FEEDBACK returns 0.0 for X-coordinates (Legacy GL)
 - glsl: spec\@glsl-es-1.00\@linker\@glsl-mismatched-uniform-precision-unused broken
 - ir3: ir3_get_predicate() vs &ctx->build
 - r300, regression, bisected: Glitches with Sauerbraten
 - r300: HiZ related dEQP failures
 Changes
 -------
 Anna Maniscalco (1):
 - zink: don't care about generated gs output primitive
 Benjamin Cheng (1):
 - radeonsi/vcn: Use full pitch for pre-encode input
 Boris Brezillon (1):
 - pan/kmod: Allow mmap() on foreign buffers
 Caio Oliveira (4):
 - spirv: Refactor ALU opcode translation to take bit sizes
 - spirv: Pull constant source fixup to the existing loop
 - spirv: Fix spec constant to handle Select for non-native floats
 - nir: Fix constant folding for iadd_sat
 Christoph Pillmayer (2):
 - pan/bi: Fix coupling spill placement
 - pan/bi: Move FAUs to memory for memory phis
 Connor Abbott (4):
 - tu: Use HW offset 0 in 3d loads/clears with FDM
 - ir3: Fix constlen trimming when more than one stage is trimmed
 - tu: Set polygon mode when blitting
 - tu: Fix setting will_be_resolved with MSRTSS
 Danylo Piliaiev (2):
 - tu: Store gmem attachments after custom resolve in dyn RP
 - tu: Don't read .patch_input_gmem of unused attachment
 David Rosca (1):
 - vl: Also disable MPEG2 Main profile when mpeg12 decode is disabled
 Eric Engestrom (3):
 - docs: add sha sum for 26.0.1
 - fixup! docs: add release notes for 26.0.1
 - .pick_status.json: Update to 73dba1e15173ff6109925de9615f9d9f5cccc2be
 Eric R. Smith (1):
 - pco: fix a typo in the check for optimization looping
 Erik Faye-Lund (1):
 - gallium/dri: set LIBVA_DRIVERS_PATH in devenv
 Faith Ekstrand (3):
 - etnaviv: Call lower_bool_to_int32 not to_bitsize
 - nir/lower_bool_to_bitsize: Make all bN_csel sources match
 - pan/bi: Be more careful about bit sizes in b2f lowering
 Georg Lehmann (3):
 - ci: disable debian-ppc64el and debian-s390x
 - aco/insert_fp_mode: don't skip setting round for fract
 - nir/opt_algebraic: fix frsq clamp pattern
 Ian Romanick (5):
 - brw: Don't mark_invalid in update_for_reads for non-VGRF destination
 - brw: Use brw_reg_is_arf in update_for_reads
 - brw: Also check for ADDRESS file in update_for_reads
 - brw/algebraic: Don't optimize SEL.L.SAT or SEL.G.SAT
 - elk/algebraic: Don't optimize SEL.L.SAT or SEL.G.SAT
 Icenowy Zheng (1):
 - pvr: only specially handle gfx subcmd for BeginQuery
 Iván Briano (1):
 - anv: don't try to fast clear D/S with multiview
 Jesse Natalie (1):
 - d3d12: Fix importing external resources
 Job Noorman (2):
 - ir3: update context builder after ir3_get_predicate
 - ir3: don't predicate vote_all/vote_any
 Jose Maria Casanova Crespo (3):
 - v3d: flush write jobs before BO replacement in DISCARD_WHOLE path
 - vc4: flush write jobs before BO replacement in DISCARD_WHOLE path
 - v3d: reject fast TLB blit when RT formats don't match
 Karol Herbst (2):
 - nir: fix nir_alu_type_range_contains_type_range for fp16 to int
 - nir: fix nir_round_int_to_float for fp16
 Lionel Landwerlin (2):
 - anv: add missing handling for attachment locations in secondaries
 - anv: dirty all push constant stages in simple shader
 Lucas Fryzek (5):
 - drisw: Properly mark shmid as -1 when alloc fails
 - x11: Add helper util to check for xshm support
 - egl/dri: Check that xshm can be attached
 - glx: Check that xshm can be attached
 - vulkan/wsi: Check that xshm can be attached
 Luigi Santivetti (1):
 - zink: fix format conversion logic for the alpha emulation case
 Marek Olšák (1):
 - ac: set the correct number of Z planes for ALLOW_EXPCLEAR
 Mary Guillemard (1):
 - vulkan: Do not override the shader_flags in case of no task shader
 Mel Henning (1):
 - driconf: force_vk_vendor on No Man's Sky + NVK
 Mike Blumenkrantz (4):
 - zink: add TRANSFER_WRITE -> HOST_READ sync to end of batch
 - st/bitmap: only release YUV samplerviews
 - radv: fix multiview fast clears
 - egl/device: fix the fix for explicit sw rejection in non-sw EGL_PLATFORM=device
 Patrick Lerda (1):
 - r600: fix cs atomic operations when the shader is called multiple times
 Pavel Ondračka (3):
 - r300: copy target when merging alpha output instruction
 - r300: disable HiZ for PIPE_FUNC_ALWAYS
 - r300: disable clip-discard watermark for triangles
 Pierre-Eric Pelloux-Prayer (2):
 - frontends/va: fix undefined ref error
 - mesa: don't wraparound st_context::work_counter
 Rhys Perry (2):
 - aco: perform dce for blocks skipped for process_block()
 - nir/range_analysis: set deleted key
 Sagar Ghuge (1):
 - anv: Fix Wa_14021821874, Wa_14018813551, Wa_14026600921
 Samuel Pitoiset (4):
 - radv: fix copying images with different swizzle modes on SDMA7
 - radv: fix a GPU hang with PS epilogs and secondary command buffers
 - radv: fix local invocation index for mesh/task and quad derivatives on GFX12
 - radv: fix missing L2 cache invalidation with streamout on GFX12
 Tapani Pälli (2):
 - intel/dev: update mesa_defs.json from workaround database
 - anv: add handling for Wa_14026600921
 Timothy Arceri (5):
 - glsl: relax precision matching on unused uniforms ES
 - glsl: add workaround for MDK2 HD
 - mesa/st: use same path for setting state ref locations
 - st/glsl_to_nir: update state var locations earlier
 - glx: guard glx_screen frontend_screen member
 Yiwei Zhang (2):
 - pan: fix to not clear out of bitset range
 - lvp: avoid advertising dmabuf support for kms_swrast
--- a/docs/relnotes/26.0.3.rst
+++ b/docs/relnotes/26.0.3.rst
@@ -0,0 +1,113 @@
 Mesa 26.0.3 Release Notes / 2026-03-18
 ======================================
 Mesa 26.0.3 is a bug fix release which fixes bugs found since the 26.0.2 release.
 Mesa 26.0.3 implements the OpenGL 4.6 API, but the version reported by
 glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
 glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
 Some drivers don't support all the features required in OpenGL 4.6. OpenGL
 4.6 is **only** available if requested at context creation.
 Compatibility contexts may report a lower version depending on each driver.
 Mesa 26.0.3 implements the Vulkan 1.4 API, but the version reported by
 the apiVersion property of the VkPhysicalDeviceProperties struct
 depends on the particular driver being used.
 SHA checksums
 -------------
 ::
    SHA256: ddb7443d328e89aa45b4b6b80f077bf937f099daeca8ba48cabe32aab769e134  mesa-26.0.3.tar.xz
    SHA512: 82a33d0fa0c2855a63f599e38753126a2195025a13e45f38e14fda7aa008cb05925bb74088e4a1e199c9237d9388f4d4408a2c95c1d7fe79d8e6e6f27c84187b  mesa-26.0.3.tar.xz
 New features
 ------------
 - None
 Bug fixes
 ---------
 - Portal hard locks the machine on rv350.
 - Turnip crash with lazy depth textures: GPUMEM_BIND_RANGES failed (Not a typewriter)
 - [regression] Left 4 Dead 2 crashing when joining or starting survival with "Official Dedicated" servers
 - lavapipe: crash in caselist
 - zink: mesh shaders broken
 Changes
 -------
 Connor Abbott (2):
 - vtn: Fix vtn_mediump_downconvert_value() for transposed matrices
 - vtn: Fix vtn_mediump_upconvert_value() with transposed matrices
 Danylo Piliaiev (1):
 - tu/kgsl: Better detection of sparse support
 David Rosca (2):
 - radv/video: Fix AV1 encode min tile size
 - radv/video: Fix coding pic_parameter_set_id in H264 slice header
 Eric Engestrom (3):
 - docs: add sha sum for 26.0.2
 - .pick_status.json: Update to 70a487adfb42e3f9ed3b182a37133aed991fcf63
 - .pick_status.json: Mark f2f792996dffd97092f18961b44d71b568cd8551 as denominated
 Faith Ekstrand (1):
 - pan/compiler: Handle store_per_view_output in collect_varyings()
 Ian Douglas Scott (1):
 - wsi/wayland: Use \`wl_fixes` to destroy \`wl_registry`
 Mary Guillemard (1):
 - nvk/mme: Add missing nullcheck in nvk_mme_test_state_state
 Mike Blumenkrantz (13):
 - zink: reapply zsbuf state after unordered blits
 - zink: allow renderpass termination for clears with ZINK_DEBUG=rp and GENERAL layouts
 - zink: run opt_combine_stores when optimizing
 - nir: fix nir_is_io_compact for mesh shaders
 - mesa/st: fix unlower_io_to_vars to work with mesh shaders
 - zink: work around drivers with broken mesh shader properties
 - llvmpipe: save mesh shader when calling u_blitter
 - lavapipe: fix mesh property exports
 - mesa/st: make st_texture_get_current_sampler_view static
 - mesa/st/sampler_view: use a local variable for buffer sv format
 - mesa/st/sampler_view: use a local variable for texture sv format
 - mesa/st/sampler_view: eliminate st_sampler_view::srgb_skip_decode
 - mesa/st/samplerview: explicitly block releasing in-use samplerviews
 Natalie Vock (2):
 - radv/rt: Bump ray query stack base limit for GFX12
 - radv/rt: Fix shared ray query stack on top of application LDS
 Pavel Ondračka (1):
 - r300: pad short vertex shaders to avoid R3xx hangs
 Rob Clark (2):
 - freedreno/fdl: Use 4k alignment for tiled
 - freedreno/drm: Fix bo_flush race
 Ryan Zhang (1):
 - panvk/csf: use DEFERRED_FLUSH for fragment job cache flush
 Yiwei Zhang (1):
 - venus: force prime blit on Nvidia GPU
--- a/docs/relnotes/26.0.4.rst
+++ b/docs/relnotes/26.0.4.rst
@ -0,0 +1,273 @@
 Mesa 26.0.4 Release Notes / 2026-04-01
 ======================================
 Mesa 26.0.4 is a bug fix release which fixes bugs found since the 26.0.3 release.
 Mesa 26.0.4 implements the OpenGL 4.6 API, but the version reported by
 glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
 glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
 Some drivers don't support all the features required in OpenGL 4.6. OpenGL
 4.6 is **only** available if requested at context creation.
 Compatibility contexts may report a lower version depending on each driver.
 Mesa 26.0.4 implements the Vulkan 1.4 API, but the version reported by
 the apiVersion property of the VkPhysicalDeviceProperties struct
 depends on the particular driver being used.
 SHA checksums
 -------------
 ::
    SHA256: 6d91541e086f29bb003602d2c81070f2be4c0693a90b181ca91e46fa3953fe78  mesa-26.0.4.tar.xz
    SHA512: ddb59df633116a7ccd9d2d3a2e2009945909e3f774956efcbc032a2f963641cce50d0f319bebdc041df17700aa827aa2ccbc61c9e40b4020de9ff027eab27e23  mesa-26.0.4.tar.xz
 New features
 ------------
 - None
 Bug fixes
 ---------
 - Accumulation of black squares with OpenGL applications at high resolutions (hiz-related)
 - RADV: Invalid hitAttributeEXT value when using function-call RT pipelines
 - Segmentation fault in gm200_validate_sample_locations with Firefox on GTX 1070 Ti (nouveau)
 - Vulkan CTS regression bisected to 5d2c17a5fdce ("vtn: skip make-available/visible for shared")
 - [anv] Intel ARC B390 | Horizon Forbidden West | DX12 | Flashing effects
 - [radeonsi] Missing ground texture in Lethis Path of the Progress
 - amdgpu reset/crash when simulating stereo camera
 - building mesa_clc on ubutu-26.04 with gcc-16 fails link
 - util: Build regression with MSYS2 MinGW-W64 x64 clang 21 on 26.0.0-rc3
 - wsi: \`assert(chain->dxgi);\` may failed under venus for win32
 Changes
 -------
 Adam Simpkins (1):
 - iris: fix a crash in disable_rb_aux_buffer
 Alyssa Milburn (1):
 - nv50,nvc0: Avoid uninitialized cbuf reads in blits
 Alyssa Rosenzweig (1):
 - nir: add nir_get_io_data_src
 Dave Airlie (1):
 - st/mesh: handle mesh shader point size
 David Rosca (2):
 - frontends/va: Fix leaking H264/5 PPS/SPS objects when decoder wasn't created
 - frontends/va: Fix leaks when create_video_codec fails
 Eric Engestrom (9):
 - docs: add sha sum for 26.0.3
 - .pick_status.json: Update to 48c086cb4203d1a8e7458e0d0a85cfffc5b4bfe5
 - .pick_status.json: Mark 26b19e355fefcd6a8325924e6a391dd67a675c34 as denominated
 - .pick_status.json: Mark 32a818d11d3d60ebbc23a62127e988d17e742b79 as denominated
 - .pick_status.json: Mark d38916d673e6d2359e96fed45ebd83ca026dfcb5 as denominated
 - .pick_status.json: Mark 384d12816459996fbac5722e9fdb29527662cafb as denominated
 - ci: changing .gitlab-ci.yml itself also means the container jobs must exist
 - .pick_status.json: Mark 538c3ee6c7a419d5c55bef2294ca10166f8d9af4 as denominated
 - [26.0 only] venus/ci: mark a test as fixed
 Eric Guo (1):
 - panfrost: Fix NULL pointer dereference in panfrost_emit_images
 Eric R. Smith (2):
 - panfrost: fix texel buffer calculations
 - panfrost: fix typos in architecture detection
 Erik Faye-Lund (5):
 - pan/genxml: remove non-existent YUV Enable for AFRC
 - pan/lib: do not try to use stencil-aspect of color attachment
 - pan/lib: set srgb-flag for afrc render-targets
 - pan/lib: divide extent by tile-extend, not itself
 - panvk: remove unused flag
 Faith Ekstrand (4):
 - nak: Report progress from nak_nir_rematerialize_load_const()
 - nir: Consider if uses in nir_def_all_uses_*
 - pan/bi: v2x16 conversions don't replicate
 - pan/buffer: Add the offset to the size for buffer textures
 Georg Lehmann (2):
 - gallivm: don't optimize fadd(a, 0.0) with signed zero preserve
 - nir/lower_non_uniform_access: fix fusing loops for same index but different array variable
 Hyunjun Ko (1):
 - anv: Add dummy workload for AV1 decode on affected platforms (Wa_1508208842)
 Ian Romanick (2):
 - brw/algebraic: Allow mixed types in saturate constant folding
 - brw: Handle scalars and swizzles correctly in is_const_zero
 Icenowy Zheng (8):
 - vulkan/wsi/headless: properly use CPU images for CPU devices
 - pco: fix encoding of fred's s0abs bit
 - pvr: Align width for PBE write when creating linear image
 - pvr: fix "obb" typo in oob_buffer_size when building vertex pds data
 - pvr: save vertex attribute size for DMA checking
 - pvr: move PVR_BUFFER_MEMORY_PADDING_SIZE definition to pvr_buffer.h
 - pvr: consider the size of DMA request when setting msize of DDMADT
 - pvr: fix dirty tracking for stencil ops
 Iván Briano (2):
 - anv: fix anv_is_dual_src_blend_equation
 - brw: do not omit RT writes if dual_src_blend is on
 Job Noorman (1):
 - ir3/legalize: don't drop sync flags on removed predt/predf
 Jose Maria Casanova Crespo (1):
 - broadcom/common: fix V3D 7.1 TFU ICFG IFORMAT values
 Juan A. Suarez Romero (1):
 - vc4: fix unwanted buffer release on uploader
 Lionel Landwerlin (3):
 - anv: add an analysis pass to detect compute shaders clearing data
 - anv: add drirc option to workaround missing application barriers on typed/untyped data
 - brw: fence SLM writes between workgroups
 Liviu Prodea (2):
 - clc: Fix static link with clang>=22
 - util: Fix use of undeclared identifier 'NULL' in src/util/os_misc.h when compiling with clang
 Luigi Santivetti (2):
 - pvr: expose partial usc mrt init routine
 - pvr: keep compiler resources in sync with attachments
 Marek Olšák (3):
 - radeonsi: recompute IO bases after optimizations
 - radeonsi: fix blits via util_blitter_draw_rectangle
 - radeonsi: disable streamout queries for u_blitter
 Mario Kleiner (1):
 - dri: Fix "cosmetic" undefined behaviour warning for RGB[A]16_UNORM formats.
 Mary Guillemard (5):
 - nvk: Move viewport and scissor emit to their own function
 - nvk: Broacast viewport0 and scissor0 in case of FSR on Turing
 - nir/dead_cf: Add missing load_ssbo_ir3 handling
 - nir/dead_cf: Add missing load_global_bounded handling
 - nak: Do not allow load_helper_invocation reordering
 Mike Blumenkrantz (3):
 - ntv: always emit const coord components for fbfetch loads
 - mesa/renderbuffer: always add PIPE_BIND_SAMPLER_VIEW to rendering textures
 - llvmpipe: fix color fbfetch
 Natalie Vock (1):
 - vulkan: Bump MAX_ENCODE_PASSES
 Nick Hamilton (1):
 - pvr: Fix for multiple attachments being assigned to the same tile buffer.
 Pavel Ondračka (5):
 - r300: fix bias presubtract algebraic transformation
 - r300: don't apply odd macroblock rounding to 3D textures
 - r300: disable zmask clears for large surfaces
 - r300: add shared HyperZ pipe-count helper
 - r300: split large HiZ clears into multiple packets
 Pierre-Eric Pelloux-Prayer (3):
 - radeonsi: move spi_shader_*_format to si_shader_variant_info
 - radeonsi: account for outputs_written when updating spi_shader_col_format
 - gallium/u_blitter: add a new fs_color_clear variant
 Radu Costas (1):
 - pco: Amend errant nir_move_option
 Rhys Perry (3):
 - aco/tests: fix assembler tests with LLVM 22
 - aco/tests: fix assembler/isel tests with LLVM 23
 - radv: fix memory leak in radv_rt_nir_to_asm
 Robert Mader (1):
 - llvmpipe: Stop aligning height to raster block size for unbacked handles
 Ryan Zhang (1):
 - panvk: trivial fix to remove repeated assignment
 Samuel Pitoiset (2):
 - radv/amdgpu: free the VA range in case the BO allocation failed
 - radv: emit BOP events after every draw to workaround a VRS bug on GFX12
 Simon Perretta (1):
 - pco: use vm/icm for tile buffer store coverage mask
 Timothy Arceri (2):
 - mesa: add force_explicit_uniform_loc_zero workaround
 - util/driconf: add workarounds for Lethis - Path Of Progress
 Valentine Burley (7):
 - tu/drm/virtio: Add missing lock to virtio_bo_init_dmabuf
 - tu/drm/virtio: Move set_iova into success path of virtio_bo_init_dmabuf
 - tu/drm/virtio: Avoid freeing zombified tu_sparse_vma
 - tu/drm/virtio: Do not free iova from heap for lazy BOs
 - tu/drm/virtio: Fix GEM handle leak in tu_bo_init error path
 - tu/drm/virtio: Fix GEM handle leak on failed dmabuf res_id lookup
 - ci: Drop duplicate Intel shader-db run
 Yiwei Zhang (3):
 - venus: fix to relax the KHR_external_memory_fd requirement
 - vulkan/wsi/win32: add wsi_win32_find_idle_image helper
 - vulkan/wsi/win32: respect acquire timeout for sw wsi
 emre (1):
 - nvk: fix barrier cache invalidation
 juntak0916 (1):
 - nvk: fix BindImageMemory2 per-bind status result
 kingstom.chen (1):
 - radv/rt: only run move_rt_instructions() for CPS shaders
 utzcoz (1):
 - gfxstream: Fix vkSetDebugUtilsObjectNameEXT crash for unwrapped objects
--- a/docs/relnotes/26.0.5.rst
+++ b/docs/relnotes/26.0.5.rst
@ -0,0 +1,177 @@
 Mesa 26.0.5 Release Notes / 2026-04-15
 ======================================
 Mesa 26.0.5 is a bug fix release which fixes bugs found since the 26.0.4 release.
 Mesa 26.0.5 implements the OpenGL 4.6 API, but the version reported by
 glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
 glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
 Some drivers don't support all the features required in OpenGL 4.6. OpenGL
 4.6 is **only** available if requested at context creation.
 Compatibility contexts may report a lower version depending on each driver.
 Mesa 26.0.5 implements the Vulkan 1.4 API, but the version reported by
 the apiVersion property of the VkPhysicalDeviceProperties struct
 depends on the particular driver being used.
 SHA checksums
 -------------
 ::
    TBD.
 New features
 ------------
 - None
 Bug fixes
 ---------
 - Is maxFragmentCombinedOutputResources=16 in Honeykrisp reflects an actual HW limit?
 - Mesa LLVMpipe Memory Leak
 Changes
 -------
 Ahmed Hesham (1):
 - rusticl: fix flag validation when creating an image
 Daniel Schürmann (1):
 - aco/lower_branches: Don't remove branches which jump over loops
 David Rosca (1):
 - radeonsi: Set multi plane format also for imported textures
 Eric Engestrom (4):
 - docs: add sha sum for 26.0.4
 - .pick_status.json: Update to 7e163fb79377c0fdf6d4e99ca4775fa7e1a4299e
 - .pick_status.json: Mark 9ff879441f91a8296891e2e13264a7a015a11a7d as denominated
 - .pick_status.json: Mark 4b3bd6b0b54d998a31356bf049911004683ea64f as denominated
 Eric Guo (1):
 - panfrost: disable round_to_nearest_even for NEAREST samplers
 Faith Ekstrand (6):
 - pan/bi: Support more swizzle aliases in the bifrost pack code
 - pan/bi: Delete a few instruction encodings
 - pan/bi/ra: Allow offsets on tied sources
 - pan/bi: Use bi_half() for texture MS indices
 - pan/bi: Add BI_SWIZZLE_NONE
 - pan/bi: Support all the swizzles in the packer
 Georg Lehmann (2):
 - nir/opt_load_skip_helpers: don't skip helpers for store_scratch data
 - aco/optimizer: do not try to create 3 byte constant operands
 Ian Romanick (2):
 - brw/const: Don't allow type changes when accumulators are involved
 - brw: brw_reg::nr for an accumulator is not part of the offset
 Icenowy Zheng (2):
 - pvr: fix pvr_clear_vdm_state_get_size_in_dw() inverted feature condition
 - pvr: set has_usc_alu_roundingmode_rne for all B-series Rogue cores
 Janne Grunau (1):
 - hk: Increase maxFragmentCombinedOutputResources to HK_MAX_DESCRIPTORS
 Job Noorman (4):
 - nir/opt_varyings: fix alu def cloning
 - nir/gather_info: clear interpolation qualifiers before gathering
 - ir3: fix handle_partial_const with vectorized src
 - nir/opt_uniform_subgroup: fix ballot_bit_count components
 Karol Herbst (4):
 - radeonsi: set valid_buffer_range for CL buffers
 - radeonsi: properly report unified memory on APUs
 - rusticl/kernel: implement CL_KERNEL_GLOBAL_WORK_SIZE for custom devices
 - rusticl/device: Fix reporting of global memory on mixed memory devices
 Konstantin Seurer (1):
 - radv/bvh: Prefer selecting quads as the first pair of a HW node
 Lionel Landwerlin (3):
 - anv: don't relocate memory from blob
 - brw: don't support frontfacing ternary optimization on != 32bit
 - elk: don't support frontfacing ternary optimization on != 32bit
 Marc Alcala Prieto (1):
 - pan/cs: Fix cs_run_fragment() calls with swapped arguments
 Mary Guillemard (2):
 - nvk: Adjust maxFragmentCombinedOutputResources to match max descriptors limit
 - hk: Add HK_MAX_RTS to maxFragmentCombinedOutputResources
 Mixie (1):
 - xlib: clear currentDpy when releasing the current context
 Natalie Vock (1):
 - radv/rt: Don't enable midpoint sorting
 Olivia Lee (1):
 - panfrost: don't try to emit varying shader stats on v12+
 Pavel Ondračka (2):
 - st/bitmap: release the temporary bitmap sampler view
 - gallium/u_blitter: remove unused CONST declaration when using IMM
 Rhys Perry (3):
 - util: fix UBSan error with _mesa_bfloat16_bits_to_float
 - ir3/array_to_ssa: skip remove_trivial_phi for non-array phis
 - ir3/ra: fix copy-paste error
 Samuel Pitoiset (3):
 - spirv: fix OpUntypedVariableKHR with optional data type parameter
 - radv/meta: fix computing extent for image->image with both compressed formats
 - vulkan: mark RP attachments as invalid when no rendering create info
 Timothy Arceri (1):
 - radeonsi: add Gun Godz workaround
 Valentine Burley (2):
 - zink/ci: Move zink-tu-a618 to sc7180-trogdor-kingoftown
 - ci/freedreno: Move remaining lazor a618 jobs, retire device type
 Vinson Lee (1):
 - d3d12: Fix MinGW cross-build error in resource_state_if_promoted
 Wujian Sun (1):
 - mesa: Fix inconsistent multisampled CopyTexImage checks
 Xianzhong Li (1):
 - panfrost: Fix GEM handle refcount leak in panfrost_bo_import
 Yuxuan Shui (1):
 - wsi/display: initialize Xlib display connector property IDs in all cases
--- a/docs/relnotes/new_features.txt
+++ b/docs/relnotes/new_features.txt
@ -1,32 +0,0 @@
 VK_KHR_relaxed_block_layout on pvr
 VK_KHR_storage_buffer_storage_class on pvr
 VK_EXT_external_memory_acquire_unmodified on panvk
 VK_EXT_discard_rectangles on NVK
 VK_KHR_present_id on HoneyKrisp
 VK_KHR_present_id2 on HoneyKrisp
 VK_KHR_present_wait on HoneyKrisp
 VK_KHR_present_wait2 on HoneyKrisp
 VK_KHR_maintenance10 on ANV, NVK, RADV
 VK_EXT_shader_uniform_buffer_unsized_array on ANV, HK, NVK, RADV
 VK_EXT_device_memory_report on panvk
 VK_VALVE_video_encode_rgb_conversion on radv
 VK_EXT_custom_resolve on RADV
 GL_EXT_shader_pixel_local_storage on Panfrost v6+
 VK_EXT_image_drm_format_modifier on panvk/v7
 VK_KHR_sampler_ycbcr_conversion on panvk/v7
 sparseResidencyImage2D on panvk v10+
 sparseResidencyStandard2DBlockShape on panvk v10+
 VK_KHR_surface_maintenance1 promotion everywhere EXT is exposed
 VK_KHR_swapchain_maintenance1 promotion everywhere EXT is exposed
 VK_KHR_dynamic_rendering on PowerVR
 VK_EXT_multisampled_render_to_single_sampled on panvk
 VK_KHR_pipeline_binary on HoneyKrisp
 VK_KHR_incremental_present on pvr
 VK_KHR_xcb_surface on pvr
 VK_KHR_xlib_surface on pvr
 VK_KHR_robustness2 on panvk v10+
 VK_KHR_robustness2 on HoneyKrisp
 VK_KHR_robustness2 on hasvk
 VK_KHR_robustness2 on NVK
 VK_KHR_robustness2 on Turnip
 VK_KHR_robustness2 on lavapipe
--- a/docs/submittingpatches.rst
+++ b/docs/submittingpatches.rst
@ -197,6 +197,9 @@ following example::
 This will backport the commit to the 21.0 branch, as well as any more recent
 stable branch. Multiple ``Backport-to:`` lines are allowed, but only the
 lowest number mentioned actually matters, so for clarity, please only use one.
 You can also use the special ``Backport-to: *`` which will nominate the commit
 to be backported to every active stable branch, making it a synonym for the
 ``Cc: mesa-stable`` option below.
 The last option is deprecated and mostly here for historical reasons
 dating back to when patch submission was done via emails: using a ``Cc:``
--- a/meson.build
+++ b/meson.build
@ -642,7 +642,7 @@ if with_dri
 endif
 dep_dxheaders = null_dep
-if with_gallium_d3d12 or with_microsoft_clc or with_microsoft_vk or with_gfxstream_vk and host_machine.system() == 'windows'
+if with_gallium_d3d12 or with_microsoft_clc or with_microsoft_vk or (with_any_vk and host_machine.system() == 'windows')
  dep_dxheaders = dependency('directx-headers', required : false)
  if not dep_dxheaders.found()
    dep_dxheaders = dependency('DirectX-Headers',
@ -1931,7 +1931,6 @@ dep_spirv_tools = dependency(
  'SPIRV-Tools',
  required : with_spirv_tools,
  version : '>= 2024.1',
  static : host_machine.system() == 'darwin',
 )
 if dep_spirv_tools.found()
  pre_args += '-DHAVE_SPIRV_TOOLS'
@ -1959,6 +1958,9 @@ if with_clc
    if dep_llvm.version().version_compare('>= 18.0')
      clang_modules += 'clangAPINotes'
    endif
    if dep_llvm.version().version_compare('>= 22.0')
      clang_modules += ['clangAnalysisLifetimeSafety', 'clangOptions']
    endif
    dep_clang = []
    foreach m : clang_modules
--- a/src/amd/ci/radeonsi-mendocino-fails.txt
+++ b/src/amd/ci/radeonsi-mendocino-fails.txt
@ -401,7 +401,6 @@ spec@egl 1.4@eglterminate then unbind context,Fail
 spec@egl_khr_surfaceless_context@viewport,Fail
 spec@egl_mesa_configless_context@basic,Fail
 spec@ext_external_objects@vk-ping-pong-single-sem,Crash
 spec@glsl-es-1.00@linker@glsl-mismatched-uniform-precision-unused,Fail
 spec@glsl-es-3.00@execution@built-in-functions@fs-packhalf2x16,Fail
 spec@glsl-es-3.00@execution@built-in-functions@vs-packhalf2x16,Fail
 spec@khr_texture_compression_astc@miptree-gles srgb-fp,Fail
--- a/src/amd/ci/radeonsi-raven-fails.txt
+++ b/src/amd/ci/radeonsi-raven-fails.txt
@ -46,7 +46,6 @@ api@clgetdeviceinfo,Fail
 api@clgetextensionfunctionaddressforplatform,Fail
 api@clgetkernelarginfo,Fail
 api@cllinkprogram,Fail
 custom@r600 create release buffer bug,Fail
 program@build@vector-data-types,Fail
 program@execute@builtin@builtin-float-nextafter-1.0.generated,Fail
 program@execute@builtin@builtin-float-nextafter-1.0.generated@nextafter float1,Fail
@ -71,5 +70,4 @@ program@run kernel with max work item sizes,Fail
 # uprev Piglit in Mesa
 spec@ext_external_objects@vk-semaphores,Crash
 spec@ext_external_objects@vk-semaphores-2,Crash
 spec@glsl-es-1.00@linker@glsl-mismatched-uniform-precision-unused,Fail
--- a/src/amd/ci/radeonsi-stoney-fails.txt
+++ b/src/amd/ci/radeonsi-stoney-fails.txt
@ -121,7 +121,6 @@ spec@ext_texture_srgb@texwrap formats-s3tc bordercolor-swizzled@GL_COMPRESSED_SR
 spec@ext_texture_srgb@texwrap formats-s3tc bordercolor-swizzled@GL_COMPRESSED_SRGB_S3TC_DXT1_EXT- swizzled- border color only,Fail
 spec@glsl-1.50@execution@geometry@tri-strip-ordering-with-prim-restart gl_triangle_strip_adjacency ffs,Fail
 spec@glsl-1.50@execution@geometry@tri-strip-ordering-with-prim-restart gl_triangle_strip_adjacency other,Fail
 spec@glsl-es-1.00@linker@glsl-mismatched-uniform-precision-unused,Fail
 spec@glsl-es-3.00@execution@built-in-functions@fs-packhalf2x16,Fail
 spec@glsl-es-3.00@execution@built-in-functions@vs-packhalf2x16,Fail
 spec@khr_texture_compression_astc@miptree-gl srgb-fp,Fail
--- a/src/amd/ci/radeonsi-vangogh-fails.txt
+++ b/src/amd/ci/radeonsi-vangogh-fails.txt
@ -14,7 +14,6 @@ spec@egl_khr_surfaceless_context@viewport,Fail
 spec@ext_external_objects@vk-image-display,Crash
 spec@ext_external_objects@vk-semaphores,Crash
 spec@ext_external_objects@vk-semaphores-2,Crash
 spec@glsl-es-1.00@linker@glsl-mismatched-uniform-precision-unused,Fail
 spec@glsl-es-3.00@execution@built-in-functions@fs-packhalf2x16,Fail
 spec@glsl-es-3.00@execution@built-in-functions@vs-packhalf2x16,Fail
 spec@khr_texture_compression_astc@miptree-gles srgb-fp,Fail
--- a/src/amd/common/ac_cmdbuf_sdma.c
+++ b/src/amd/common/ac_cmdbuf_sdma.c
@ -222,10 +222,12 @@ static uint32_t
 ac_sdma_get_tiled_info_dword(const struct radeon_info *info,
                             const struct ac_sdma_surf_tiled *tiled)
 {
-   const uint32_t swizzle_mode = tiled->surf->has_stencil ? tiled->surf->u.gfx9.zs.stencil_swizzle_mode
+   const uint32_t swizzle_mode =
-                                                          : tiled->surf->u.gfx9.swizzle_mode;
+      tiled->is_stencil ? tiled->surf->u.gfx9.zs.stencil_swizzle_mode
-   const uint16_t epitch = tiled->surf->has_stencil ? tiled->surf->u.gfx9.zs.stencil_epitch
+                        : tiled->surf->u.gfx9.swizzle_mode;
-                                                    : tiled->surf->u.gfx9.epitch;
+   const uint16_t epitch =
      tiled->is_stencil ? tiled->surf->u.gfx9.zs.stencil_epitch
                        : tiled->surf->u.gfx9.epitch;
   const enum gfx9_resource_type dimension =
      ac_sdma_get_tiled_resource_dim(info->sdma_ip_version, tiled);
   const uint32_t mip_max = MAX2(tiled->num_levels, 1);
--- a/src/amd/common/ac_cmdbuf_sdma.h
+++ b/src/amd/common/ac_cmdbuf_sdma.h
@ -61,6 +61,7 @@ struct ac_sdma_surf_tiled {
   uint64_t va;
   enum pipe_format format;
   uint32_t bpp;
   bool is_stencil;
   struct {
      uint32_t x;
--- a/src/amd/common/ac_descriptors.c
+++ b/src/amd/common/ac_descriptors.c
@ -1055,8 +1055,15 @@ ac_init_ds_surface(const struct radeon_info *info, const struct ac_ds_state *sta
 static unsigned
 ac_get_decompress_on_z_planes(const struct radeon_info *info, enum pipe_format format, uint8_t log_num_samples,
-                              bool htile_stencil_disabled, bool no_d16_compression)
+                              bool tc_compat_htile_enabled, bool htile_stencil_disabled, bool no_d16_compression,
                              bool z_allow_expclear)
 {
   if (info->gfx_level < GFX8)
      return 0;
   if (!tc_compat_htile_enabled)
      return z_allow_expclear ? 15 : 0;
   uint32_t max_zplanes = 0;
   if (info->gfx_level >= GFX9) {
@ -1073,6 +1080,7 @@ ac_get_decompress_on_z_planes(const struct radeon_info *info, enum pipe_format f
         max_zplanes = 1;
      max_zplanes++;
      assert(max_zplanes != 1); /* 1 is invalid and can cause corruption on gfx11-11.5 */
   } else {
      if (format == PIPE_FORMAT_Z16_UNORM && no_d16_compression) {
         /* Do not enable Z plane compression for 16-bit depth
@ -1093,6 +1101,7 @@ ac_get_decompress_on_z_planes(const struct radeon_info *info, enum pipe_format f
      }
   }
   assert(max_zplanes != 10 && max_zplanes != 13); /* disallowed values */
   return max_zplanes;
 }
@ -1115,14 +1124,18 @@ ac_set_mutable_ds_surface_fields(const struct radeon_info *info, const struct ac
      log_num_samples = G_028040_NUM_SAMPLES(ds->db_z_info);
   }
   bool z_allow_expclear = info->gfx_level <= GFX11_5 &&
                           G_028038_ALLOW_EXPCLEAR(ds->db_z_info);
   const uint32_t max_zplanes =
      ac_get_decompress_on_z_planes(info, state->format, log_num_samples,
-                                    tile_stencil_disable, state->no_d16_compression);
+                                    state->tc_compat_htile_enabled, tile_stencil_disable,
                                    state->no_d16_compression, z_allow_expclear);
   if (info->gfx_level >= GFX9) {
-      if (state->tc_compat_htile_enabled) {
+      ds->db_z_info |= S_028038_DECOMPRESS_ON_N_ZPLANES(max_zplanes);
         ds->db_z_info |= S_028038_DECOMPRESS_ON_N_ZPLANES(max_zplanes);
      if (state->tc_compat_htile_enabled) {
         if (info->gfx_level >= GFX10) {
            const bool iterate256 = log_num_samples >= 1;
@ -1138,12 +1151,13 @@ ac_set_mutable_ds_surface_fields(const struct radeon_info *info, const struct ac
      ds->db_z_info |= S_028038_ZRANGE_PRECISION(state->zrange_precision);
   } else {
-      if (state->tc_compat_htile_enabled) {
+      if (info->gfx_level >= GFX8)
         ds->u.gfx6.db_htile_surface |= S_028ABC_TC_COMPATIBLE(1);
         ds->db_z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(max_zplanes);
-      } else {
+
      if (state->tc_compat_htile_enabled)
         ds->u.gfx6.db_htile_surface |= S_028ABC_TC_COMPATIBLE(1);
      else
         ds->u.gfx6.db_depth_info |= S_02803C_ADDR5_SWIZZLE_MASK(1);
      }
      ds->db_z_info |= S_028040_ZRANGE_PRECISION(state->zrange_precision);
   }
--- a/src/amd/common/ac_gpu_info.c
+++ b/src/amd/common/ac_gpu_info.c
@ -1096,6 +1096,13 @@ ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
                                 info->family == CHIP_NAVI22 ||
                                 info->family == CHIP_VANGOGH;
   /* GFX12 is affected by random GPU hangs when VRS rates are exported by the
    * last VGT stage under some conditions that are unclear. One possible
    * workaround is to emit BOP events after every draw that exports VRS
    * rates.
    */
   info->has_vrs_export_bug = info->gfx_level == GFX12;
   /* HW bug workaround when CS threadgroups > 256 threads and async compute
    * isn't used, i.e. only one compute job can run at a time.  If async
    * compute is possible, the threadgroup size must be limited to 256 threads
--- a/src/amd/common/ac_gpu_info.h
+++ b/src/amd/common/ac_gpu_info.h
@ -229,6 +229,7 @@ struct radeon_info {
   bool has_attr_ring_wait_bug;
   bool cp_dma_supports_sparse;
   bool has_vrs_ds_export_bug;
   bool has_vrs_export_bug;
   bool has_taskmesh_indirect0_bug;
   bool sdma_supports_sparse;      /* Whether SDMA can safely access sparse resources. */
   bool sdma_supports_compression; /* Whether SDMA supports DCC and HTILE. */
--- a/src/amd/common/ac_sqtt.c
+++ b/src/amd/common/ac_sqtt.c
@ -49,6 +49,8 @@ ac_sqtt_get_data_va(const struct radeon_info *rad_info, const struct ac_sqtt *da
 void
 ac_sqtt_init(struct ac_sqtt *data)
 {
   simple_mtx_init(&data->lock, mtx_plain);
   list_inithead(&data->rgp_pso_correlation.record);
   simple_mtx_init(&data->rgp_pso_correlation.lock, mtx_plain);
@ -71,6 +73,8 @@ ac_sqtt_init(struct ac_sqtt *data)
 void
 ac_sqtt_finish(struct ac_sqtt *data)
 {
   simple_mtx_destroy(&data->lock);
   assert(data->rgp_pso_correlation.record_count == 0);
   simple_mtx_destroy(&data->rgp_pso_correlation.lock);
--- a/src/amd/common/ac_sqtt.h
+++ b/src/amd/common/ac_sqtt.h
@ -15,6 +15,7 @@
 #include "ac_pm4.h"
 #include "ac_rgp.h"
 #include "amd_family.h"
 #include "util/simple_mtx.h"
 #define SQTT_BUFFER_ALIGN_SHIFT 12
@ -61,6 +62,8 @@ struct ac_sqtt {
   struct rgp_clock_calibration rgp_clock_calibration;
   struct hash_table_u64 *pipeline_bos;
   simple_mtx_t lock;
 };
 struct ac_sqtt_data_info {
--- a/src/amd/common/nir/ac_nir_lower_ps_late.c
+++ b/src/amd/common/nir/ac_nir_lower_ps_late.c
@ -443,10 +443,14 @@ emit_ps_color_export(nir_builder *b, lower_ps_state *s, unsigned output_index, u
   }
   }
-   s->exp[s->exp_num++] = nir_export_amd(b, nir_vec(b, outputs, 4),
+   nir_intrinsic_instr *exp = nir_export_amd(b, nir_vec(b, outputs, 4),
-                                         .base = target,
+                                             .base = target,
-                                         .write_mask = write_mask,
+                                             .flags = flags);
-                                         .flags = flags);
+
   /* Set the writemask explicitly because write_mask=0 means full write mask. */
   nir_intrinsic_set_write_mask(exp, write_mask);
   s->exp[s->exp_num++] = exp;
   return true;
 }
@ -483,7 +487,7 @@ emit_ps_dual_src_blend_swizzle(nir_builder *b, lower_ps_state *s, unsigned first
   uint32_t mrt0_write_mask = nir_intrinsic_write_mask(mrt0_exp);
   uint32_t mrt1_write_mask = nir_intrinsic_write_mask(mrt1_exp);
-   uint32_t write_mask = mrt0_write_mask & mrt1_write_mask;
+   uint32_t write_mask = mrt0_write_mask | mrt1_write_mask;
   nir_def *mrt0_arg = mrt0_exp->src[0].ssa;
   nir_def *mrt1_arg = mrt1_exp->src[0].ssa;
--- a/src/amd/compiler/README-ISA.md
+++ b/src/amd/compiler/README-ISA.md
@ -216,6 +216,11 @@ the correct layout is:
 VOP2 `v_pk_fmac_f16`. But like all other packed math opcodes, DPP does not function in practice.
 RDNA1 and RDNA2 support `v_pk_fmac_f16_dpp`.
 ## DPP with integer `subrev` and shifts
 No documentation mentions this, but DPP is seemingly applied to src1 instead of src0 for
 integer reverse subtract and shift opcodes.
 ## ds_swizzle_b32 rotate/fft modes
 These are first mentioned in the GFX9 (Vega) ISA doc, information from the LLVM bug tracker
--- a/src/amd/compiler/aco_insert_NOPs.cpp
+++ b/src/amd/compiler/aco_insert_NOPs.cpp
@ -1867,6 +1867,8 @@ resolve_all_gfx11(State& state, NOP_ctx_gfx11& ctx,
       ctx.vgpr_used_by_vmem_bvh.any()) {
      waitcnt_depctr &= 0xffe3;
      ctx.vgpr_used_by_vmem_load.reset();
      ctx.vgpr_used_by_vmem_sample.reset();
      ctx.vgpr_used_by_vmem_bvh.reset();
      ctx.vgpr_used_by_vmem_store.reset();
      ctx.vgpr_used_by_ds.reset();
   }
@ -1912,7 +1914,9 @@ handle_block(Program* program, Ctx& ctx, Block& block)
      Handle(state, ctx, instr, block.instructions);
      /* Resolve all possible hazards (we don't know what s_setpc_b64 jumps to). */
-      if (instr->opcode == aco_opcode::s_setpc_b64) {
+      if (instr->opcode == aco_opcode::s_setpc_b64 || instr->opcode == aco_opcode::s_swappc_b64 ||
          instr->opcode == aco_opcode::s_call_b64) {
         found_end |= instr->opcode == aco_opcode::s_setpc_b64;
         block.instructions.emplace_back(std::move(instr));
         std::vector<aco_ptr<Instruction>> resolve_instrs;
@ -1920,8 +1924,6 @@ handle_block(Program* program, Ctx& ctx, Block& block)
         block.instructions.insert(std::prev(block.instructions.end()),
                                   std::move_iterator(resolve_instrs.begin()),
                                   std::move_iterator(resolve_instrs.end()));
         found_end = true;
         continue;
      }
--- a/src/amd/compiler/aco_insert_exec_mask.cpp
+++ b/src/amd/compiler/aco_insert_exec_mask.cpp
@ -484,10 +484,17 @@ process_instructions(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instructio
         Operand exit_cond = Operand(exec, bld.lm);
         if (state == Exact) {
-            assert(info.exec.size() == 1);
+            bld.sop2(Builder::s_andn2, Definition(exec, bld.lm), bld.def(s1, scc),
-            bld.sop2(Builder::s_andn2, Definition(exec, bld.lm), bld.def(s1, scc), info.exec[0].op,
+                     info.exec.back().op, src);
-                     src);
+            info.exec.back().op = Operand(exec, bld.lm);
-            info.exec[0].op = Operand(exec, bld.lm);
+
            /* Although this is in uniform CF, it might be a loop without back-edge.
             * Update the loop restore mask as well.
             */
            for (unsigned i = 0; i < info.exec.size() - 1; i++) {
               assert(info.exec[i + 1].type & mask_type_loop);
               info.exec[i].op = bld.copy(bld.def(bld.lm), Operand(exec, bld.lm));
            }
         } else {
            Temp cond = bld.tmp(s1);
            info.exec[0].op = bld.sop2(Builder::s_andn2, bld.def(bld.lm), Definition(cond, scc),
--- a/src/amd/compiler/aco_insert_fp_mode.cpp
+++ b/src/amd/compiler/aco_insert_fp_mode.cpp
@ -233,9 +233,6 @@ instr_ignores_round_mode(const Instruction* instr)
   case aco_opcode::v_rndne_f64:
   case aco_opcode::v_rndne_f32:
   case aco_opcode::v_rndne_f16:
   case aco_opcode::v_fract_f64:
   case aco_opcode::v_fract_f32:
   case aco_opcode::v_fract_f16:
   case aco_opcode::s_min_f32:
   case aco_opcode::s_min_f16:
   case aco_opcode::s_max_f32:
@ -454,16 +451,16 @@ emit_set_mode_block(fp_mode_ctx* ctx, Block* block)
      for (uint32_t pred : block->linear_preds)
         max_pred = MAX2(max_pred, pred);
-      assert(max_pred != 0);
+      if (max_pred >= block->index) {
-
+         mode_mask to_set = 0;
-      mode_mask to_set = 0;
+         /* Check if any mode was changed during the loop. */
-      /* Check if the any mode was changed during the loop. */
+         u_foreach_bit (i, fp_state.required) {
-      u_foreach_bit (i, fp_state.required) {
+            if (ctx->last_set[i] <= max_pred)
-         if (ctx->last_set[i] <= max_pred)
+               to_set |= BITFIELD_BIT(i);
-            to_set |= BITFIELD_BIT(i);
+         }
         if (to_set)
            set_mode(ctx, block, fp_state, 0, to_set);
      }
      if (to_set)
         set_mode(ctx, block, fp_state, 0, to_set);
   }
   ctx->block_states[block->index] = fp_state;
--- a/src/amd/compiler/aco_ir.cpp
+++ b/src/amd/compiler/aco_ir.cpp
@ -391,6 +391,65 @@ convert_to_SDWA(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr)
   return tmp;
 }
 /* Opcode-level DPP check: reports whether `opcode` supports DPP on `gfx_level`.
  *
  * `vop3p` says whether the instruction is a VOP3P one. Opcodes that are not
  * listed in the switch reach the default case, which rejects every VOP3P
  * instruction and accepts everything else. The listed opcodes ignore `vop3p`:
  *  - the reversed subtract/shift group and the second opcode list always
  *    return false,
  *  - v_pk_fmac_f16 is accepted only before GFX11,
  *  - the fma_mix/dot2 group is accepted only on GFX11 and newer.
  */
 bool
 opcode_supports_dpp(amd_gfx_level gfx_level, aco_opcode opcode, bool vop3p)
 {
   switch (opcode) {
   /* reverse integer subtract and shift seem to apply dpp to src1 instead of src0 */
   case aco_opcode::v_subrev_co_u32:
   case aco_opcode::v_subrev_co_u32_e64:
   case aco_opcode::v_subbrev_co_u32:
   case aco_opcode::v_subrev_u16:
   case aco_opcode::v_subrev_u32:
   case aco_opcode::v_ashrrev_i32:
   case aco_opcode::v_lshrrev_b32:
   case aco_opcode::v_lshlrev_b32:
   case aco_opcode::v_ashrrev_i16:
   case aco_opcode::v_lshrrev_b16:
   case aco_opcode::v_lshlrev_b16:
   case aco_opcode::v_ashrrev_i16_e64:
   case aco_opcode::v_lshrrev_b16_e64:
   case aco_opcode::v_lshlrev_b16_e64: return false;
   case aco_opcode::v_pk_fmac_f16: return gfx_level < GFX11;
   /* there are more cases but those all take 64-bit inputs */
   case aco_opcode::v_madmk_f32:
   case aco_opcode::v_madak_f32:
   case aco_opcode::v_madmk_f16:
   case aco_opcode::v_madak_f16:
   case aco_opcode::v_fmamk_f32:
   case aco_opcode::v_fmaak_f32:
   case aco_opcode::v_fmamk_f16:
   case aco_opcode::v_fmaak_f16:
   case aco_opcode::v_readfirstlane_b32:
   case aco_opcode::v_cvt_f64_i32:
   case aco_opcode::v_cvt_f64_f32:
   case aco_opcode::v_cvt_f64_u32:
   case aco_opcode::v_mul_lo_u32:
   case aco_opcode::v_mul_lo_i32:
   case aco_opcode::v_mul_hi_u32:
   case aco_opcode::v_mul_hi_i32:
   case aco_opcode::v_qsad_pk_u16_u8:
   case aco_opcode::v_mqsad_pk_u16_u8:
   case aco_opcode::v_mqsad_u32_u8:
   case aco_opcode::v_mad_u64_u32:
   case aco_opcode::v_mad_i64_i32:
   case aco_opcode::v_permlane16_b32:
   case aco_opcode::v_permlanex16_b32:
   case aco_opcode::v_permlane64_b32:
   case aco_opcode::v_readlane_b32_e64:
   case aco_opcode::v_writelane_b32_e64: return false;
   /* simpler than listing all VOP3P opcodes which do not support DPP */
   case aco_opcode::v_fma_mix_f32:
   case aco_opcode::v_fma_mixlo_f16:
   case aco_opcode::v_fma_mixhi_f16:
   case aco_opcode::p_v_fma_mixlo_f16_rtz:
   case aco_opcode::p_v_fma_mixhi_f16_rtz:
   case aco_opcode::v_dot2_f32_f16:
   case aco_opcode::v_dot2_f32_bf16: return gfx_level >= GFX11;
   /* Anything not listed above: only non-VOP3P instructions are accepted. */
   default: return !vop3p;
   }
 }
 bool
 can_use_DPP(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool dpp8)
 {
@ -433,41 +492,7 @@ can_use_DPP(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool dpp
   if (instr->writes_exec())
      return false;
-   /* simpler than listing all VOP3P opcodes which do not support DPP */
+   return opcode_supports_dpp(gfx_level, instr->opcode, instr->isVOP3P());
   if (instr->isVOP3P()) {
      return instr->opcode == aco_opcode::v_fma_mix_f32 ||
             instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
             instr->opcode == aco_opcode::v_fma_mixhi_f16 ||
             instr->opcode == aco_opcode::p_v_fma_mixlo_f16_rtz ||
             instr->opcode == aco_opcode::p_v_fma_mixhi_f16_rtz ||
             instr->opcode == aco_opcode::v_dot2_f32_f16 ||
             instr->opcode == aco_opcode::v_dot2_f32_bf16;
   }
   if (instr->opcode == aco_opcode::v_pk_fmac_f16)
      return gfx_level < GFX11;
   /* there are more cases but those all take 64-bit inputs */
   return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
          instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
          instr->opcode != aco_opcode::v_fmamk_f32 && instr->opcode != aco_opcode::v_fmaak_f32 &&
          instr->opcode != aco_opcode::v_fmamk_f16 && instr->opcode != aco_opcode::v_fmaak_f16 &&
          instr->opcode != aco_opcode::v_readfirstlane_b32 &&
          instr->opcode != aco_opcode::v_cvt_f64_i32 &&
          instr->opcode != aco_opcode::v_cvt_f64_f32 &&
          instr->opcode != aco_opcode::v_cvt_f64_u32 && instr->opcode != aco_opcode::v_mul_lo_u32 &&
          instr->opcode != aco_opcode::v_mul_lo_i32 && instr->opcode != aco_opcode::v_mul_hi_u32 &&
          instr->opcode != aco_opcode::v_mul_hi_i32 &&
          instr->opcode != aco_opcode::v_qsad_pk_u16_u8 &&
          instr->opcode != aco_opcode::v_mqsad_pk_u16_u8 &&
          instr->opcode != aco_opcode::v_mqsad_u32_u8 &&
          instr->opcode != aco_opcode::v_mad_u64_u32 &&
          instr->opcode != aco_opcode::v_mad_i64_i32 &&
          instr->opcode != aco_opcode::v_permlane16_b32 &&
          instr->opcode != aco_opcode::v_permlanex16_b32 &&
          instr->opcode != aco_opcode::v_permlane64_b32 &&
          instr->opcode != aco_opcode::v_readlane_b32_e64 &&
          instr->opcode != aco_opcode::v_writelane_b32_e64;
 }
 aco_ptr<Instruction>
@ -889,7 +914,9 @@ needs_exec_mask(const Instruction* instr)
   if (instr->isSALU() || instr->isBranch() || instr->isSMEM() || instr->isBarrier())
      return instr->opcode == aco_opcode::s_cbranch_execz ||
             instr->opcode == aco_opcode::s_cbranch_execnz ||
-             instr->opcode == aco_opcode::s_setpc_b64 || instr->reads_exec();
+             instr->opcode == aco_opcode::s_setpc_b64 ||
             instr->opcode == aco_opcode::s_swappc_b64 || instr->opcode == aco_opcode::s_call_b64 ||
             instr->reads_exec();
   if (instr->isPseudo()) {
      switch (instr->opcode) {
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@ -2040,6 +2040,8 @@ bool can_use_opsel(amd_gfx_level gfx_level, aco_opcode op, int idx);
 bool instr_is_16bit(amd_gfx_level gfx_level, aco_opcode op);
 uint8_t get_gfx11_true16_mask(aco_opcode op);
 bool can_use_SDWA(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool pre_ra);
 bool opcode_supports_dpp(amd_gfx_level gfx_level, aco_opcode opcode, bool vop3p);
 bool can_use_DPP(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool dpp8);
 bool can_use_DPP(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool dpp8);
 bool can_write_m0(const aco_ptr<Instruction>& instr);
 /* updates "instr" and returns the old instruction (or NULL if no update was needed) */
--- a/src/amd/compiler/aco_lower_branches.cpp
+++ b/src/amd/compiler/aco_lower_branches.cpp
@ -298,7 +298,9 @@ eliminate_useless_exec_writes_in_block(branch_ctx& ctx, Block& block)
      /* blocks_incoming_exec_used is initialized to true, so this is correct even for loops. */
      if (instr->opcode == aco_opcode::s_cbranch_scc0 ||
-          instr->opcode == aco_opcode::s_cbranch_scc1) {
+          instr->opcode == aco_opcode::s_cbranch_scc1 ||
          instr->opcode == aco_opcode::s_cbranch_vccz ||
          instr->opcode == aco_opcode::s_cbranch_vccnz) {
         exec_write_used |= ctx.blocks_incoming_exec_used[instr->salu().imm];
      }
@ -377,6 +379,10 @@ can_remove_branch(branch_ctx& ctx, Block& block, Pseudo_branch_instruction* bran
      if (uniform_branch && !ctx.program->blocks[i].instructions.empty())
         return false;
      /* Don't enter loops with empty exec mask. */
      if (ctx.program->blocks[i].loop_nest_depth > block.loop_nest_depth)
         return false;
      for (aco_ptr<Instruction>& instr : ctx.program->blocks[i].instructions) {
         if (instr->isSOPP()) {
            /* Discard early exits and loop breaks and continues should work fine with
--- a/src/amd/compiler/aco_nir_call_attribs.h
+++ b/src/amd/compiler/aco_nir_call_attribs.h
@ -22,7 +22,13 @@ enum aco_nir_function_attribs {
 };
 enum aco_nir_parameter_attribs {
-   /* Parameter value is not used by any callee and does not need to be preserved */
+   /* This parameter's value may not be preserved across a callee. Unlike return parameters, the
    * parameter's value is undefined on return. Callers must back up values of discardable
    * parameters separately.
    * Mostly used for tail calls, where parameters to the tail callee have different values than
    * for the caller. In that case, on function return, the parameters will have been overwritten
    * with the tail callee parameter values.
    */
   ACO_NIR_PARAM_ATTRIB_DISCARDABLE = 0x1,
 };
--- a/src/amd/compiler/aco_opt_value_numbering.cpp
+++ b/src/amd/compiler/aco_opt_value_numbering.cpp
@ -427,6 +427,21 @@ process_block(vn_ctx& ctx, Block& block)
   block.instructions = std::move(new_instructions);
 }
 /* Rebuilds block.instructions without the instructions that is_dead() reports
  * as dead for ctx.uses. The surviving instructions keep their relative order.
  */
 void
 dce_instructions(vn_ctx& ctx, Block& block)
 {
   std::vector<aco_ptr<Instruction>> live_instrs;
   live_instrs.reserve(block.instructions.size());
   for (auto it = block.instructions.begin(); it != block.instructions.end(); ++it) {
      /* Only live instructions are carried over; dead ones are left in the old
       * vector, which is replaced below.
       */
      if (!is_dead(ctx.uses, it->get()))
         live_instrs.emplace_back(std::move(*it));
   }
   block.instructions = std::move(live_instrs);
 }
 void
 rename_phi_operands(Block& block, aco::unordered_map<uint32_t, Temp>& renames)
 {
@ -467,10 +482,12 @@ value_numbering(Program* program)
      if (block.logical_idom == (int)block.index)
         ctx.expr_values.clear();
-      if (block.logical_idom != -1)
+      if (block.logical_idom != -1) {
         process_block(ctx, block);
-      else
+      } else {
         dce_instructions(ctx, block);
         rename_phi_operands(block, ctx.renames);
      }
      /* increment exec_id when entering nested control flow */
      if (block.kind & block_kind_branch || block.kind & block_kind_loop_preheader ||
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@ -1190,7 +1190,7 @@ alu_opt_gather_info(opt_ctx& ctx, Instruction* instr, alu_opt_info& info)
      info.operands.push_back({instr->operands[0]});
      if (instr->definitions[0].regClass() == s1) {
         info.defs.push_back(instr->definitions[1]);
-         info.opcode = aco_opcode::v_lshl_b32;
+         info.opcode = aco_opcode::s_lshl_b32;
         info.format = Format::SOP2;
         std::swap(info.operands[0], info.operands[1]);
      } else {
@ -1759,6 +1759,8 @@ pseudo_can_accept_constant(const aco_ptr<Instruction>& instr, unsigned operand)
   assert(instr->operands.size() > operand);
   if (instr->operands[operand].isFixed())
      return false;
   if (!util_is_power_of_two_nonzero(instr->operands[operand].bytes()))
      return false;
   switch (instr->opcode) {
   case aco_opcode::p_extract_vector:
@ -2810,7 +2812,8 @@ label_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
               instr->operands[0] = op;
               break;
            }
-         } else if (info.is_constant()) {
+         } else if (info.is_constant() &&
                    util_is_power_of_two_nonzero(instr->definitions[0].bytes())) {
            /* propagate constants */
            uint64_t mask = u_bit_consecutive64(0, instr->definitions[0].bytes() * 8u);
            uint64_t val = (info.val >> (dst_offset * 8u)) & mask;
--- a/src/amd/compiler/aco_optimizer_postRA.cpp
+++ b/src/amd/compiler/aco_optimizer_postRA.cpp
@ -142,6 +142,10 @@ save_reg_writes(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
      ctx.instr_idx_by_regs[ctx.current_block->index][instr->pseudo().scratch_sgpr] =
         overwritten_unknown_instr;
   }
   if (instr->isCall()) {
      std::fill(ctx.instr_idx_by_regs[ctx.current_block->index].begin(),
                ctx.instr_idx_by_regs[ctx.current_block->index].end(), overwritten_unknown_instr);
   }
 }
 Idx
@ -862,6 +866,8 @@ instr_overwrites(Instruction* instr, PhysReg reg, unsigned size)
      if (scratch_reg >= reg && reg + size > scratch_reg)
         return true;
   }
   if (instr->isCall())
      return true;
   return false;
 }
--- a/src/amd/compiler/instruction_selection/aco_isel_helpers.cpp
+++ b/src/amd/compiler/instruction_selection/aco_isel_helpers.cpp
@ -672,7 +672,7 @@ build_end_with_regs(isel_context* ctx, std::vector<Operand>& regs)
 Instruction*
 add_startpgm(struct isel_context* ctx, bool is_callee)
 {
-   ctx->program->scratch_arg_size += ctx->callee_info.scratch_param_size;
+   ctx->program->scratch_arg_size += ctx->callee_info.scratch_param_size * ctx->program->wave_size;
   unsigned def_count = 0;
   for (unsigned i = 0; i < ctx->args->arg_count; i++) {
@ -1034,8 +1034,7 @@ find_param_regs(Program* program, const ABI& abi, callee_info& info,
            param_demand += Temp(0, it2->rc);
-            it2->dst_info->needs_explicit_preservation =
+            it2->dst_info->needs_explicit_preservation = regs == clobbered_regs;
               regs == clobbered_regs && !it2->dst_info->discardable;
            it2->dst_info->def.setPrecolored(*next_reg);
            for (unsigned i = 0; i < it2->rc.size(); ++i)
               BITSET_CLEAR(regs, next_reg->reg() + i);
@ -1051,8 +1050,7 @@ find_param_regs(Program* program, const ABI& abi, callee_info& info,
            next_reg = next_reg->advance(required_padding * 4);
      }
      if (next_reg) {
-         params.back().dst_info->needs_explicit_preservation =
+         params.back().dst_info->needs_explicit_preservation = regs == clobbered_regs;
            regs == clobbered_regs && !params.back().dst_info->discardable;
         param_demand += Temp(0, params.back().rc);
         params.back().dst_info->def.setPrecolored(*next_reg);
         BITSET_CLEAR_COUNT(regs, next_reg->reg(), params.back().rc.size());
--- a/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp
+++ b/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp
@ -3392,7 +3392,10 @@ visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
      offset = as_vgpr(ctx, offset);
      for (unsigned i = 0; i < write_count; i++) {
         aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
-         Instruction* mubuf = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offsets.back(),
+         Operand soffset = Operand::c32(0);
         if (!ctx->program->scratch_offsets.empty())
            soffset = Operand(ctx->program->scratch_offsets.back());
         Instruction* mubuf = bld.mubuf(op, rsrc, offset, soffset,
                                        write_datas[i], offsets[i], true);
         mubuf->mubuf().sync = memory_sync_info(storage_scratch, semantic_private);
         enum ac_access_type type =
--- a/src/amd/compiler/tests/test_assembler.cpp
+++ b/src/amd/compiler/tests/test_assembler.cpp
@ -298,6 +298,10 @@ BEGIN_TEST(assembler.long_jump.constaddr)
   if (!setup_cs(NULL, (amd_gfx_level)GFX10))
      return;
   //! llvm_version: #llvm_ver
   fprintf(output, "llvm_version: %u\n", LLVM_VERSION_MAJOR);
   //; funcs['lit'] = lambda v: 'lit(%s)' % hex(int(v)) if llvm_ver >= 22 else v
   //>> s_branch 16369                                              ; bf823ff1
   bld.sopp(aco_opcode::s_branch, 2);
@ -309,7 +313,7 @@ BEGIN_TEST(assembler.long_jump.constaddr)
   bld.reset(program->create_and_insert_block());
   //>> s_getpc_b64 s[0:1]                                          ; be801f00
-   //! s_add_u32 s0, s0, 32                                         ; 8000ff00 00000020
+   //! s_add_u32 s0, s0, @lit(32)                                   ; 8000ff00 00000020
   bld.sop1(aco_opcode::p_constaddr_getpc, Definition(PhysReg(0), s2), Operand::zero());
   bld.sop2(aco_opcode::p_constaddr_addlo, Definition(PhysReg(0), s1), bld.def(s1, scc),
            Operand(PhysReg(0), s1), Operand::zero(), Operand::zero());
@ -424,12 +428,16 @@ BEGIN_TEST(assembler.p_constaddr)
   dst0.setFixed(PhysReg(0));
   dst1.setFixed(PhysReg(2));
   //! llvm_version: #llvm_ver
   fprintf(output, "llvm_version: %u\n", LLVM_VERSION_MAJOR);
   //; funcs['lit'] = lambda v: 'lit(%s)' % hex(int(v)) if llvm_ver >= 22 else v
   //>> s_getpc_b64 s[0:1] ; be801c00
-   //! s_add_u32 s0, s0, 44 ; 8000ff00 0000002c
+   //! s_add_u32 s0, s0, @lit(44) ; 8000ff00 0000002c
   bld.pseudo(aco_opcode::p_constaddr, dst0, bld.def(s1, scc), Operand::zero());
   //! s_getpc_b64 s[2:3] ; be821c00
-   //! s_add_u32 s2, s2, 64 ; 8002ff02 00000040
+   //! s_add_u32 s2, s2, @lit(64) ; 8002ff02 00000040
   bld.pseudo(aco_opcode::p_constaddr, dst1, bld.def(s1, scc), Operand::c32(32));
   aco::lower_to_hw_instr(program.get());
@ -1056,20 +1064,23 @@ BEGIN_TEST(assembler.exp)
      Operand op_m0(bld.tmp(s1));
      op_m0.setFixed(m0);
-      //~gfx11>> exp mrt3 v1, v0, v3, v2                                     ; f800003f 02030001
+      //! mrt3: @match_func(mrt3)
-      //~gfx12>> export mrt3 v1, v0, v3, v2                                  ; f800003f 02030001
+      fprintf(output, "mrt3: mrt3%s\n", LLVM_VERSION_MAJOR >= 23 ? "," : "");
      //~gfx11>> exp @mrt3 v1, v0, v3, v2                                   ; f800003f 02030001
      //~gfx12>> export @mrt3 v1, v0, v3, v2                                ; f800003f 02030001
      bld.exp(aco_opcode::exp, op[1], op[0], op[3], op[2], 0xf, 3);
-      //~gfx11! exp mrt3 v1, off, v0, off                                   ; f8000035 80008001
+      //~gfx11! exp @mrt3 v1, off, v0, off                                  ; f8000035 80008001
-      //~gfx12! export mrt3 v1, off, v0, off                                ; f8000035 80008001
+      //~gfx12! export @mrt3 v1, off, v0, off                               ; f8000035 80008001
      bld.exp(aco_opcode::exp, op[1], Operand(v1), op[0], Operand(v1), 0x5, 3);
-      //~gfx11! exp mrt3 v1, v0, v3, v2 done                                ; f800083f 02030001
+      //~gfx11! exp @mrt3 v1, v0, v3, v2 done                               ; f800083f 02030001
-      //~gfx12! export mrt3 v1, v0, v3, v2 done                             ; f800083f 02030001
+      //~gfx12! export @mrt3 v1, v0, v3, v2 done                            ; f800083f 02030001
      bld.exp(aco_opcode::exp, op[1], op[0], op[3], op[2], 0xf, 3, false, true);
-      //~gfx11! exp mrt3 v1, v0, v3, v2 row_en                              ; f800203f 02030001
+      //~gfx11! exp @mrt3 v1, v0, v3, v2 row_en                             ; f800203f 02030001
-      //~gfx12! export mrt3 v1, v0, v3, v2 row_en                           ; f800203f 02030001
+      //~gfx12! export @mrt3 v1, v0, v3, v2 row_en                          ; f800203f 02030001
      bld.exp(aco_opcode::exp, op[1], op[0], op[3], op[2], op_m0, 0xf, 3)->exp().row_en = true;
      finish_assembler_test();
--- a/src/amd/compiler/tests/test_isel.cpp
+++ b/src/amd/compiler/tests/test_isel.cpp
@ -172,11 +172,14 @@ BEGIN_TEST(isel.discard_early_exit.mrtz)
      }
   );
   //! mrtz: @match_func(mrtz)
   fprintf(output, "mrtz: mrtz%s\n", LLVM_VERSION_MAJOR >= 23 ? "," : "");
   /* On GFX11, the discard early exit must use mrtz if the shader exports only depth. */
-   //>> exp mrtz v#_, off, off, off done    ; $_ $_
+   //>> exp @mrtz v#_, off, off, off done   ; $_ $_
   //! s_endpgm                             ; $_
   //! BB1:
-   //! exp mrtz off, off, off, off done     ; $_ $_
+   //! exp @mrtz off, off, off, off done    ; $_ $_
   //! s_endpgm                             ; $_
   PipelineBuilder pbld(get_vk_device(GFX11));
@ -197,11 +200,14 @@ BEGIN_TEST(isel.discard_early_exit.mrt0)
      }
   );
   //! mrt0: @match_func(mrt0)
   fprintf(output, "mrt0: mrt0%s\n", LLVM_VERSION_MAJOR >= 23 ? "," : "");
   /* On GFX11, the discard early exit must use mrt0 if the shader exports color. */
-   //>> exp mrt0 v#x, v#x, v#x, v#x done    ; $_ $_
+   //>> exp @mrt0 v#x, v#x, v#x, v#x done   ; $_ $_
   //! s_endpgm                             ; $_
   //! BB1:
-   //! exp mrt0 off, off, off, off done     ; $_ $_
+   //! exp @mrt0 off, off, off, off done    ; $_ $_
   //! s_endpgm                             ; $_
   PipelineBuilder pbld(get_vk_device(GFX11));
--- a/src/amd/vulkan/bvh/encode.comp
+++ b/src/amd/vulkan/bvh/encode.comp
@ -145,7 +145,7 @@ main()
                                             ir_id_to_offset(children[i]))).aabb;
            float surface_area = aabb_surface_area(bounds);
-            if (surface_area > largest_surface_area) {
+            if (surface_area > largest_surface_area || collapsed_child_index == -1) {
               largest_surface_area = surface_area;
               collapsed_child_index = i;
            }
--- a/src/amd/vulkan/bvh/encode_triangles_gfx12.comp
+++ b/src/amd/vulkan/bvh/encode_triangles_gfx12.comp
@ -328,9 +328,23 @@ main()
               vertex_used[i] = false;
         }
      } else {
-         uint32_t chosen_invocation =
+         uint32_t candidate_mask = radv_ballot(cluster, !assigned && required_bit_size == min_required_bit_size);
-            findMSB(radv_ballot(cluster, !assigned && required_bit_size == min_required_bit_size));
+
-         if (cluster.invocation_index != chosen_invocation && !assigned) {
+         /* Always choose a quad as the first node to make sure that a potential single triangle node will have the
          * highest hw_node_index.
          */
         if (assigned_mask == 0) {
            uint32_t quad_mask = radv_ballot(cluster, !assigned && pair_index_node_index1 != RADV_BVH_INVALID_NODE);
            if (quad_mask != 0) {
               uint32_t combined_mask = candidate_mask & quad_mask;
               if (combined_mask != 0)
                  candidate_mask = combined_mask;
               else
                  candidate_mask = quad_mask;
            }
         }
         if (cluster.invocation_index != findMSB(candidate_mask) && !assigned) {
            vertex_indices = UNASSIGNED_VERTEX_INDICES;
            for (uint32_t i = 0; i < 6; i++)
               vertex_used[i] = false;
--- a/src/amd/vulkan/layers/radv_sqtt_layer.c
+++ b/src/amd/vulkan/layers/radv_sqtt_layer.c
@ -778,8 +778,11 @@ sqtt_QueueSubmit2(VkQueue _queue, uint32_t submitCount, const VkSubmitInfo2 *pSu
   if (queue->sqtt_present)
      return radv_sqtt_wsi_submit(_queue, submitCount, pSubmits, _fence);
-   if (instance->vk.trace_per_submit)
+   if (instance->vk.trace_per_submit) {
      /* Make sure to lock in case of multithreaded submissions. */
      simple_mtx_lock(&device->sqtt.lock);
      radv_sqtt_start_capturing(queue);
   }
   for (uint32_t i = 0; i < submitCount; i++) {
      const VkSubmitInfo2 *pSubmit = &pSubmits[i];
@ -863,12 +866,17 @@ sqtt_QueueSubmit2(VkQueue _queue, uint32_t submitCount, const VkSubmitInfo2 *pSu
                 "radv: Failed to capture RGP for this submit because the buffer is too small and auto-resizing "
                 "is disabled. See RADV_THREAD_TRACE_BUFFER_SIZE for increasing the size.\n");
      }
      simple_mtx_unlock(&device->sqtt.lock);
   }
   return result;
 fail:
   FREE(new_cmdbufs);
   if (instance->vk.trace_per_submit) {
      simple_mtx_unlock(&device->sqtt.lock);
   }
   return result;
 }
--- a/src/amd/vulkan/layers/radv_strange_brigade.c
+++ b/src/amd/vulkan/layers/radv_strange_brigade.c
@ -0,0 +1,31 @@
 /*
 * Copyright © 2026 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */
 #include "radv_cmd_buffer.h"
 #include "radv_device.h"
 #include "radv_entrypoints.h"
 /* Layer entrypoint for CmdPipelineBarrier2: patches the first image barrier that
  * transitions to PRESENT_SRC with a color-attachment-read source access mask,
  * then forwards the call to the next CmdPipelineBarrier2 in the dispatch chain.
  */
 VKAPI_ATTR void VKAPI_CALL
 strange_brigade_CmdPipelineBarrier2(VkCommandBuffer commandBuffer, const VkDependencyInfo *pDependencyInfo)
 {
   VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
   const uint32_t barrier_count = pDependencyInfo->imageMemoryBarrierCount;
   for (uint32_t idx = 0; idx < barrier_count; idx++) {
      /* NOTE(review): the cast yields a writable pointer into the caller's
       * barrier array, so the fix below edits that array in place.
       */
      VkImageMemoryBarrier2 *img_barrier = (VkImageMemoryBarrier2 *)&pDependencyInfo->pImageMemoryBarriers[idx];
      if (img_barrier->newLayout != VK_IMAGE_LAYOUT_PRESENT_SRC_KHR)
         continue;
      if (img_barrier->srcAccessMask != VK_ACCESS_COLOR_ATTACHMENT_READ_BIT)
         continue;
      /* The game issues a broken barrier right before present which causes
       * rendering issues; replacing the source access mask works around it.
       * Only the first matching barrier is patched.
       */
      img_barrier->srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
      break;
   }
   device->layer_dispatch.app.CmdPipelineBarrier2(commandBuffer, pDependencyInfo);
 }
--- a/src/amd/vulkan/meson.build
+++ b/src/amd/vulkan/meson.build
@ -22,6 +22,7 @@ radv_entrypoints_gen_command += [
  '--device-prefix', 'rage2',
  '--device-prefix', 'quantic_dream',
  '--device-prefix', 'no_mans_sky',
  '--device-prefix', 'strange_brigade',
  # Command buffer annotation layer entrypoints
  '--device-prefix', 'annotate',
@ -42,6 +43,7 @@ libradv_files = files(
  'layers/radv_rage2.c',
  'layers/radv_quantic_dream.c',
  'layers/radv_no_mans_sky.c',
  'layers/radv_strange_brigade.c',
  'layers/radv_rmv_layer.c',
  'layers/radv_rra_layer.c',
  'layers/radv_sqtt_layer.c',
--- a/src/amd/vulkan/meta/radv_meta.h
+++ b/src/amd/vulkan/meta/radv_meta.h
@ -97,6 +97,7 @@ enum radv_meta_object_key_type {
   RADV_META_OBJECT_KEY_CLEAR_HIZ,
   RADV_META_OBJECT_KEY_FAST_CLEAR_ELIMINATE,
   RADV_META_OBJECT_KEY_DCC_DECOMPRESS,
   RADV_META_OBJECT_KEY_DCC_DECOMPRESS_CS,
   RADV_META_OBJECT_KEY_DCC_RETILE,
   RADV_META_OBJECT_KEY_HTILE_EXPAND_GFX,
   RADV_META_OBJECT_KEY_HTILE_EXPAND_CS,
--- a/src/amd/vulkan/meta/radv_meta_clear.c
+++ b/src/amd/vulkan/meta/radv_meta_clear.c
@ -1475,7 +1475,8 @@ radv_can_fast_clear_color(struct radv_cmd_buffer *cmd_buffer, const struct radv_
 static void
 radv_fast_clear_color(struct radv_cmd_buffer *cmd_buffer, const struct radv_image_view *iview,
                      const VkClearAttachment *clear_att, const VkClearRect *clear_rect,
-                      enum radv_cmd_flush_bits *pre_flush, enum radv_cmd_flush_bits *post_flush)
+                      enum radv_cmd_flush_bits *pre_flush, enum radv_cmd_flush_bits *post_flush,
                      uint32_t view_mask)
 {
   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
   const struct radv_physical_device *pdev = radv_device_physical(device);
@ -1488,7 +1489,8 @@ radv_fast_clear_color(struct radv_cmd_buffer *cmd_buffer, const struct radv_imag
      .baseMipLevel = iview->vk.base_mip_level,
      .levelCount = iview->vk.level_count,
      .baseArrayLayer = iview->vk.base_array_layer + clear_rect->baseArrayLayer,
-      .layerCount = clear_rect->layerCount,
+      /* radv_can_fast_clear_color blocks multiview fast clears unless the viewmask contains all layers */
      .layerCount = view_mask ? iview->vk.layer_count : clear_rect->layerCount,
   };
   if (pre_flush) {
@ -1575,7 +1577,7 @@ emit_clear(struct radv_cmd_buffer *cmd_buffer, const VkClearAttachment *clear_at
      if (radv_can_fast_clear_color(cmd_buffer, color_att->iview, color_att->layout, clear_rect, clear_value,
                                    view_mask)) {
-         radv_fast_clear_color(cmd_buffer, color_att->iview, clear_att, clear_rect, pre_flush, post_flush);
+         radv_fast_clear_color(cmd_buffer, color_att->iview, clear_att, clear_rect, pre_flush, post_flush, view_mask);
      } else {
         emit_color_clear(cmd_buffer, clear_att, clear_rect, view_mask);
      }
@ -1877,7 +1879,7 @@ radv_fast_clear_range(struct radv_cmd_buffer *cmd_buffer, struct radv_image *ima
   if (vk_format_is_color(format)) {
      if (radv_can_fast_clear_color(cmd_buffer, &iview, image_layout, &clear_rect, clear_att.clearValue.color, 0)) {
-         radv_fast_clear_color(cmd_buffer, &iview, &clear_att, &clear_rect, NULL, NULL);
+         radv_fast_clear_color(cmd_buffer, &iview, &clear_att, &clear_rect, NULL, NULL, 0);
         fast_cleared = true;
      }
   } else {
--- a/src/amd/vulkan/meta/radv_meta_copy.c
+++ b/src/amd/vulkan/meta/radv_meta_copy.c
@ -144,6 +144,40 @@ gfx_or_compute_copy_memory_to_image(struct radv_cmd_buffer *cmd_buffer, uint64_t
                  (use_compute ? RADV_META_SAVE_COMPUTE_PIPELINE : RADV_META_SAVE_GRAPHICS_PIPELINE) |
                     RADV_META_SAVE_CONSTANTS | RADV_META_SAVE_DESCRIPTORS);
   if (use_compute) {
      /* For partial copies, HTILE is decompressed before because image stores don't write the
       * uncompressed DWORD to HTILE. And then it's needed to re-initialize HTILE to its
       * uncompressed state after the copy.
       */
      const bool is_partial_copy = region->imageOffset.x || region->imageOffset.y || region->imageOffset.z ||
                                   region->imageExtent.width != image->vk.extent.width ||
                                   region->imageExtent.height != image->vk.extent.height ||
                                   region->imageExtent.depth != image->vk.extent.depth;
      uint32_t queue_mask = radv_image_queue_family_mask(image, cmd_buffer->qf, cmd_buffer->qf);
      if (radv_layout_is_htile_compressed(device, image, region->imageSubresource.mipLevel, layout, queue_mask) &&
          is_partial_copy) {
         radv_describe_barrier_start(cmd_buffer, RGP_BARRIER_UNKNOWN_REASON);
         u_foreach_bit (i, region->imageSubresource.aspectMask) {
            unsigned aspect_mask = 1u << i;
            radv_expand_depth_stencil(
               cmd_buffer, image,
               &(VkImageSubresourceRange){
                  .aspectMask = aspect_mask,
                  .baseMipLevel = region->imageSubresource.mipLevel,
                  .levelCount = 1,
                  .baseArrayLayer = region->imageSubresource.baseArrayLayer,
                  .layerCount = vk_image_subresource_layer_count(&image->vk, &region->imageSubresource),
               },
               NULL);
         }
         radv_describe_barrier_end(cmd_buffer);
      }
   }
   /**
    * From the Vulkan 1.0.6 spec: 18.3 Copying Data Between Images
    *    extent is the size in texels of the source image to copy in width,
@ -222,6 +256,27 @@ gfx_or_compute_copy_memory_to_image(struct radv_cmd_buffer *cmd_buffer, uint64_t
         slice_array++;
   }
   if (use_compute) {
      /* Fixup HTILE after a copy on compute. */
      uint32_t queue_mask = radv_image_queue_family_mask(image, cmd_buffer->qf, cmd_buffer->qf);
      if (radv_layout_is_htile_compressed(device, image, region->imageSubresource.mipLevel, layout, queue_mask)) {
         cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_VCACHE;
         VkImageSubresourceRange range = {
            .aspectMask = region->imageSubresource.aspectMask,
            .baseMipLevel = region->imageSubresource.mipLevel,
            .levelCount = 1,
            .baseArrayLayer = region->imageSubresource.baseArrayLayer,
            .layerCount = vk_image_subresource_layer_count(&image->vk, &region->imageSubresource),
         };
         uint32_t htile_value = radv_get_htile_initial_value(device, image);
         cmd_buffer->state.flush_bits |= radv_clear_htile(cmd_buffer, image, &range, htile_value, false);
      }
   }
   radv_meta_restore(&saved_state, cmd_buffer);
 }
@ -704,7 +759,14 @@ radv_CmdCopyImage2(VkCommandBuffer commandBuffer, const VkCopyImageInfo2 *pCopyI
      const enum util_format_layout format_layout = radv_format_description(dst_image->vk.format)->layout;
      for (unsigned r = 0; r < pCopyImageInfo->regionCount; r++) {
         VkExtent3D dst_extent = pCopyImageInfo->pRegions[r].extent;
-         if (src_image->vk.format != dst_image->vk.format) {
+
         /* The Vulkan spec 1.4.347 says:
          *
          * "VUID-VkCopyImageInfo2-srcImage-09247
          *  If the VkFormat of each of srcImage and dstImage is a compressed image format, the
          *  formats must have the same texel block extent"
          */
         if (vk_format_is_compressed(src_image->vk.format) != vk_format_is_compressed(dst_image->vk.format)) {
            dst_extent.width = dst_extent.width / vk_format_get_blockwidth(src_image->vk.format) *
                               vk_format_get_blockwidth(dst_image->vk.format);
            dst_extent.height = dst_extent.height / vk_format_get_blockheight(src_image->vk.format) *
--- a/src/amd/vulkan/meta/radv_meta_fast_clear.c
+++ b/src/amd/vulkan/meta/radv_meta_fast_clear.c
@ -8,6 +8,7 @@
 #include <stdbool.h>
 #include "nir/radv_meta_nir.h"
 #include "radv_cs.h"
 #include "radv_meta.h"
 enum radv_color_op {
@ -19,7 +20,7 @@ enum radv_color_op {
 static VkResult
 get_dcc_decompress_compute_pipeline(struct radv_device *device, VkPipeline *pipeline_out, VkPipelineLayout *layout_out)
 {
-   enum radv_meta_object_key_type key = RADV_META_OBJECT_KEY_DCC_DECOMPRESS;
+   enum radv_meta_object_key_type key = RADV_META_OBJECT_KEY_DCC_DECOMPRESS_CS;
   VkResult result;
   const VkDescriptorSetLayoutBinding bindings[] = {
@ -241,6 +242,7 @@ radv_process_color_image_layer(struct radv_cmd_buffer *cmd_buffer, struct radv_i
                               const VkImageSubresourceRange *range, int level, int layer, enum radv_color_op op)
 {
   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
   const struct radv_physical_device *pdev = radv_device_physical(device);
   struct radv_image_view iview;
   uint32_t width, height;
@ -303,9 +305,23 @@ radv_process_color_image_layer(struct radv_cmd_buffer *cmd_buffer, struct radv_i
   radv_CmdDraw(radv_cmd_buffer_to_handle(cmd_buffer), 3, 1, 0, 0);
-   if (op == FMASK_DECOMPRESS || op == DCC_DECOMPRESS)
+   if (op == FMASK_DECOMPRESS || op == DCC_DECOMPRESS) {
      /* On GFX6-8, the CB FMASK cache writes corrupted data if cache lines are flushed after their
       * context has been retired. To avoid this, we must flush the CB metadata caches immediately
       * after every FMASK decompress.
       *
       * PAL only applies this workaround on GFX6 but GFX7-8 are also affected and that matches
       * RadeonSI.
       */
      if (pdev->info.gfx_level <= GFX8 && op == FMASK_DECOMPRESS) {
         radeon_begin(cmd_buffer->cs);
         radeon_event_write(V_028A90_FLUSH_AND_INV_CB_META);
         radeon_end();
      }
      cmd_buffer->state.flush_bits |= radv_src_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
                                                            VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT, 0, image, range);
   }
   const VkRenderingEndInfoKHR end_info = {
      .sType = VK_STRUCTURE_TYPE_RENDERING_END_INFO_KHR,
--- a/src/amd/vulkan/meta/radv_meta_resolve_cs.c
+++ b/src/amd/vulkan/meta/radv_meta_resolve_cs.c
@ -467,7 +467,9 @@ radv_meta_resolve_depth_stencil_cs(struct radv_cmd_buffer *cmd_buffer, struct ra
   radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
-   const uint32_t push_constants[2] = {region->srcOffset.x, region->srcOffset.y};
+   const uint32_t push_constants[5] = {
      region->srcOffset.x, region->srcOffset.y, region->dstOffset.x, region->dstOffset.y, region->dstOffset.z,
   };
   const VkPushConstantsInfoKHR pc_info = {
      .sType = VK_STRUCTURE_TYPE_PUSH_CONSTANTS_INFO_KHR,
--- a/src/amd/vulkan/meta/radv_meta_resolve_fs.c
+++ b/src/amd/vulkan/meta/radv_meta_resolve_fs.c
@ -669,8 +669,8 @@ radv_meta_resolve_depth_stencil_fs(struct radv_cmd_buffer *cmd_buffer, struct ra
   radv_CmdSetViewport(radv_cmd_buffer_to_handle(cmd_buffer), 0, 1,
                       &(VkViewport){
-                          .x = region->srcOffset.x,
+                          .x = region->dstOffset.x,
-                          .y = region->srcOffset.y,
+                          .y = region->dstOffset.y,
                          .width = region->extent.width,
                          .height = region->extent.height,
                          .minDepth = 0.0f,
@ -679,6 +679,22 @@ radv_meta_resolve_depth_stencil_fs(struct radv_cmd_buffer *cmd_buffer, struct ra
   radv_CmdSetScissor(radv_cmd_buffer_to_handle(cmd_buffer), 0, 1, &resolve_area);
   const uint32_t push_constants[2] = {
      region->srcOffset.x - region->dstOffset.x,
      region->srcOffset.y - region->dstOffset.y,
   };
   const VkPushConstantsInfoKHR push_constants_info = {
      .sType = VK_STRUCTURE_TYPE_PUSH_CONSTANTS_INFO,
      .layout = layout,
      .stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT,
      .offset = 0,
      .size = sizeof(push_constants),
      .pValues = push_constants,
   };
   radv_CmdPushConstants2(radv_cmd_buffer_to_handle(cmd_buffer), &push_constants_info);
   radv_CmdDraw(radv_cmd_buffer_to_handle(cmd_buffer), 3, 1, 0, 0);
   const VkRenderingEndInfoKHR end_info = {
--- a/src/amd/vulkan/nir/radv_meta_nir.c
+++ b/src/amd/vulkan/nir/radv_meta_nir.c
@ -1395,19 +1395,21 @@ radv_meta_nir_build_depth_stencil_resolve_compute_shader(struct radv_device *dev
   nir_def *global_id = radv_meta_nir_get_global_ids(&b, 3);
-   nir_def *offset = nir_load_push_constant(&b, 2, 32, nir_imm_int(&b, 0), .range = 8);
+   nir_def *src_offset = nir_load_push_constant(&b, 2, 32, nir_imm_int(&b, 0), .range = 8);
   nir_def *dst_offset = nir_load_push_constant(&b, 3, 32, nir_imm_int(&b, 8), .range = 20);
-   nir_def *resolve_coord = nir_iadd(&b, nir_trim_vector(&b, global_id, 2), offset);
+   nir_def *src_coord = nir_iadd(&b, nir_trim_vector(&b, global_id, 2), src_offset);
   nir_def *dst_coord = nir_iadd(&b, global_id, dst_offset);
-   nir_def *img_coord =
+   nir_def *src_img_coord =
-      nir_vec3(&b, nir_channel(&b, resolve_coord, 0), nir_channel(&b, resolve_coord, 1), nir_channel(&b, global_id, 2));
+      nir_vec3(&b, nir_channel(&b, src_coord, 0), nir_channel(&b, src_coord, 1), nir_channel(&b, global_id, 2));
   nir_deref_instr *input_img_deref = nir_build_deref_var(&b, input_img);
-   nir_def *outval = nir_txf_ms(&b, img_coord, nir_imm_int(&b, 0), .texture_deref = input_img_deref);
+   nir_def *outval = nir_txf_ms(&b, src_img_coord, nir_imm_int(&b, 0), .texture_deref = input_img_deref);
   if (resolve_mode != VK_RESOLVE_MODE_SAMPLE_ZERO_BIT) {
      for (int i = 1; i < samples; i++) {
-         nir_def *si = nir_txf_ms(&b, img_coord, nir_imm_int(&b, i), .texture_deref = input_img_deref);
+         nir_def *si = nir_txf_ms(&b, src_img_coord, nir_imm_int(&b, i), .texture_deref = input_img_deref);
         switch (resolve_mode) {
         case VK_RESOLVE_MODE_AVERAGE_BIT:
@ -1435,8 +1437,8 @@ radv_meta_nir_build_depth_stencil_resolve_compute_shader(struct radv_device *dev
         outval = nir_fdiv_imm(&b, outval, samples);
   }
-   nir_def *coord = nir_vec4(&b, nir_channel(&b, img_coord, 0), nir_channel(&b, img_coord, 1),
+   nir_def *coord = nir_vec4(&b, nir_channel(&b, dst_coord, 0), nir_channel(&b, dst_coord, 1),
-                             nir_channel(&b, img_coord, 2), nir_undef(&b, 1, 32));
+                             nir_channel(&b, dst_coord, 2), nir_undef(&b, 1, 32));
   nir_image_deref_store(&b, &nir_build_deref_var(&b, output_img)->def, coord, nir_undef(&b, 1, 32), outval,
                         nir_imm_int(&b, 0), .image_dim = GLSL_SAMPLER_DIM_2D, .image_array = true);
   return b.shader;
@ -1495,10 +1497,11 @@ radv_meta_nir_build_depth_stencil_resolve_fragment_shader(struct radv_device *de
   fs_out->data.location = index == RADV_META_DEPTH_RESOLVE ? FRAG_RESULT_DEPTH : FRAG_RESULT_STENCIL;
   nir_def *pos_in = nir_trim_vector(&b, nir_load_frag_coord(&b), 2);
   nir_def *src_offset = nir_load_push_constant(&b, 2, 32, nir_imm_int(&b, 0), .range = 8);
   nir_def *pos_int = nir_f2i32(&b, pos_in);
-   nir_def *img_coord = nir_trim_vector(&b, pos_int, 2);
+   nir_def *img_coord = nir_trim_vector(&b, nir_iadd(&b, pos_int, src_offset), 2);
   nir_deref_instr *input_img_deref = nir_build_deref_var(&b, input_img);
   nir_def *outval = nir_txf_ms(&b, img_coord, nir_imm_int(&b, 0), .texture_deref = input_img_deref);
--- a/src/amd/vulkan/nir/radv_nir_lower_call_abi.c
+++ b/src/amd/vulkan/nir/radv_nir_lower_call_abi.c
@ -114,11 +114,32 @@ gather_tail_call_instrs_block(nir_function *caller, const struct nir_block *bloc
      if (call->callee->num_params != caller->num_params)
         return;
-      for (unsigned i = 0; i < call->num_params; ++i) {
+      for (unsigned i = 0; i < call->callee->num_params; ++i) {
         if (call->callee->params[i].is_return != caller->params[i].is_return)
            return;
         if ((call->callee->params[i].driver_attributes & ACO_NIR_PARAM_ATTRIB_DISCARDABLE) &&
             !(caller->params[i].driver_attributes & ACO_NIR_PARAM_ATTRIB_DISCARDABLE))
            return;
         bool has_preserved_regs =
            (caller->driver_attributes & ACO_NIR_FUNCTION_ATTRIB_ABI_MASK) == ACO_NIR_CALL_ABI_AHIT_ISEC;
         if (has_preserved_regs && ((call->callee->params[i].driver_attributes & ACO_NIR_PARAM_ATTRIB_DISCARDABLE) !=
                                    (caller->params[i].driver_attributes & ACO_NIR_PARAM_ATTRIB_DISCARDABLE)))
            return;
         if (call->callee->params[i].is_uniform != caller->params[i].is_uniform)
            return;
         if (call->callee->params[i].bit_size != caller->params[i].bit_size)
            return;
         if (call->callee->params[i].num_components != caller->params[i].num_components)
            return;
      }
      /* The call instruction itself has not been lowered to the new signature yet, so do this in a separate loop and
       * adjust parameter indices for the caller.
       */
      for (unsigned i = 0; i < call->num_params; ++i) {
         unsigned caller_param_idx = i + ACO_NIR_CALL_SYSTEM_ARG_COUNT;
         /* We can only do tail calls if the caller returns exactly the callee return values */
-         if (caller->params[i].is_return) {
+         if (caller->params[caller_param_idx].is_return) {
            assert(nir_def_as_deref_or_null(call->params[i].ssa));
            nir_deref_instr *deref_root = nir_def_as_deref(call->params[i].ssa);
            while (nir_deref_instr_parent(deref_root))
@ -129,16 +150,18 @@ gather_tail_call_instrs_block(nir_function *caller, const struct nir_block *bloc
            nir_intrinsic_instr *intrin = nir_def_as_intrinsic_or_null(deref_root->parent.ssa);
            if (!intrin || intrin->intrinsic != nir_intrinsic_load_param)
               return;
-            /* The call parameters aren't lowered at this point, we need to add the call arg count here */
+            if (nir_intrinsic_param_idx(intrin) != caller_param_idx)
-            if (nir_intrinsic_param_idx(intrin) != i + ACO_NIR_CALL_SYSTEM_ARG_COUNT)
+               return;
         } else if (!(caller->params[caller_param_idx].driver_attributes & ACO_NIR_PARAM_ATTRIB_DISCARDABLE)) {
            /* If the parameter is not marked as discardable, then we have to preserve the caller's value. Passing
             * a modified value to a tail call leaves us unable to restore the original value, so bail out if we have
             * modified parameters.
             */
            nir_intrinsic_instr *intrin = nir_def_as_intrinsic_or_null(call->params[i].ssa);
            if (!intrin || intrin->intrinsic != nir_intrinsic_load_param ||
                nir_intrinsic_param_idx(intrin) != caller_param_idx)
               return;
         }
         if (call->callee->params[i].is_uniform != caller->params[i].is_uniform)
            return;
         if (call->callee->params[i].bit_size != caller->params[i].bit_size)
            return;
         if (call->callee->params[i].num_components != caller->params[i].num_components)
            return;
      }
      _mesa_set_add(tail_calls, instr);
--- a/src/amd/vulkan/nir/radv_nir_lower_ray_queries.c
+++ b/src/amd/vulkan/nir/radv_nir_lower_ray_queries.c
@ -144,6 +144,7 @@ radv_get_ray_query_type()
 struct ray_query_vars {
   nir_variable *var;
   bool use_bvh_stack_rtn;
   bool shared_stack;
   uint32_t shared_base;
   uint32_t stack_entries;
@ -162,13 +163,24 @@ init_ray_query_vars(nir_shader *shader, const glsl_type *opaque_type, struct ray
   uint32_t shared_stack_entries = shader->info.ray_queries == 1 ? 16 : 8;
   /* ds_bvh_stack* instructions use a fixed stride of 32 dwords. */
   if (radv_use_bvh_stack_rtn(pdev))
-      workgroup_size = MAX2(workgroup_size, 32);
+      workgroup_size = align(workgroup_size, 32);
   uint32_t shared_stack_size = workgroup_size * shared_stack_entries * 4;
   uint32_t shared_offset = align(shader->info.shared_size, 4);
   if (shader->info.stage != MESA_SHADER_COMPUTE || glsl_type_is_array(opaque_type) ||
       shared_offset + shared_stack_size > pdev->max_shared_size) {
      dst->stack_entries = MAX_SCRATCH_STACK_ENTRY_COUNT;
   } else {
      if (radv_use_bvh_stack_rtn(pdev)) {
         /* The hardware ds_bvh_stack_rtn address can only encode a stack base up to 8191 dwords, or 16383 dwords on
          * gfx12+.
          */
         uint32_t num_wave32_groups = workgroup_size / 32;
         uint32_t max_group_stack_base = (num_wave32_groups - 1) * 32 * shared_stack_entries;
         uint32_t max_stack_base = (shared_offset / 4) + max_group_stack_base;
         uint32_t max_hw_stack_base = pdev->info.gfx_level >= GFX12 ? 16384 : 8192;
         dst->use_bvh_stack_rtn = max_stack_base < max_hw_stack_base;
      }
      dst->shared_stack = true;
      dst->shared_base = shared_offset;
      dst->stack_entries = shared_stack_entries;
@ -303,7 +315,7 @@ lower_rq_initialize(nir_builder *b, nir_intrinsic_instr *instr, struct ray_query
   if (vars->shared_stack) {
      nir_def *stack_idx = nir_load_local_invocation_index(b);
-      if (radv_use_bvh_stack_rtn(pdev)) {
+      if (vars->use_bvh_stack_rtn) {
         uint32_t workgroup_size =
            b->shader->info.workgroup_size[0] * b->shader->info.workgroup_size[1] * b->shader->info.workgroup_size[2];
         nir_def *addr =
@ -312,7 +324,6 @@ lower_rq_initialize(nir_builder *b, nir_intrinsic_instr *instr, struct ray_query
         rq_store(b, rq, trav_stack_low_watermark, addr);
      } else {
         nir_def *base_offset = nir_imul_imm(b, stack_idx, sizeof(uint32_t));
         base_offset = nir_iadd_imm(b, base_offset, vars->shared_base);
         rq_store(b, rq, trav_stack, base_offset);
         rq_store(b, rq, trav_stack_low_watermark, base_offset);
      }
@ -482,7 +493,7 @@ store_stack_entry(nir_builder *b, nir_def *index, nir_def *value, const struct r
   struct traversal_data *data = args->data;
   if (data->vars->shared_stack)
-      nir_store_shared(b, value, index, .base = 0, .align_mul = 4);
+      nir_store_shared(b, value, index, .base = data->vars->shared_base, .align_mul = 4);
   else
      nir_store_deref(b, nir_build_deref_array(b, rq_deref(b, data->rq, stack), index), value, 0x1);
 }
@ -493,7 +504,7 @@ load_stack_entry(nir_builder *b, nir_def *index, const struct radv_ray_traversal
   struct traversal_data *data = args->data;
   if (data->vars->shared_stack)
-      return nir_load_shared(b, 1, 32, index, .base = 0, .align_mul = 4);
+      return nir_load_shared(b, 1, 32, index, .base = data->vars->shared_base, .align_mul = 4);
   else
      return nir_load_deref(b, nir_build_deref_array(b, rq_deref(b, data->rq, stack), index));
 }
@ -563,19 +574,16 @@ lower_rq_proceed(nir_builder *b, nir_intrinsic_instr *instr, struct ray_query_va
   };
   if (vars->shared_stack) {
-      args.use_bvh_stack_rtn = radv_use_bvh_stack_rtn(pdev);
+      args.use_bvh_stack_rtn = vars->use_bvh_stack_rtn;
      if (args.use_bvh_stack_rtn) {
         args.stack_stride = 1;
         args.stack_base = 0;
      } else {
         uint32_t workgroup_size =
            b->shader->info.workgroup_size[0] * b->shader->info.workgroup_size[1] * b->shader->info.workgroup_size[2];
         args.stack_stride = workgroup_size * 4;
         args.stack_base = vars->shared_base;
      }
   } else {
      args.stack_stride = 1;
      args.stack_base = 0;
   }
   rq_store(b, rq, break_flag, nir_imm_false(b));
--- a/src/amd/vulkan/nir/radv_nir_rt_common.c
+++ b/src/amd/vulkan/nir/radv_nir_rt_common.c
@ -560,15 +560,12 @@ create_bvh_descriptor(nir_builder *b, const struct radv_physical_device *pdev, s
      /* Enable pointer flags on GFX11+ */
      dword3 |= BITFIELD_BIT(119 - 96);
-      /* Instead of the default box sorting (closest point), use largest for terminate_on_first_hit rays and midpoint
+      /* Instead of the default box sorting (closest point), use largest for terminate_on_first_hit rays;
-       * for closest hit; this makes it more likely that the ray traversal will visit fewer nodes. */
+       * this makes it more likely that the ray traversal will visit fewer nodes. */
      const uint32_t box_sort_largest = 1;
      const uint32_t box_sort_midpoint = 2;
-      /* Only use largest/midpoint sorting when all invocations have the same ray flags, otherwise
+      /* Only use largest sorting when all invocations have the same ray flags, otherwise
       * fall back to the default closest point. */
      dword1 = nir_bcsel(b, nir_vote_any(b, 1, ray_flags->terminate_on_first_hit), dword1,
                         nir_imm_int(b, (box_sort_midpoint << 21) | sort_triangles_first | box_sort_enable));
      dword1 = nir_bcsel(b, nir_vote_all(b, 1, ray_flags->terminate_on_first_hit),
                         nir_imm_int(b, (box_sort_largest << 21) | sort_triangles_first | box_sort_enable), dword1);
   }
@ -878,7 +875,7 @@ radv_build_ray_traversal(struct radv_device *device, nir_builder *b, const struc
         /* Early exit if we never overflowed the stack, to avoid having to backtrack to
          * the root for no reason. */
         if (!args->use_bvh_stack_rtn) {
-            nir_push_if(b, nir_ilt_imm(b, nir_load_deref(b, args->vars.stack), args->stack_base + args->stack_stride));
+            nir_push_if(b, nir_ilt_imm(b, nir_load_deref(b, args->vars.stack), args->stack_stride));
            {
               nir_store_var(b, incomplete, nir_imm_false(b), 0x1);
               nir_jump(b, nir_jump_break);
@ -1174,7 +1171,7 @@ radv_build_ray_traversal_gfx12(struct radv_device *device, nir_builder *b, const
         /* Early exit if we never overflowed the stack, to avoid having to backtrack to
          * the root for no reason. */
         if (!args->use_bvh_stack_rtn) {
-            nir_push_if(b, nir_ilt_imm(b, nir_load_deref(b, args->vars.stack), args->stack_base + args->stack_stride));
+            nir_push_if(b, nir_ilt_imm(b, nir_load_deref(b, args->vars.stack), args->stack_stride));
            {
               nir_store_var(b, incomplete, nir_imm_false(b), 0x1);
               nir_jump(b, nir_jump_break);
--- a/src/amd/vulkan/nir/radv_nir_rt_common.h
+++ b/src/amd/vulkan/nir/radv_nir_rt_common.h
@ -135,10 +135,9 @@ struct radv_ray_traversal_args {
   struct radv_ray_traversal_vars vars;
   /* The increment/decrement used for radv_ray_traversal_vars::stack, and how many entries are
-    * available. stack_base is the base address of the stack. */
+    * available. */
   uint32_t stack_stride;
   uint32_t stack_entries;
   uint32_t stack_base;
   uint32_t set_flags;
   uint32_t unset_flags;
--- a/src/amd/vulkan/nir/radv_nir_rt_stage_functions.c
+++ b/src/amd/vulkan/nir/radv_nir_rt_stage_functions.c
@ -39,7 +39,7 @@ radv_nir_init_traversal_params(nir_function *function, unsigned payload_size)
   function->params = rzalloc_array_size(function->shader, sizeof(nir_parameter), function->num_params);
   radv_nir_init_common_rt_params(function);
   radv_nir_param_from_type(function->params + TRAVERSAL_ARG_TRAVERSAL_ADDR, glsl_uint64_t_type(), true, 0);
-   radv_nir_param_from_type(function->params + TRAVERSAL_ARG_SHADER_RECORD_PTR, glsl_uint64_t_type(), false, 0);
+   radv_nir_param_from_type(function->params + TRAVERSAL_ARG_SHADER_RECORD_PTR, glsl_uint64_t_type(), false, ACO_NIR_PARAM_ATTRIB_DISCARDABLE);
   radv_nir_param_from_type(function->params + TRAVERSAL_ARG_ACCEL_STRUCT, glsl_uint64_t_type(), false, 0);
   radv_nir_param_from_type(function->params + TRAVERSAL_ARG_CULL_MASK_AND_FLAGS, glsl_uint_type(), false, 0);
   radv_nir_param_from_type(function->params + TRAVERSAL_ARG_SBT_OFFSET, glsl_uint_type(), false, 0);
@ -49,12 +49,13 @@ radv_nir_init_traversal_params(nir_function *function, unsigned payload_size)
   radv_nir_param_from_type(function->params + TRAVERSAL_ARG_RAY_TMIN, glsl_float_type(), false, 0);
   radv_nir_param_from_type(function->params + TRAVERSAL_ARG_RAY_DIRECTION, glsl_vector_type(GLSL_TYPE_UINT, 3), false,
                            0);
-   radv_nir_param_from_type(function->params + TRAVERSAL_ARG_RAY_TMAX, glsl_float_type(), false, 0);
+   radv_nir_param_from_type(function->params + TRAVERSAL_ARG_RAY_TMAX, glsl_float_type(), false,
-   radv_nir_param_from_type(function->params + TRAVERSAL_ARG_PRIMITIVE_ADDR, glsl_uint64_t_type(), false, 0);
+                            ACO_NIR_PARAM_ATTRIB_DISCARDABLE);
-   radv_nir_param_from_type(function->params + TRAVERSAL_ARG_PRIMITIVE_ID, glsl_uint_type(), false, 0);
+   radv_nir_param_from_type(function->params + TRAVERSAL_ARG_PRIMITIVE_ADDR, glsl_uint64_t_type(), false, ACO_NIR_PARAM_ATTRIB_DISCARDABLE);
-   radv_nir_param_from_type(function->params + TRAVERSAL_ARG_INSTANCE_ADDR, glsl_uint64_t_type(), false, 0);
+   radv_nir_param_from_type(function->params + TRAVERSAL_ARG_PRIMITIVE_ID, glsl_uint_type(), false, ACO_NIR_PARAM_ATTRIB_DISCARDABLE);
-   radv_nir_param_from_type(function->params + TRAVERSAL_ARG_GEOMETRY_ID_AND_FLAGS, glsl_uint_type(), false, 0);
+   radv_nir_param_from_type(function->params + TRAVERSAL_ARG_INSTANCE_ADDR, glsl_uint64_t_type(), false, ACO_NIR_PARAM_ATTRIB_DISCARDABLE);
-   radv_nir_param_from_type(function->params + TRAVERSAL_ARG_HIT_KIND, glsl_uint_type(), false, 0);
+   radv_nir_param_from_type(function->params + TRAVERSAL_ARG_GEOMETRY_ID_AND_FLAGS, glsl_uint_type(), false,  ACO_NIR_PARAM_ATTRIB_DISCARDABLE);
   radv_nir_param_from_type(function->params + TRAVERSAL_ARG_HIT_KIND, glsl_uint_type(), false, ACO_NIR_PARAM_ATTRIB_DISCARDABLE);
   for (unsigned i = 0; i < DIV_ROUND_UP(payload_size, 4); ++i) {
      radv_nir_return_param_from_type(function->params + TRAVERSAL_ARG_PAYLOAD_BASE + i, glsl_uint_type(), false, 0);
   }
@ -128,15 +129,11 @@ radv_nir_init_rt_function_params(nir_function *function, mesa_shader_stage stage
      radv_nir_init_common_rt_params(function);
      radv_nir_param_from_type(function->params + CHIT_MISS_ARG_TRAVERSAL_ADDR, glsl_uint64_t_type(), true, 0);
      radv_nir_param_from_type(function->params + CHIT_MISS_ARG_SHADER_RECORD_PTR, glsl_uint64_t_type(), false, 0);
-      radv_nir_param_from_type(function->params + CHIT_MISS_ARG_ACCEL_STRUCT, glsl_uint64_t_type(), false,
+      radv_nir_param_from_type(function->params + CHIT_MISS_ARG_ACCEL_STRUCT, glsl_uint64_t_type(), false, 0);
                               ACO_NIR_PARAM_ATTRIB_DISCARDABLE);
      radv_nir_param_from_type(function->params + CHIT_MISS_ARG_CULL_MASK_AND_FLAGS, glsl_uint_type(), false, 0);
-      radv_nir_param_from_type(function->params + CHIT_MISS_ARG_SBT_OFFSET, glsl_uint_type(), false,
+      radv_nir_param_from_type(function->params + CHIT_MISS_ARG_SBT_OFFSET, glsl_uint_type(), false, 0);
-                               ACO_NIR_PARAM_ATTRIB_DISCARDABLE);
+      radv_nir_param_from_type(function->params + CHIT_MISS_ARG_SBT_STRIDE, glsl_uint_type(), false, 0);
-      radv_nir_param_from_type(function->params + CHIT_MISS_ARG_SBT_STRIDE, glsl_uint_type(), false,
+      radv_nir_param_from_type(function->params + CHIT_MISS_ARG_MISS_INDEX, glsl_uint_type(), false, 0);
                               ACO_NIR_PARAM_ATTRIB_DISCARDABLE);
      radv_nir_param_from_type(function->params + CHIT_MISS_ARG_MISS_INDEX, glsl_uint_type(), false,
                               ACO_NIR_PARAM_ATTRIB_DISCARDABLE);
      radv_nir_param_from_type(function->params + CHIT_MISS_ARG_RAY_ORIGIN, glsl_vector_type(GLSL_TYPE_UINT, 3), false,
                               0);
      radv_nir_param_from_type(function->params + CHIT_MISS_ARG_RAY_TMIN, glsl_float_type(), false, 0);
--- a/src/amd/vulkan/nir/radv_nir_rt_traversal_shader.c
+++ b/src/amd/vulkan/nir/radv_nir_rt_traversal_shader.c
@ -1251,7 +1251,6 @@ radv_build_traversal(struct radv_device *device, struct radv_ray_tracing_pipelin
      .vars = trav_vars_args,
      .stack_stride = stack_stride,
      .stack_entries = MAX_STACK_ENTRY_COUNT,
      .stack_base = 0,
      .ignore_cull_mask = params->ignore_cull_mask,
      .set_flags = info ? info->set_flags : 0,
      .unset_flags = info ? info->unset_flags : 0,
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@ -9550,9 +9550,9 @@ radv_handle_color_fbfetch_output(struct radv_cmd_buffer *cmd_buffer, uint32_t in
   radv_describe_barrier_start(cmd_buffer, RGP_BARRIER_UNKNOWN_REASON);
   /* Force a transition to FEEDBACK_LOOP_OPTIMAL to decompress DCC. */
-   radv_handle_image_transition(cmd_buffer, att->iview->image, att->layout,
+   radv_handle_rendering_image_transition(
-                                VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT, RADV_QUEUE_GENERAL,
+      cmd_buffer, att->iview, render->layer_count, render->view_mask, att->layout, VK_IMAGE_LAYOUT_UNDEFINED,
-                                RADV_QUEUE_GENERAL, &range, NULL);
+      VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT, VK_IMAGE_LAYOUT_UNDEFINED, NULL);
   radv_describe_barrier_end(cmd_buffer);
@ -9597,9 +9597,10 @@ radv_handle_depth_fbfetch_output(struct radv_cmd_buffer *cmd_buffer)
   radv_describe_barrier_start(cmd_buffer, RGP_BARRIER_UNKNOWN_REASON);
   /* Force a transition to FEEDBACK_LOOP_OPTIMAL to decompress HTILE. */
-   radv_handle_image_transition(cmd_buffer, att->iview->image, att->layout,
+   radv_handle_rendering_image_transition(cmd_buffer, att->iview, render->layer_count, render->view_mask, att->layout,
-                                VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT, RADV_QUEUE_GENERAL,
+                                          att->stencil_layout, VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT,
-                                RADV_QUEUE_GENERAL, &range, NULL);
+                                          VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT,
                                          render->sample_locations.count > 0 ? &render->sample_locations : NULL);
   radv_describe_barrier_end(cmd_buffer);
@ -9642,16 +9643,19 @@ radv_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCou
   VK_FROM_HANDLE(radv_cmd_buffer, primary, commandBuffer);
   struct radv_device *device = radv_cmd_buffer_device(primary);
   const struct radv_physical_device *pdev = radv_device_physical(device);
   const bool is_gfx_or_ace = primary->qf == RADV_QUEUE_GENERAL || primary->qf == RADV_QUEUE_COMPUTE;
   assert(commandBufferCount > 0);
-   radv_emit_mip_change_flush_default(primary);
+   if (is_gfx_or_ace) {
      radv_emit_mip_change_flush_default(primary);
-   /* Emit pending flushes on primary prior to executing secondary */
+      /* Emit pending flushes on primary prior to executing secondary */
-   radv_emit_cache_flush(primary);
+      radv_emit_cache_flush(primary);
-   /* Make sure CP DMA is idle on primary prior to executing secondary. */
+      /* Make sure CP DMA is idle on primary prior to executing secondary. */
-   radv_cp_dma_wait_for_idle(primary);
+      radv_cp_dma_wait_for_idle(primary);
   }
   for (uint32_t i = 0; i < commandBufferCount; i++) {
      VK_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]);
@ -9694,6 +9698,9 @@ radv_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCou
         if (primary->state.dirty & RADV_CMD_DIRTY_FBFETCH_OUTPUT) {
            radv_handle_fbfetch_output(primary);
            primary->state.dirty &= ~RADV_CMD_DIRTY_FBFETCH_OUTPUT;
            /* Emit pending flushes if a late decompression was performed. */
            radv_emit_cache_flush(primary);
         }
         if (primary->state.render.active && (primary->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)) {
@ -9769,23 +9776,12 @@ radv_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCou
      device->ws->cs_execute_secondary(primary_cs->b, secondary_cs->b, allow_ib2);
-      /* When the secondary command buffer is compute only we don't
+      primary->state.emitted_graphics_pipeline = secondary->state.emitted_graphics_pipeline;
-       * need to re-emit the current graphics pipeline.
+      primary->state.emitted_compute_pipeline = secondary->state.emitted_compute_pipeline;
-       */
+      primary->state.emitted_rt_pipeline = secondary->state.emitted_rt_pipeline;
      if (secondary->state.emitted_graphics_pipeline) {
         primary->state.emitted_graphics_pipeline = secondary->state.emitted_graphics_pipeline;
      }
-      /* When the secondary command buffer is graphics only we don't
+      primary->state.ps_epilog = secondary->state.ps_epilog;
-       * need to re-emit the current compute pipeline.
+      primary->state.emitted_vs_prolog = secondary->state.emitted_vs_prolog;
       */
      if (secondary->state.emitted_compute_pipeline) {
         primary->state.emitted_compute_pipeline = secondary->state.emitted_compute_pipeline;
      }
      if (secondary->state.emitted_rt_pipeline) {
         primary->state.emitted_rt_pipeline = secondary->state.emitted_rt_pipeline;
      }
      if (secondary->state.last_ia_multi_vgt_param) {
         primary->state.last_ia_multi_vgt_param = secondary->state.last_ia_multi_vgt_param;
@ -10389,13 +10385,17 @@ radv_cs_emit_compute_predication(const struct radv_device *device, struct radv_c
 }
 ALWAYS_INLINE static void
-radv_gfx12_emit_hiz_wa(const struct radv_device *device, const struct radv_cmd_state *cmd_state,
+radv_gfx12_emit_wa(const struct radv_device *device, const struct radv_cmd_state *cmd_state, struct radv_cmd_stream *cs)
                       struct radv_cmd_stream *cs)
 {
   const struct radv_physical_device *pdev = radv_device_physical(device);
   const struct radv_rendering_state *render = &cmd_state->render;
   const bool hiz_partial_wa_enabled = pdev->gfx12_hiz_wa == RADV_GFX12_HIZ_WA_PARTIAL && render->gfx12_has_hiz;
   const bool vrs_export_wa_enabled = pdev->info.has_vrs_export_bug && cmd_state->last_vgt_shader &&
                                      cmd_state->last_vgt_shader->info.outinfo.writes_primitive_shading_rate;
-   if (pdev->gfx12_hiz_wa == RADV_GFX12_HIZ_WA_PARTIAL && render->gfx12_has_hiz) {
+   /* Emit BOP events to mitigate some hardware bugs on GFX12. */
   if (hiz_partial_wa_enabled || vrs_export_wa_enabled) {
      assert(pdev->info.gfx_level == GFX12);
      radeon_begin(cs);
      radeon_emit(PKT3(PKT3_RELEASE_MEM, 6, 0));
      radeon_emit(S_490_EVENT_TYPE(V_028A90_BOTTOM_OF_PIPE_TS) | S_490_EVENT_INDEX(5));
@ -10421,7 +10421,7 @@ radv_cs_emit_draw_packet(struct radv_cmd_buffer *cmd_buffer, uint32_t vertex_cou
   radeon_emit(V_0287F0_DI_SRC_SEL_AUTO_INDEX | use_opaque);
   radeon_end();
-   radv_gfx12_emit_hiz_wa(device, &cmd_buffer->state, cs);
+   radv_gfx12_emit_wa(device, &cmd_buffer->state, cs);
 }
 /**
@ -10451,7 +10451,7 @@ radv_cs_emit_draw_indexed_packet(struct radv_cmd_buffer *cmd_buffer, uint64_t in
   radeon_emit(V_0287F0_DI_SRC_SEL_DMA | S_0287F0_NOT_EOP(not_eop));
   radeon_end();
-   radv_gfx12_emit_hiz_wa(device, &cmd_buffer->state, cs);
+   radv_gfx12_emit_wa(device, &cmd_buffer->state, cs);
 }
 /* MUST inline this function to avoid massive perf loss in drawoverhead */
@ -10503,7 +10503,7 @@ radv_cs_emit_indirect_draw_packet(struct radv_cmd_buffer *cmd_buffer, bool index
   radeon_end();
-   radv_gfx12_emit_hiz_wa(device, &cmd_buffer->state, cs);
+   radv_gfx12_emit_wa(device, &cmd_buffer->state, cs);
   cmd_buffer->state.uses_draw_indirect = true;
 }
@ -10549,7 +10549,7 @@ radv_cs_emit_indirect_mesh_draw_packet(struct radv_cmd_buffer *cmd_buffer, uint3
   radeon_emit(V_0287F0_DI_SRC_SEL_AUTO_INDEX);
   radeon_end();
-   radv_gfx12_emit_hiz_wa(device, &cmd_buffer->state, cs);
+   radv_gfx12_emit_wa(device, &cmd_buffer->state, cs);
 }
 ALWAYS_INLINE static void
@ -10633,7 +10633,7 @@ radv_cs_emit_dispatch_taskmesh_gfx_packet(const struct radv_device *device, cons
   radeon_emit(V_0287F0_DI_SRC_SEL_AUTO_INDEX);
   radeon_end();
-   radv_gfx12_emit_hiz_wa(device, cmd_state, cs);
+   radv_gfx12_emit_wa(device, cmd_state, cs);
 }
 ALWAYS_INLINE static void
@ -10937,7 +10937,7 @@ radv_cs_emit_mesh_dispatch_packet(struct radv_cmd_buffer *cmd_buffer, uint32_t x
   radeon_emit(S_0287F0_SOURCE_SELECT(V_0287F0_DI_SRC_SEL_AUTO_INDEX));
   radeon_end();
-   radv_gfx12_emit_hiz_wa(device, &cmd_buffer->state, cs);
+   radv_gfx12_emit_wa(device, &cmd_buffer->state, cs);
 }
 ALWAYS_INLINE static void
@ -15174,10 +15174,19 @@ radv_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstC
   assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
-   if (pdev->info.gfx_level >= GFX12)
+   if (pdev->info.gfx_level >= GFX12) {
      radv_init_streamout_state(cmd_buffer);
-   else if (!pdev->use_ngg_streamout)
+
      /* Invalidate L2 in case the buffer filled size needs to be saved because COPY_DATA isn't
       * coherent with L2.
       */
      if (pdev->info.cp_sdma_ge_use_system_memory_scope) {
         cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_L2;
         radv_emit_cache_flush(cmd_buffer);
      }
   } else if (!pdev->use_ngg_streamout) {
      radv_flush_vgt_streamout(cmd_buffer);
   }
   ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cs->b, MAX_SO_BUFFERS * 10);
--- a/src/amd/vulkan/radv_debug.c
+++ b/src/amd/vulkan/radv_debug.c
@ -390,8 +390,8 @@ static void
 radv_add_split_disasm(const char *disasm, uint64_t start_addr, unsigned *num, struct radv_shader_inst *instructions)
 {
   struct radv_shader_inst *last_inst = *num ? &instructions[*num - 1] : NULL;
-   char *next;
+   const char *next;
-   char *repeat = strstr(disasm, "then repeated");
+   const char *repeat = strstr(disasm, "then repeated");
   while ((next = strchr(disasm, '\n'))) {
      struct radv_shader_inst *inst = &instructions[*num];
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@ -786,6 +786,8 @@ init_dispatch_tables(struct radv_device *device, struct radv_physical_device *pd
      add_entrypoints(&b, &quantic_dream_device_entrypoints, RADV_APP_DISPATCH_TABLE);
   } else if (!strcmp(instance->drirc.debug.app_layer, "no_mans_sky")) {
      add_entrypoints(&b, &no_mans_sky_device_entrypoints, RADV_APP_DISPATCH_TABLE);
   } else if (!strcmp(instance->drirc.debug.app_layer, "strange_brigade")) {
      add_entrypoints(&b, &strange_brigade_device_entrypoints, RADV_APP_DISPATCH_TABLE);
   }
   if (instance->vk.trace_mode & RADV_TRACE_MODE_RGP)
@ -1239,7 +1241,13 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr
   device->ws = pdev->ws;
   device->vk.sync = device->ws->get_sync_provider(device->ws);
-   device->vk.copy_sync_payloads = pdev->ws->copy_sync_payloads;
+
   /* Disable unordered submits when SQTT queue events are enabled because queue present events
    * might be missing otherwise.
    */
   device->vk.copy_sync_payloads = ((instance->vk.trace_mode & RADV_TRACE_MODE_RGP) && radv_sqtt_queue_events_enabled())
                                      ? NULL
                                      : pdev->ws->copy_sync_payloads;
   /* Enable the global BO list by default. */
   /* TODO: Remove the per cmdbuf BO list tracking after few Mesa releases if no blockers. */
--- a/src/amd/vulkan/radv_image_view.c
+++ b/src/amd/vulkan/radv_image_view.c
@ -500,9 +500,9 @@ radv_image_view_init(struct radv_image_view *iview, struct radv_device *device,
   if (!extra_create_info || !extra_create_info->from_client)
      assert(pCreateInfo->flags & VK_IMAGE_VIEW_CREATE_DRIVER_INTERNAL_BIT_MESA);
   vk_image_view_init(&device->vk, &iview->vk, pCreateInfo);
-   memset(&iview->descriptor, 0, sizeof(iview->descriptor));
+   memset(iview, 0, sizeof(*iview));
   vk_image_view_init(&device->vk, &iview->vk, pCreateInfo);
   iview->image = image;
   iview->plane_id = radv_plane_from_aspect(pCreateInfo->subresourceRange.aspectMask);
@ -664,13 +664,13 @@ radv_hiz_image_view_init(struct radv_image_view *iview, struct radv_device *devi
   VK_FROM_HANDLE(radv_image, image, pCreateInfo->image);
   assert(pCreateInfo->flags & VK_IMAGE_VIEW_CREATE_DRIVER_INTERNAL_BIT_MESA);
   memset(iview, 0, sizeof(*iview));
   vk_image_view_init(&device->vk, &iview->vk, pCreateInfo);
   assert(vk_format_has_depth(image->vk.format) && vk_format_has_stencil(image->vk.format));
   assert(iview->vk.aspects == VK_IMAGE_ASPECT_DEPTH_BIT);
   memset(&iview->descriptor, 0, sizeof(iview->descriptor));
   iview->image = image;
   const uint32_t type =
--- a/src/amd/vulkan/radv_pipeline_graphics.c
+++ b/src/amd/vulkan/radv_pipeline_graphics.c
@ -1662,7 +1662,7 @@ radv_graphics_shaders_link_varyings(struct radv_shader_stage *stages, enum amd_g
      /* Scalarize all I/O, because nir_opt_varyings and nir_opt_vectorize_io expect all I/O to be scalarized. */
      nir_variable_mode sca_mode = nir_var_shader_in;
-      bool sca_progress;
+      bool sca_progress = false;
      if (s != MESA_SHADER_FRAGMENT)
         sca_mode |= nir_var_shader_out;
--- a/src/amd/vulkan/radv_pipeline_rt.c
+++ b/src/amd/vulkan/radv_pipeline_rt.c
@ -409,16 +409,16 @@ radv_rt_nir_to_asm(struct radv_device *device, struct vk_pipeline_cache *cache,
   stage->info.inline_push_constant_mask = stage->args.ac.inline_push_const_mask;
   stage->info.type = radv_is_traversal_shader(stage->nir) ? RADV_SHADER_TYPE_RT_TRAVERSAL : RADV_SHADER_TYPE_DEFAULT;
   /* Move ray tracing system values to the top that are set by rt_trace_ray
    * to prevent them from being overwritten by other rt_trace_ray calls.
    */
   NIR_PASS(_, stage->nir, move_rt_instructions);
   uint32_t num_resume_shaders = 0;
   nir_shader **resume_shaders = NULL;
   void *mem_ctx = ralloc_context(NULL);
   if (stage->stage != MESA_SHADER_INTERSECTION && mode == RADV_RT_LOWERING_MODE_CPS) {
      /* Move ray tracing system values to the top that are set by rt_trace_ray
       * to prevent them from being overwritten by other rt_trace_ray calls.
       */
      NIR_PASS(_, stage->nir, move_rt_instructions);
      nir_builder b = nir_builder_at(nir_after_impl(nir_shader_get_entrypoint(stage->nir)));
      nir_rt_return_amd(&b);
@ -541,6 +541,7 @@ radv_rt_nir_to_asm(struct radv_device *device, struct vk_pipeline_cache *cache,
   if (dump_shader)
      simple_mtx_unlock(&instance->shader_dump_mtx);
   ralloc_free(mem_ctx);
   free(binary);
   *out_shader = shader;
@ -674,7 +675,7 @@ radv_rt_compile_shaders(struct radv_device *device, struct vk_pipeline_cache *ca
   bool can_use_monolithic = !library && pipeline->stage_count < 50;
   for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) {
-      if (rt_stages[i].shader || rt_stages[i].nir)
+      if (rt_stages[i].nir)
         continue;
      int64_t stage_start = os_time_get_nano();
@ -749,7 +750,7 @@ radv_rt_compile_shaders(struct radv_device *device, struct vk_pipeline_cache *ca
   inline_any_hit_shaders |= raygen_lowering_mode == RADV_RT_LOWERING_MODE_MONOLITHIC && !raygen_imported;
   for (uint32_t idx = 0; idx < pCreateInfo->stageCount; idx++) {
-      if (rt_stages[idx].shader || rt_stages[idx].nir)
+      if (rt_stages[idx].nir)
         continue;
      int64_t stage_start = os_time_get_nano();
@ -1462,17 +1463,39 @@ radv_GetRayTracingShaderGroupStackSizeKHR(VkDevice device, VkPipeline _pipeline,
   VK_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);
   struct radv_ray_tracing_pipeline *rt_pipeline = radv_pipeline_to_ray_tracing(pipeline);
   struct radv_ray_tracing_group *rt_group = &rt_pipeline->groups[group];
   struct radv_ray_tracing_stage *shader_stage;
   switch (groupShader) {
   case VK_SHADER_GROUP_SHADER_GENERAL_KHR:
   case VK_SHADER_GROUP_SHADER_CLOSEST_HIT_KHR:
-      return rt_pipeline->stages[rt_group->recursive_shader].stack_size;
+      shader_stage = &rt_pipeline->stages[rt_group->recursive_shader];
      break;
   case VK_SHADER_GROUP_SHADER_ANY_HIT_KHR:
-      return rt_pipeline->stages[rt_group->any_hit_shader].stack_size;
+      /* If the any-hit shader is inlined into an intersection shader, there is no stack specific to the any-hit shader
       * and all stack will be allocated for the intersection shader instead.
       */
      if (rt_group->intersection_shader != VK_SHADER_UNUSED_KHR)
         return 0;
      shader_stage = &rt_pipeline->stages[rt_group->any_hit_shader];
      break;
   case VK_SHADER_GROUP_SHADER_INTERSECTION_KHR:
-      return rt_pipeline->stages[rt_group->intersection_shader].stack_size;
+      shader_stage = &rt_pipeline->stages[rt_group->intersection_shader];
      break;
   default:
      return 0;
   }
   uint32_t stack_size = shader_stage->stack_size;
   /* Applications need to allocate stack for the traversal shader, too. The API doesn't intend for a constant
    * traversal stack size, so add the stack size to every shader potentially called by the traversal shader.
    * Applications are expected to max() shader stages together, so this shouldn't result in any unnecessary stack
    * usage.
    */
   if (shader_stage->stage == MESA_SHADER_CLOSEST_HIT || shader_stage->stage == MESA_SHADER_ANY_HIT ||
       shader_stage->stage == MESA_SHADER_INTERSECTION || shader_stage->stage == MESA_SHADER_MISS)
      stack_size += rt_pipeline->traversal_stack_size;
   return stack_size;
 }
 VKAPI_ATTR VkResult VKAPI_CALL
--- a/src/amd/vulkan/radv_rra.c
+++ b/src/amd/vulkan/radv_rra.c
@ -790,7 +790,7 @@ rra_map_accel_struct_data(struct rra_copy_context *ctx, uint32_t i)
   if (radv_GetEventStatus(ctx->device, data->build_event) != VK_EVENT_SET)
      return NULL;
-   if (data->buffer->memory) {
+   if (data->buffer && data->buffer->memory) {
      VkMemoryMapInfo memory_map_info = {
         .sType = VK_STRUCTURE_TYPE_MEMORY_MAP_INFO,
         .memory = data->buffer->memory,
--- a/src/amd/vulkan/radv_sdma.c
+++ b/src/amd/vulkan/radv_sdma.c
@ -216,6 +216,7 @@ radv_sdma_get_surf(const struct radv_device *const device, const struct radv_ima
      .texel_scale = radv_sdma_get_texel_scale(image),
      .is_linear = surf->is_linear,
      .is_3d = surf->u.gfx9.resource_type == RADEON_RESOURCE_3D,
      .is_stencil = subresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT,
   };
   const uint64_t surf_offset = (subresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) ? surf->u.gfx9.zs.stencil_offset
@ -371,6 +372,7 @@ radv_sdma_emit_copy_tiled_sub_window(const struct radv_device *device, struct ra
      .va = tiled->va,
      .format = radv_format_to_pipe_format(tiled->aspect_format),
      .bpp = tiled->bpp,
      .is_stencil = tiled->is_stencil,
      .offset =
         {
            .x = tiled_off.x,
@ -414,6 +416,7 @@ radv_sdma_emit_copy_t2t_sub_window(const struct radv_device *device, struct radv
      .va = src->va,
      .format = radv_format_to_pipe_format(src->aspect_format),
      .bpp = src->bpp,
      .is_stencil = src->is_stencil,
      .offset =
         {
            .x = src_off.x,
@ -439,6 +442,7 @@ radv_sdma_emit_copy_t2t_sub_window(const struct radv_device *device, struct radv
      .va = dst->va,
      .format = radv_format_to_pipe_format(dst->aspect_format),
      .bpp = dst->bpp,
      .is_stencil = dst->is_stencil,
      .offset =
         {
            .x = dst_off.x,
@ -606,12 +610,6 @@ radv_sdma_use_t2t_scanline_copy(const struct radv_device *device, const struct r
         return true;
   }
   /* The two images can have a different block size,
    * but must have the same swizzle mode.
    */
   if (src->micro_tile_mode != dst->micro_tile_mode)
      return true;
   /* The T2T subwindow copy packet only has fields for one metadata configuration.
    * It can either compress or decompress, or copy uncompressed images, but it
    * can't copy from a compressed image to another.
@ -619,6 +617,16 @@ radv_sdma_use_t2t_scanline_copy(const struct radv_device *device, const struct r
   if (src->is_compressed && dst->is_compressed)
      return true;
   if (ver >= SDMA_7_0) {
      /* No support for tiling format transformation at all. */
      if (src->surf->u.gfx9.swizzle_mode != dst->surf->u.gfx9.swizzle_mode)
         return true;
   } else {
      /* The two images can have a different block size, but must have the same swizzle mode. */
      if (src->micro_tile_mode != dst->micro_tile_mode)
         return true;
   }
   const bool needs_3d_alignment = src->is_3d && (src->micro_tile_mode == RADEON_MICRO_MODE_DISPLAY ||
                                                  src->micro_tile_mode == RADEON_MICRO_MODE_STANDARD);
   const unsigned log2bpp = util_logbase2(src->bpp);
--- a/src/amd/vulkan/radv_sdma.h
+++ b/src/amd/vulkan/radv_sdma.h
@ -31,6 +31,7 @@ struct radv_sdma_surf {
   uint8_t texel_scale;     /* Texel scale for 96-bit formats */
   bool is_linear;          /* Whether the image is linear. */
   bool is_3d;              /* Whether the image is 3-dimensional. */
   bool is_stencil;         /* Whether the image is stencil only. */
   union {
      /* linear images only */
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@ -655,15 +655,24 @@ radv_shader_spirv_to_nir(struct radv_device *device, const struct radv_shader_st
      NIR_PASS(_, nir, nir_lower_compute_system_values, &csv_options);
   }
   bool lower_local_invocation_index = false;
   if (nir->info.derivative_group == DERIVATIVE_GROUP_QUADS &&
       ((nir->info.stage == MESA_SHADER_COMPUTE || nir->info.stage == MESA_SHADER_TASK ||
         (nir->info.stage == MESA_SHADER_MESH && pdev->info.mesh_fast_launch_2)))) {
      lower_local_invocation_index = true;
   } else if (nir->info.stage == MESA_SHADER_COMPUTE &&
              (((nir->info.workgroup_size[0] == 1) + (nir->info.workgroup_size[1] == 1) +
                (nir->info.workgroup_size[2] == 1)) == 2)) {
      lower_local_invocation_index = true;
   }
   nir_lower_compute_system_values_options csv_options = {
      /* Mesh shaders run as NGG which can implement local_invocation_index from
       * the wave ID in merged_wave_info, but they don't have local_invocation_ids on GFX10.3.
       */
      .lower_cs_local_id_to_index = nir->info.stage == MESA_SHADER_MESH && !pdev->info.mesh_fast_launch_2,
-      .lower_local_invocation_index = nir->info.stage == MESA_SHADER_COMPUTE &&
+      .lower_local_invocation_index = lower_local_invocation_index,
                                      ((((nir->info.workgroup_size[0] == 1) + (nir->info.workgroup_size[1] == 1) +
                                         (nir->info.workgroup_size[2] == 1)) == 2) ||
                                       nir->info.derivative_group == DERIVATIVE_GROUP_QUADS),
   };
   NIR_PASS(_, nir, nir_lower_compute_system_values, &csv_options);
--- a/src/amd/vulkan/radv_video.c
+++ b/src/amd/vulkan/radv_video.c
@ -950,8 +950,8 @@ radv_GetPhysicalDeviceVideoCapabilitiesKHR(VkPhysicalDevice physicalDevice, cons
      struct VkVideoDecodeH265CapabilitiesKHR *ext =
         vk_find_struct(pCapabilities->pNext, VIDEO_DECODE_H265_CAPABILITIES_KHR);
-      pCapabilities->maxDpbSlots = RADV_VIDEO_H264_MAX_DPB_SLOTS;
+      pCapabilities->maxDpbSlots = RADV_VIDEO_H265_MAX_DPB_SLOTS;
-      pCapabilities->maxActiveReferencePictures = RADV_VIDEO_H264_MAX_NUM_REF_FRAME;
+      pCapabilities->maxActiveReferencePictures = RADV_VIDEO_H265_MAX_NUM_REF_FRAME;
      /* for h265 on navi21+ separate dpb images should work */
      if (radv_enable_tier2(pdev))
         pCapabilities->flags |= VK_VIDEO_CAPABILITY_SEPARATE_REFERENCE_IMAGES_BIT_KHR;
@ -1120,7 +1120,7 @@ radv_GetPhysicalDeviceVideoCapabilitiesKHR(VkPhysicalDevice physicalDevice, cons
         enc_caps->encodeInputPictureGranularity = pCapabilities->pictureAccessGranularity;
      ext->maxTiles.width = 2;
      ext->maxTiles.height = 16;
-      ext->minTileSize.width = 64;
+      ext->minTileSize.width = pdev->enc_hw_ver >= RADV_VIDEO_ENC_HW_5 ? 256 : 128;
      ext->minTileSize.height = 64;
      ext->maxTileSize.width = 4096;
      ext->maxTileSize.height = 4096;
@ -2320,22 +2320,6 @@ get_av1_msg(struct radv_device *device, struct radv_video_session *vid, struct v
   result.tx_mode = pi->TxMode;
   result.reference_mode = (pi->flags.reference_select == 1) ? 2 : 0;
   if (pi->pTileInfo) {
      result.tile_cols = pi->pTileInfo->TileCols;
      result.tile_rows = pi->pTileInfo->TileRows;
      result.tile_size_bytes = pi->pTileInfo->tile_size_bytes_minus_1;
      result.context_update_tile_id = pi->pTileInfo->context_update_tile_id;
      for (i = 0; i < result.tile_cols; i++)
         result.tile_col_start_sb[i] = pi->pTileInfo->pMiColStarts[i];
      result.tile_col_start_sb[result.tile_cols] =
         result.tile_col_start_sb[result.tile_cols - 1] + pi->pTileInfo->pWidthInSbsMinus1[result.tile_cols - 1] + 1;
      for (i = 0; i < pi->pTileInfo->TileRows; i++)
         result.tile_row_start_sb[i] = pi->pTileInfo->pMiRowStarts[i];
      result.tile_row_start_sb[result.tile_rows] =
         result.tile_row_start_sb[result.tile_rows - 1] + pi->pTileInfo->pHeightInSbsMinus1[result.tile_rows - 1] + 1;
   }
   result.max_width = seq_hdr->max_frame_width_minus_1 + 1;
   result.max_height = seq_hdr->max_frame_height_minus_1 + 1;
   VkExtent2D frameExtent = frame_info->dstPictureResource.codedExtent;
@ -2351,6 +2335,44 @@ get_av1_msg(struct radv_device *device, struct radv_video_session *vid, struct v
   result.superres_upscaled_width = frameExtent.width;
   if (pi->pTileInfo) {
      result.tile_cols = pi->pTileInfo->TileCols;
      result.tile_rows = pi->pTileInfo->TileRows;
      result.tile_size_bytes = pi->pTileInfo->tile_size_bytes_minus_1;
      result.context_update_tile_id = pi->pTileInfo->context_update_tile_id;
      /* pMi{Row,Col}Starts is unreliable, some apps send SB, some send MI, so use
       * p{Width,Height}InSbsMinus1 instead. But for uniform_tile_spacing_flag,
       * those are not defined by spec. */
      if (pi->pTileInfo->flags.uniform_tile_spacing_flag) {
         const unsigned sb_size = seq_hdr->flags.use_128x128_superblock ? 128 : 64;
         const unsigned sb_width = DIV_ROUND_UP(result.width, sb_size);
         const unsigned sb_height = DIV_ROUND_UP(result.height, sb_size);
         const unsigned tile_width_sb = DIV_ROUND_UP(sb_width, result.tile_cols);
         const unsigned tile_height_sb = DIV_ROUND_UP(sb_height, result.tile_rows);
         result.tile_col_start_sb[0] = 0;
         for (i = 1; i < result.tile_cols; ++i)
            result.tile_col_start_sb[i] = result.tile_col_start_sb[i - 1] + tile_width_sb;
         result.tile_col_start_sb[i] = sb_width;
         result.tile_row_start_sb[0] = 0;
         for (i = 1; i < result.tile_rows; ++i)
            result.tile_row_start_sb[i] = result.tile_row_start_sb[i - 1] + tile_height_sb;
         result.tile_row_start_sb[i] = sb_height;
      } else {
         result.tile_col_start_sb[0] = 0;
         assert(pi->pTileInfo->pMiColStarts[0] == 0);
         for (i = 0; i < result.tile_cols; ++i)
            result.tile_col_start_sb[i + 1] = result.tile_col_start_sb[i] + pi->pTileInfo->pWidthInSbsMinus1[i] + 1;
         result.tile_row_start_sb[0] = 0;
         assert(pi->pTileInfo->pMiRowStarts[0] == 0);
         for (i = 0; i < result.tile_rows; ++i)
            result.tile_row_start_sb[i + 1] = result.tile_row_start_sb[i] + pi->pTileInfo->pHeightInSbsMinus1[i] + 1;
      }
   }
   result.order_hint_bits = seq_hdr->order_hint_bits_minus_1 + 1;
   /* The VCN FW will evict references that aren't specified in
--- a/src/amd/vulkan/radv_video_enc.c
+++ b/src/amd/vulkan/radv_video_enc.c
@ -1095,7 +1095,7 @@ radv_enc_slice_header(struct radv_cmd_buffer *cmd_buffer, const VkVideoEncodeInf
      radv_enc_code_ue(cmd_buffer, 6);
      break;
   }
-   radv_enc_code_ue(cmd_buffer, 0x0);
+   radv_enc_code_ue(cmd_buffer, pic->pic_parameter_set_id);
   unsigned int max_frame_num_bits = sps->log2_max_frame_num_minus4 + 4;
   radv_enc_code_fixed_bits(cmd_buffer, pic->frame_num % (1 << max_frame_num_bits), max_frame_num_bits);
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.c
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.c
@ -759,6 +759,7 @@ error_va_map:
   ac_drm_bo_free(ws->dev, buf_handle);
 error_bo_alloc:
   ac_drm_va_range_free(va_handle);
   free(ranges);
 error_va_alloc:
--- a/src/asahi/vulkan/hk_cmd_buffer.c
+++ b/src/asahi/vulkan/hk_cmd_buffer.c
@ -376,13 +376,15 @@ hk_bind_descriptor_sets(UNUSED struct hk_cmd_buffer *cmd,
    *
    * This means that, if some earlier set gets bound in such a way that
    * it changes set_dynamic_buffer_start[s], this binding is implicitly
-    * invalidated.  Therefore, we can always look at the current value
+    * invalidated.
    * of set_dynamic_buffer_start[s] as the base of our dynamic buffer
    * range and it's only our responsibility to adjust all
    * set_dynamic_buffer_start[p] for p > s as needed.
    */
-   uint8_t dyn_buffer_start =
+   uint8_t dyn_buffer_start = 0u;
-      desc->root.set_dynamic_buffer_start[info->firstSet];
+   for (uint32_t i = 0u; i < info->firstSet; ++i) {
      const struct hk_descriptor_set_layout *set_layout =
         vk_to_hk_descriptor_set_layout(pipeline_layout->set_layouts[i]);
      if (set_layout)
         dyn_buffer_start += set_layout->dynamic_buffer_count;
   }
   uint32_t next_dyn_offset = 0;
   for (uint32_t i = 0; i < info->descriptorSetCount; ++i) {
@ -427,10 +429,6 @@ hk_bind_descriptor_sets(UNUSED struct hk_cmd_buffer *cmd,
   assert(dyn_buffer_start <= HK_MAX_DYNAMIC_BUFFERS);
   assert(next_dyn_offset <= info->dynamicOffsetCount);
   for (uint32_t s = info->firstSet + info->descriptorSetCount; s < HK_MAX_SETS;
        s++)
      desc->root.set_dynamic_buffer_start[s] = dyn_buffer_start;
   desc->root_dirty = true;
 }
--- a/src/asahi/vulkan/hk_cmd_draw.c
+++ b/src/asahi/vulkan/hk_cmd_draw.c
@ -3212,6 +3212,9 @@ hk_handle_passthrough_gs(struct hk_cmd_buffer *cmd, struct agx_draw draw)
   struct hk_graphics_state *gfx = &cmd->state.gfx;
   struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY];
   if (!IS_SHADER_DIRTY(VERTEX) && !IS_SHADER_DIRTY(GEOMETRY))
      return;
   /* If there's an application geometry shader, there's nothing to un/bind */
   if (gs && !gs->is_passthrough)
      return;
@ -3221,20 +3224,17 @@ hk_handle_passthrough_gs(struct hk_cmd_buffer *cmd, struct agx_draw draw)
   uint32_t xfb_outputs = last_sw->info.xfb_info.output_count;
   bool needs_gs = xfb_outputs;
   /* If we already have a matching GS configuration, we're done */
   if ((gs != NULL) == needs_gs)
      return;
   /* If we don't need a GS but we do have a passthrough, unbind it */
-   if (gs) {
+   if (!needs_gs) {
-      assert(!needs_gs && gs->is_passthrough);
+      if (gs != NULL) {
-      hk_cmd_bind_graphics_shader(cmd, MESA_SHADER_GEOMETRY, NULL);
+         assert(gs->is_passthrough);
         hk_cmd_bind_graphics_shader(cmd, MESA_SHADER_GEOMETRY, NULL);
      }
      return;
   }
   /* Else, we need to bind a passthrough GS */
-   size_t key_size =
+   size_t key_size = hk_passthrough_gs_key_size(xfb_outputs);
      sizeof(struct hk_passthrough_gs_key) + nir_xfb_info_size(xfb_outputs);
   struct hk_passthrough_gs_key *key = alloca(key_size);
   *key = (struct hk_passthrough_gs_key){
--- a/src/asahi/vulkan/hk_cmd_meta.c
+++ b/src/asahi/vulkan/hk_cmd_meta.c
@ -1493,7 +1493,12 @@ hk_CmdFillBuffer(VkCommandBuffer commandBuffer, VkBuffer dstBuffer,
   uint64_t addr =
      vk_meta_buffer_address(&dev->vk, dstBuffer, dstOffset, dstRange);
-   libagx_fill(cmd, agx_1d(range / 4), AGX_BARRIER_ALL, addr, data);
+   if (util_is_aligned(addr, 16) && util_is_aligned(range, 16)) {
      libagx_fill_uint4(cmd, agx_2d(range / 16, 1), AGX_BARRIER_ALL,
                        addr, 0, data, data, data, data);
   } else {
      libagx_fill(cmd, agx_1d(range / 4), AGX_BARRIER_ALL, addr, data);
   }
 }
 VKAPI_ATTR void VKAPI_CALL
--- a/src/asahi/vulkan/hk_physical_device.c
+++ b/src/asahi/vulkan/hk_physical_device.c
@ -725,7 +725,7 @@ hk_get_device_properties(const struct agx_device *dev,
      .maxFragmentInputComponents = max_vgt_output_components,
      .maxFragmentOutputAttachments = HK_MAX_RTS,
      .maxFragmentDualSrcAttachments = 1,
-      .maxFragmentCombinedOutputResources = 16,
+      .maxFragmentCombinedOutputResources = HK_MAX_RTS + HK_MAX_DESCRIPTORS,
      .maxComputeSharedMemorySize = HK_MAX_SHARED_SIZE,
      .maxComputeWorkGroupCount = {0x7fffffff, 65535, 65535},
      .maxComputeWorkGroupInvocations = 1024,
--- a/src/asahi/vulkan/hk_shader.h
+++ b/src/asahi/vulkan/hk_shader.h
@ -387,8 +387,16 @@ struct hk_passthrough_gs_key {
   /* Decomposed primitive */
   enum mesa_prim prim;
-   /* Transform feedback info. Must add nir_xfb_info_size to get the key size */
+   /* Transform feedback info. Must use hk_passthrough_gs_key_size to get the
    * key size */
   nir_xfb_info xfb_info;
 };
 static inline size_t
 hk_passthrough_gs_key_size(uint16_t output_count)
 {
   return (sizeof(struct hk_passthrough_gs_key) - sizeof(nir_xfb_info)) +
      nir_xfb_info_size(output_count);
 }
 void hk_nir_passthrough_gs(struct nir_builder *b, const void *key_);
--- a/src/broadcom/ci/broadcom-rpi3-fails.txt
+++ b/src/broadcom/ci/broadcom-rpi3-fails.txt
@ -765,9 +765,6 @@ spec@glsl-1.10@execution@glsl-vs-inline-explosion,Crash
 # stipple
 spec@!opengl 1.0@gl-1.0-no-op-paths,Fail
 # Bisected to b3133e250e1 ("gallium: add pipe_context::resource_release to eliminate buffer refcounting")
 spec@!opengl 1.1@longprim,Crash
 # fails on arm64, passes on armhf
 spec@arb_depth_buffer_float@depthstencil-render-miplevels 1024 s=z24_s8_d=z32f,Fail
@ -853,7 +850,6 @@ spec@!opengl 1.1@polygon-mode-offset@config 6: Expected blue pixel in center,Fai
 spec@!opengl 1.1@polygon-mode-offset@config 6: Expected white pixel on right edge,Fail
 spec@!opengl 1.1@polygon-mode-offset@config 6: Expected white pixel on top edge,Fail
 spec@!opengl 1.1@texsubimage-unpack,Fail
 spec@!opengl 1.1@texwrap 2d proj,Fail
 spec@!opengl 1.1@texwrap 2d proj@GL_RGBA8- NPOT- projected,Fail
 spec@!opengl 1.1@texwrap 2d proj@GL_RGBA8- projected,Fail
@ -953,7 +949,6 @@ spec@arb_occlusion_query@occlusion_query_conform,Fail
 spec@arb_occlusion_query@occlusion_query_conform@GetObjivAval_multi2,Fail
 spec@arb_pixel_buffer_object@fbo-pbo-readpixels-small,Fail
 spec@arb_pixel_buffer_object@pbo-getteximage,Fail
 spec@arb_pixel_buffer_object@texsubimage-unpack pbo,Fail
 spec@arb_point_sprite@arb_point_sprite-mipmap,Fail
 spec@arb_provoking_vertex@arb-provoking-vertex-render,Fail
 spec@arb_sampler_objects@sampler-objects,Fail
--- a/src/broadcom/ci/broadcom-rpi4-fails.txt
+++ b/src/broadcom/ci/broadcom-rpi4-fails.txt
@ -861,93 +861,6 @@ ubsan-dEQP-VK.image.mutable.2d_array.r16g16b16a16_sfloat_r16g16b16a16_uint_draw_
 ubsan-dEQP-VK.image.mutable.2d_array.r32_uint_r8g8b8a8_sint_draw_copy_resolve_mutable_color_att,Fail
 ubsan-dEQP-VK.pipeline.monolithic.logic_op_na_formats.r16g16_sfloat.nand_blend,Fail
 # New failures with ES CTS 3.2.13.0
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_128_bits.rgba32i_rgba32i.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_128_bits.rgba32i_rgba32i.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_128_bits.rgba32ui_rgba32ui.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_128_bits.rgba32ui_rgba32ui.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.r16i_r16i.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.r16i_r16i.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.r16ui_r16ui.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.r16ui_r16ui.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.rg8i_rg8i.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.rg8i_rg8i.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.rg8_rg8.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.rg8_rg8.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.rg8ui_rg8ui.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.rg8ui_rg8ui.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_24_bits.rgb8_rgb8.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_24_bits.rgb8_rgb8.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.r32i_r32i.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.r32i_r32i.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.r32ui_r32ui.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.r32ui_r32ui.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rg16i_rg16i.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rg16i_rg16i.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rg16ui_rg16ui.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rg16ui_rg16ui.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2_rgb10_a2.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2_rgb10_a2.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rg16f.renderbuffer_to_texture2d,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rg16i.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rg16i.renderbuffer_to_texture2d,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rg16ui.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rg16ui.renderbuffer_to_texture2d,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rgb10_a2ui.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rgb10_a2ui.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgba8i_rgba8i.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgba8i_rgba8i.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgba8_rgba8.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgba8_rgba8.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgba8ui_rgba8ui.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgba8ui_rgba8ui.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.srgb8_alpha8_srgb8_alpha8.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rg32i_rg32i.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rg32i_rg32i.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rg32ui_rg32ui.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rg32ui_rg32ui.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rgba16i_rgba16i.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rgba16i_rgba16i.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rgba16ui_rgba16ui.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rgba16ui_rgba16ui.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_8_bits.r8i_r8i.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_8_bits.r8i_r8i.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_8_bits.r8_r8.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_8_bits.r8ui_r8ui.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_8_bits.r8ui_r8ui.texture2d_to_renderbuffer,Fail
 arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_128_bits.rgba32ui_rgba32ui.renderbuffer_to_renderbuffer,Fail
 arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.r16i_r16i.renderbuffer_to_renderbuffer,Fail
 arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.rg8_rg8.renderbuffer_to_renderbuffer,Fail
 arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.rg8i_rg8i.renderbuffer_to_renderbuffer,Fail
 arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.rg8ui_rg8ui.texture2d_to_renderbuffer,Fail
 arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_24_bits.rgb8_rgb8.renderbuffer_to_renderbuffer,Fail
 arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_24_bits.rgb8_rgb8.texture2d_to_renderbuffer,Fail
 arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.r32i_r32i.renderbuffer_to_renderbuffer,Fail
 arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.r32i_r32i.texture2d_to_renderbuffer,Fail
 arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.r32ui_r32ui.texture2d_to_renderbuffer,Fail
 arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rg16ui_rg16ui.texture2d_to_renderbuffer,Fail
 arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2_rgb10_a2.renderbuffer_to_renderbuffer,Fail
 arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2_rgb10_a2.texture2d_to_renderbuffer,Fail
 arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rg16f.renderbuffer_to_texture2d,Fail
 arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rg16i.renderbuffer_to_texture2d,Fail
 arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rg16ui.renderbuffer_to_texture2d,Fail
 arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rgb10_a2ui.renderbuffer_to_renderbuffer,Fail
 arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgba8_rgba8.renderbuffer_to_renderbuffer,Fail
 arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgba8i_rgba8i.texture2d_to_renderbuffer,Fail
 arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgba8ui_rgba8ui.texture2d_to_renderbuffer,Fail
 arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.srgb8_alpha8_srgb8_alpha8.texture2d_to_renderbuffer,Fail
 arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rg32i_rg32i.renderbuffer_to_renderbuffer,Fail
 arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rg32i_rg32i.texture2d_to_renderbuffer,Fail
 arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rg32ui_rg32ui.renderbuffer_to_renderbuffer,Fail
 arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rgba16i_rgba16i.texture2d_to_renderbuffer,Fail
 arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_8_bits.r8_r8.texture2d_to_renderbuffer,Fail
 arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_8_bits.r8ui_r8ui.renderbuffer_to_renderbuffer,Fail
 ubsan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_128_bits.rgba32ui_rgba32ui.renderbuffer_to_renderbuffer,Fail
 ubsan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_24_bits.rgb8_rgb8.renderbuffer_to_renderbuffer,Fail
 ubsan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.r32ui_r32ui.texture2d_to_renderbuffer,Fail
 ubsan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2_rgb10_a2.texture2d_to_renderbuffer,Fail
 ubsan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rgb10_a2ui.renderbuffer_to_renderbuffer,Fail
 # SKQP failing tests
 ES2BlendWithNoTexture,Fail
 SRGBReadWritePixels,Fail
--- a/src/broadcom/ci/broadcom-rpi5-fails.txt
+++ b/src/broadcom/ci/broadcom-rpi5-fails.txt
@ -701,84 +701,6 @@ dEQP-VK.binding_model.unused_invalid_descriptor.write.unused.storage_buffer,Cras
 dEQP-VK.binding_model.unused_invalid_descriptor.write.unused.uniform_buffer,Crash
 asan-dEQP-VK.binding_model.unused_invalid_descriptor.write.invalid.combined_image_sampler,Crash
 # New failures with ES CTS 3.2.13.0
 asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_128_bits.rgba32i_rgba32i.texture2d_to_renderbuffer,Fail
 asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_128_bits.rgba32ui_rgba32ui.renderbuffer_to_renderbuffer,Fail
 asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_24_bits.rgb8_rgb8.renderbuffer_to_renderbuffer,Fail
 asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_24_bits.rgb8_rgb8.texture2d_to_renderbuffer,Fail
 asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.r32i_r32i.renderbuffer_to_renderbuffer,Fail
 asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.r32ui_r32ui.texture2d_to_renderbuffer,Fail
 asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rg16i_rg16i.renderbuffer_to_renderbuffer,Fail
 asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rg16i_rg16i.texture2d_to_renderbuffer,Fail
 asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rg16ui_rg16ui.renderbuffer_to_renderbuffer,Fail
 asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2_rgb10_a2.texture2d_to_renderbuffer,Fail
 asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rg16f.renderbuffer_to_texture2d,Fail
 asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rg16i.renderbuffer_to_texture2d,Fail
 asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rgb10_a2ui.renderbuffer_to_renderbuffer,Fail
 asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rgb10_a2ui.texture2d_to_renderbuffer,Fail
 asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgba8_rgba8.renderbuffer_to_renderbuffer,Fail
 asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgba8i_rgba8i.texture2d_to_renderbuffer,Fail
 asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgba8ui_rgba8ui.texture2d_to_renderbuffer,Fail
 asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rg32i_rg32i.renderbuffer_to_renderbuffer,Fail
 asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rg32ui_rg32ui.renderbuffer_to_renderbuffer,Fail
 asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rgba16i_rgba16i.texture2d_to_renderbuffer,Fail
 asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rgba16ui_rgba16ui.texture2d_to_renderbuffer,Fail
 asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_8_bits.r8ui_r8ui.renderbuffer_to_renderbuffer,Fail
 asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_8_bits.r8ui_r8ui.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_128_bits.rgba32i_rgba32i.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_128_bits.rgba32i_rgba32i.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_128_bits.rgba32ui_rgba32ui.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_128_bits.rgba32ui_rgba32ui.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.r16i_r16i.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.r16i_r16i.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.r16ui_r16ui.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.r16ui_r16ui.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.rg8_rg8.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.rg8_rg8.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.rg8i_rg8i.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.rg8i_rg8i.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.rg8ui_rg8ui.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.rg8ui_rg8ui.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_24_bits.rgb8_rgb8.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_24_bits.rgb8_rgb8.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.r32i_r32i.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.r32i_r32i.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.r32ui_r32ui.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.r32ui_r32ui.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rg16i_rg16i.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rg16i_rg16i.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rg16ui_rg16ui.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rg16ui_rg16ui.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2_rgb10_a2.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2_rgb10_a2.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rg16f.renderbuffer_to_texture2d,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rg16i.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rg16i.renderbuffer_to_texture2d,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rg16ui.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rg16ui.renderbuffer_to_texture2d,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rgb10_a2ui.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rgb10_a2ui.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgba8_rgba8.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgba8_rgba8.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgba8i_rgba8i.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgba8i_rgba8i.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgba8ui_rgba8ui.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgba8ui_rgba8ui.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.srgb8_alpha8_srgb8_alpha8.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rg32i_rg32i.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rg32i_rg32i.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rg32ui_rg32ui.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rg32ui_rg32ui.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rgba16i_rgba16i.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rgba16i_rgba16i.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rgba16ui_rgba16ui.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rgba16ui_rgba16ui.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_8_bits.r8_r8.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_8_bits.r8i_r8i.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_8_bits.r8i_r8i.texture2d_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_8_bits.r8ui_r8ui.renderbuffer_to_renderbuffer,Fail
 dEQP-GLES31.functional.copy_image.non_compressed.viewclass_8_bits.r8ui_r8ui.texture2d_to_renderbuffer,Fail
 # SKQP failing tests
 ES2BlendWithNoTexture,Fail
 SRGBReadWritePixels,Fail
--- a/src/broadcom/cle/v3d_packet.xml
+++ b/src/broadcom/cle/v3d_packet.xml
@ -1,4 +1,4 @@
-<vcxml gen="3.3" min_ver="42" max_ver="71">
+<vcxml gen="4.2" min_ver="42" max_ver="71">
  <enum name="Compare Function" prefix="V3D_COMPARE_FUNC">
    <value name="NEVER" value="0"/>
--- a/src/broadcom/common/v3d_tfu.h
+++ b/src/broadcom/common/v3d_tfu.h
@ -64,12 +64,12 @@
 #define V3D71_TFU_ICFG_OTYPE_SHIFT              16
 #define V3D71_TFU_ICFG_IFORMAT_SHIFT            23
 #define V3D71_TFU_ICFG_FORMAT_RASTER             0
-#define V3D71_TFU_ICFG_FORMAT_SAND_128           1
+#define V3D71_TFU_ICFG_FORMAT_SAND               1
-#define V3D71_TFU_ICFG_FORMAT_SAND_256           2
+#define V3D71_TFU_ICFG_FORMAT_CONSTANT_COLOUR    2
-#define V3D71_TFU_ICFG_FORMAT_LINEARTILE        11
+#define V3D71_TFU_ICFG_FORMAT_LINEARTILE         3
-#define V3D71_TFU_ICFG_FORMAT_UBLINEAR_1_COLUMN 12
+#define V3D71_TFU_ICFG_FORMAT_UBLINEAR_1_COLUMN  4
-#define V3D71_TFU_ICFG_FORMAT_UBLINEAR_2_COLUMN 13
+#define V3D71_TFU_ICFG_FORMAT_UBLINEAR_2_COLUMN  5
-#define V3D71_TFU_ICFG_FORMAT_UIF_NO_XOR        14
+#define V3D71_TFU_ICFG_FORMAT_UIF_NO_XOR         6
-#define V3D71_TFU_ICFG_FORMAT_UIF_XOR           15
+#define V3D71_TFU_ICFG_FORMAT_UIF_XOR            7
 #endif
--- a/src/compiler/clc/clc.h
+++ b/src/compiler/clc/clc.h
@ -50,9 +50,12 @@ enum clc_spirv_version {
 };
 struct clc_optional_features {
   bool atomic_order_seq_cst;
   bool atomic_scope_device;
   bool extended_bit_ops;
   bool fp16;
   bool fp64;
   bool generic_address_space;
   bool int64;
   bool images;
   bool images_depth;
--- a/src/compiler/clc/clc_helpers.cpp
+++ b/src/compiler/clc/clc_helpers.cpp
@ -28,8 +28,6 @@
 #include <sstream>
 #include <mutex>
 #include "util/ralloc.h"
 #include "util/set.h"
 #include <llvm/ADT/ArrayRef.h>
 #include <llvm/IR/DiagnosticPrinter.h>
 #include <llvm/IR/DiagnosticInfo.h>
@ -68,7 +66,17 @@
 #include <llvm/Support/VirtualFileSystem.h>
 #endif
 #if LLVM_VERSION_MAJOR >= 22
 #include <clang/Options/OptionUtils.h>
 #endif
 /* We have to include our own headers after LLVM/clang as they seem to use
 * `UNUSED` within enum definitions:
 * https://github.com/llvm/llvm-project/blob/ea443eeb2ab8ed49ffb783c2025fed6629a36f10/clang/include/clang/Basic/OffloadArch.h#L19
 */
 #include "util/macros.h"
 #include "util/ralloc.h"
 #include "util/set.h"
 #include "util/u_dl.h"
 #include "glsl_types.h"
@ -915,7 +923,9 @@ clc_compile_to_llvm_module(LLVMContext &llvm_ctx,
   // GetResourcePath is a way to retrieve the actual libclang resource dir based on a given binary
   // or library.
   auto tmp_res_path =
-#if LLVM_VERSION_MAJOR >= 20
+#if LLVM_VERSION_MAJOR >= 22
      clang::GetResourcesPath(std::string(clang_path));
 #elif LLVM_VERSION_MAJOR >= 20
      Driver::GetResourcesPath(std::string(clang_path));
 #else
      Driver::GetResourcesPath(std::string(clang_path), CLANG_RESOURCE_DIR);
@ -959,6 +969,12 @@ clc_compile_to_llvm_module(LLVMContext &llvm_ctx,
   c->getPreprocessorOpts().addMacroDef("cl_khr_expect_assume=1");
   bool needs_opencl_c_h = false;
   if (args->features.atomic_order_seq_cst) {
      c->getTargetOpts().OpenCLExtensionsAsWritten.push_back("+__opencl_c_atomic_order_seq_cst");
   }
   if (args->features.atomic_scope_device) {
      c->getTargetOpts().OpenCLExtensionsAsWritten.push_back("+__opencl_c_atomic_scope_device");
   }
   if (args->features.extended_bit_ops) {
      c->getPreprocessorOpts().addMacroDef("cl_khr_extended_bit_ops=1");
   }
@ -969,6 +985,9 @@ clc_compile_to_llvm_module(LLVMContext &llvm_ctx,
      c->getTargetOpts().OpenCLExtensionsAsWritten.push_back("+cl_khr_fp64");
      c->getTargetOpts().OpenCLExtensionsAsWritten.push_back("+__opencl_c_fp64");
   }
   if (args->features.generic_address_space) {
      c->getTargetOpts().OpenCLExtensionsAsWritten.push_back("+__opencl_c_generic_address_space");
   }
   if (args->features.int64) {
      c->getTargetOpts().OpenCLExtensionsAsWritten.push_back("+cles_khr_int64");
      c->getTargetOpts().OpenCLExtensionsAsWritten.push_back("+__opencl_c_int64");
--- a/src/compiler/clc/mesa_clc.c
+++ b/src/compiler/clc/mesa_clc.c
@ -134,6 +134,11 @@ main(int argc, char **argv)
         .args = util_dynarray_begin(&clang_args),
         .num_args = util_dynarray_num_elements(&clang_args, char *),
         .c_compatible = true,
         .features = {
            .atomic_order_seq_cst = true,
            .atomic_scope_device = true,
            .generic_address_space = true,
         },
      };
      /* Enable all features, we don't know the target here and it is the
--- a/src/compiler/clc/nir_load_libclc.c
+++ b/src/compiler/clc/nir_load_libclc.c
@ -263,7 +263,7 @@ libclc_add_generic_variants(nir_shader *shader)
      if (strstr(func->name, "async_work_group_strided_copy"))
         continue;
-      char *U3AS1 = strstr(func->name, "U3AS1");
+      const char *U3AS1 = strstr(func->name, "U3AS1");
      if (U3AS1 == NULL)
         continue;
--- a/src/compiler/glsl/ast_to_hir.cpp
+++ b/src/compiler/glsl/ast_to_hir.cpp
@ -3379,19 +3379,21 @@ static void
 apply_explicit_location(const struct ast_type_qualifier *qual,
                        ir_variable *var,
                        struct _mesa_glsl_parse_state *state,
-                        YYLTYPE *loc)
+                        YYLTYPE *loc, bool force_explict_uniform_loc_zero)
 {
   bool fail = false;
-   unsigned qual_location;
+   unsigned qual_location = 0;
   if (!process_qualifier_constant(state, loc, "location", qual->location,
-                                   &qual_location)) {
+                                   &qual_location) &&
       !force_explict_uniform_loc_zero) {
      return;
   }
   /* Checks for GL_ARB_explicit_uniform_location. */
   if (qual->flags.q.uniform) {
-      if (!state->check_explicit_uniform_location_allowed(loc, var))
+      if (!force_explict_uniform_loc_zero &&
          !state->check_explicit_uniform_location_allowed(loc, var))
         return;
      const struct gl_constants *consts = state->consts;
@ -3919,8 +3921,13 @@ apply_layout_qualifier_to_variable(const struct ast_type_qualifier *qual,
                       qual_string);
   }
-   if (qual->flags.q.explicit_location) {
+   bool force_explict_uniform_loc_zero =
-      apply_explicit_location(qual, var, state, loc);
+      state->ctx->Const.ForceExplicitUniformLocZero && qual->flags.q.uniform &&
      strcmp(state->ctx->Const.ForceExplicitUniformLocZero, var->name) == 0;
   if (qual->flags.q.explicit_location || force_explict_uniform_loc_zero) {
      apply_explicit_location(qual, var, state, loc,
                              force_explict_uniform_loc_zero);
      if (qual->flags.q.explicit_component) {
         unsigned qual_component;
@ -7667,6 +7674,7 @@ ast_process_struct_or_iface_block_members(ir_exec_list *instructions,
       * embedded structures in 1.10 only.
       */
      if (state->language_version != 110 &&
          !state->allow_glsl_embedded_structure_declarations &&
          decl_list->type->specifier->structure != NULL)
         _mesa_glsl_error(&loc, state,
                          "embedded structure declarations are not allowed");
--- a/src/compiler/glsl/gl_nir_linker.c
+++ b/src/compiler/glsl/gl_nir_linker.c
@ -1684,12 +1684,27 @@ cross_validate_globals(void *mem_ctx, const struct gl_constants *consts,
                     existing->data.mode == nir_var_mem_ssbo &&
                     existing->data.from_ssbo_unsized_array &&
                     glsl_get_gl_type(var->type) == glsl_get_gl_type(existing->type))) {
-                  linker_error(prog, "%s `%s' declared as type "
+
-                                 "`%s' and type `%s'\n",
+                  /* Relax precision matching on unused uniforms for early ES shaders */
-                                 gl_nir_mode_string(var),
+                  if (prog->IsES && !var->interface_type &&
-                                 var->name, glsl_get_type_name(var->type),
+                      !(existing->data.used && var->data.used) &&
-                                 glsl_get_type_name(existing->type));
+                      glsl_base_type_is_integer(glsl_get_gl_type(var->type)) == glsl_base_type_is_integer(glsl_get_gl_type(existing->type)) &&
-                  return;
+                      glsl_base_type_is_float(glsl_get_gl_type(var->type)) == glsl_base_type_is_float(glsl_get_gl_type(existing->type)) &&
                      prog->GLSL_Version < 300) {
                     linker_warning(prog, "%s `%s' declared as type "
                                    "`%s' and type `%s'\n",
                                    gl_nir_mode_string(var),
                                    var->name, glsl_get_type_name(var->type),
                                    glsl_get_type_name(existing->type));
                  } else {
                     linker_error(prog, "%s `%s' declared as type "
                                    "`%s' and type `%s'\n",
                                    gl_nir_mode_string(var),
                                    var->name, glsl_get_type_name(var->type),
                                    glsl_get_type_name(existing->type));
                     return;
                  }
               }
            }
         }
--- a/src/compiler/glsl/glsl_parser_extras.cpp
+++ b/src/compiler/glsl/glsl_parser_extras.cpp
@ -329,6 +329,8 @@ _mesa_glsl_parse_state::_mesa_glsl_parse_state(struct gl_context *_ctx,
      ctx->Const.AllowVertexTextureBias;
   this->allow_glsl_120_subset_in_110 =
      ctx->Const.AllowGLSL120SubsetIn110;
   this->allow_glsl_embedded_structure_declarations =
      ctx->Const.AllowGLSLEmbeddedStructureDeclarations;
   this->allow_builtin_variable_redeclaration =
      ctx->Const.AllowGLSLBuiltinVariableRedeclaration;
   this->ignore_write_to_readonly_var =
--- a/src/compiler/glsl/glsl_parser_extras.h
+++ b/src/compiler/glsl/glsl_parser_extras.h
@ -1023,6 +1023,7 @@ struct _mesa_glsl_parse_state {
   char *alias_shader_extension;
   bool allow_vertex_texture_bias;
   bool allow_glsl_120_subset_in_110;
   bool allow_glsl_embedded_structure_declarations;
   bool allow_builtin_variable_redeclaration;
   bool ignore_write_to_readonly_var;
--- a/src/compiler/glsl_types.h
+++ b/src/compiler/glsl_types.h
@ -676,6 +676,14 @@ glsl_type_is_e5m2(const glsl_type *t)
   return t->base_type == GLSL_TYPE_FLOAT_E5M2;
 }
 static inline bool
 glsl_type_is_nonnative_float(const glsl_type *t)
 {
   return t->base_type == GLSL_TYPE_BFLOAT16 ||
          t->base_type == GLSL_TYPE_FLOAT_E4M3FN ||
          t->base_type == GLSL_TYPE_FLOAT_E5M2;
 }
 static inline bool
 glsl_type_is_int_16_32_64(const glsl_type *t)
 {
--- a/Show more
+++ b/Show more