VERSION: bump for 26.0.2

docs: add release notes for 26.0.2
egl/device: fix the fix for explicit sw rejection in non-sw EGL_PLATFORM=device
2026-06-11 18:28:31 +02:00 · 2026-03-12 12:56:33 +01:00 · 2026-03-12 12:56:33 +01:00 · 2026-03-11 23:21:12 +01:00 · 2026-03-11 23:21:12 +01:00 · 2026-03-11 23:21:12 +01:00
374 changed files with 33306 additions and 4708 deletions
--- a/.ci-farms-disabled/lima
+++ b/.ci-farms-disabled/lima
--- a/.clang-format
+++ b/.clang-format
--- a/.gitlab-ci/build/gitlab-ci.yml
+++ b/.gitlab-ci/build/gitlab-ci.yml
@ -774,7 +774,7 @@ debian-riscv64:
 # While s390 is dead, s390x is very much alive, and one of the last major
 # big-endian platforms, so it provides useful coverage.
 # In case of issues with this job, contact @ajax
-debian-s390x:
+.debian-s390x:
  extends:
    - .meson-cross
    - .use-debian/s390x_build
@ -789,7 +789,7 @@ debian-s390x:
    DRI_LOADERS:
      -D glvnd=disabled

-debian-ppc64el:
+.debian-ppc64el:
  extends:
    - .meson-cross
    - .use-debian/ppc64el_build
--- a/.pick_status.json
+++ b/.pick_status.json
--- a/VERSION
+++ b/VERSION
@ -1 +1 @@
-26.0.0-devel
+26.0.2
--- a/bin/gen_release_notes.py
+++ b/bin/gen_release_notes.py
@ -385,5 +385,5 @@ async def main() -> None:


 if __name__ == "__main__":
-    loop = asyncio.get_event_loop()
+    loop = asyncio.new_event_loop()
    loop.run_until_complete(main())
--- a/bin/pick-ui.py
+++ b/bin/pick-ui.py
@ -27,7 +27,9 @@ from pick.ui import UI, PALETTE

 if __name__ == "__main__":
    u = UI()
-    evl = urwid.AsyncioEventLoop(loop=asyncio.new_event_loop())
+    asyncio_loop = asyncio.new_event_loop()
+    asyncio.set_event_loop(asyncio_loop)
+    evl = urwid.AsyncioEventLoop(loop=asyncio_loop)
    loop = urwid.MainLoop(u.render(), PALETTE, event_loop=evl, handle_mouse=False)
    u.mainloop = loop
    loop.run()
--- a/bin/pick/core.py
+++ b/bin/pick/core.py
@ -52,7 +52,7 @@ IS_FIX = re.compile(r'^\s*fixes:\s*([a-f0-9]{6,40})', flags=re.MULTILINE | re.IG
 IS_CC = re.compile(r'^\s*cc:\s*["\']?([0-9]{2}\.[0-9])?["\']?\s*["\']?([0-9]{2}\.[0-9])?["\']?\s*\<?mesa-stable',
                   flags=re.MULTILINE | re.IGNORECASE)
 IS_REVERT = re.compile(r'This reverts commit ([0-9a-f]{40})')
-IS_BACKPORT = re.compile(r'^\s*backport-to:\s*(\d{2}\.\d),?\s*(\d{2}\.\d)?',
+IS_BACKPORT = re.compile(r'^\s*backport-to:\s*(?:(\d{2}\.\d),?\s*(\d{2}\.\d)?|(\*))',
                         flags=re.MULTILINE | re.IGNORECASE)

 # XXX: hack
@ -295,7 +295,7 @@ async def resolve_nomination(commit: 'Commit', version: str) -> 'Commit':

    if backport_to := IS_BACKPORT.findall(commit_message):
        for match in backport_to:
-            if any(Version(version) >= Version(backport_version)
+            if any(backport_version == '*' or Version(version) >= Version(backport_version)
                   for backport_version in match if backport_version != ''):
                commit.nominated = True
                commit.nomination_type = NominationType.BACKPORT
--- a/bin/pick/core_test.py
+++ b/bin/pick/core_test.py
@ -263,7 +263,7 @@ class TestRE:
            """)

            backport_to = core.IS_BACKPORT.findall(message)
-            assert backport_to == [('19.2', '')]
+            assert backport_to == [('19.2', '', '')]

        def test_multiple_release_space(self):
            """Tests commit with more than one branch specified"""
@ -278,7 +278,7 @@ class TestRE:
            """)

            backport_to = core.IS_BACKPORT.findall(message)
-            assert backport_to == [('19.1', '19.2')]
+            assert backport_to == [('19.1', '19.2', '')]

        def test_multiple_release_comma(self):
            """Tests commit with more than one branch specified"""
@ -293,7 +293,7 @@ class TestRE:
            """)

            backport_to = core.IS_BACKPORT.findall(message)
-            assert backport_to == [('19.1', '19.2')]
+            assert backport_to == [('19.1', '19.2', '')]

        def test_multiple_release_lines(self):
            """Tests commit with more than one branch specified in mulitple tags"""
@ -305,7 +305,7 @@ class TestRE:
            """)

            backport_to = core.IS_BACKPORT.findall(message)
-            assert backport_to == [('19.0', ''), ('19.1', '19.2')]
+            assert backport_to == [('19.0', '', ''), ('19.1', '19.2', '')]


 class TestResolveNomination:
@ -405,6 +405,17 @@ class TestResolveNomination:
        assert c.nominated
        assert c.nomination_type is core.NominationType.BACKPORT

+    @pytest.mark.asyncio
+    async def test_backport_all_is_nominated(self):
+        s = self.FakeSubprocess(b'Backport-to: *')
+        c = core.Commit('abcdef1234567890', 'a commit')
+
+        with mock.patch('bin.pick.core.asyncio.create_subprocess_exec', s.mock):
+            await core.resolve_nomination(c, '0.0')
+
+        assert c.nominated
+        assert c.nomination_type is core.NominationType.BACKPORT
+
    @pytest.mark.asyncio
    async def test_backport_is_nominated_after(self):
        s = self.FakeSubprocess(b'Backport-to: 16.2')
--- a/bin/pick/requirements.txt
+++ b/bin/pick/requirements.txt
@ -1,3 +1,3 @@
-attrs==23.1.0
-packaging==25.0
-urwid==2.1.2
+attrs==25.4.0
+packaging==26.0
+urwid==3.0.3
--- a/bin/pick/ui.py
+++ b/bin/pick/ui.py
@ -224,6 +224,7 @@ class UI:
            if commit.nominated and commit.resolution is core.Resolution.UNRESOLVED:
                b = urwid.AttrMap(CommitWidget(self, commit), None, focus_map='reversed')
                self.commit_list.append(b)
+        self.mainloop.draw_screen()
        self.save()

    async def feedback(self, text: str) -> None:
@ -236,6 +237,7 @@ class UI:
            if c.base_widget is commit:
                del self.commit_list[i]
                break
+        self.mainloop.draw_screen()

    def save(self):
        core.save(itertools.chain(self.new_commits, self.previous_commits))
@ -246,6 +248,7 @@ class UI:

        def reset_cb(_) -> None:
            self.mainloop.widget = o
+            self.mainloop.draw_screen()

        async def apply_cb(edit: urwid.Edit) -> None:
            text: str = edit.get_edit_text()
@ -263,6 +266,7 @@ class UI:
                raise RuntimeError(f"Couldn't find {sha}")

            await commit.apply(self)
+            self.mainloop.draw_screen()

        q = urwid.Edit("Commit sha\n")
        ok_btn = urwid.Button('Ok')
@ -279,12 +283,14 @@ class UI:
        self.mainloop.widget = urwid.Overlay(
            urwid.Filler(box), o, 'center', ('relative', 50), 'middle', ('relative', 50)
        )
+        self.mainloop.draw_screen()

    def chp_failed(self, commit: 'CommitWidget', err: str) -> None:
        o = self.mainloop.widget

        def reset_cb(_) -> None:
            self.mainloop.widget = o
+            self.mainloop.draw_screen()

        t = urwid.Text(textwrap.dedent(f"""
            Failed to apply {commit.commit.sha} {commit.commit.description} with the following error:
@ -313,3 +319,4 @@ class UI:
        self.mainloop.widget = urwid.Overlay(
            urwid.Filler(box), o, 'center', ('relative', 50), 'middle', ('relative', 50)
        )
+        self.mainloop.draw_screen()
--- a/docs/relnotes.rst
+++ b/docs/relnotes.rst
@ -3,6 +3,9 @@ Release Notes

 The release notes summarize what's new or changed in each Mesa release.

+-  :doc:`26.0.2 release notes <relnotes/26.0.2>`
+-  :doc:`26.0.1 release notes <relnotes/26.0.1>`
+-  :doc:`26.0.0 release notes <relnotes/26.0.0>`
 -  :doc:`25.3.3 release notes <relnotes/25.3.3>`
 -  :doc:`25.3.2 release notes <relnotes/25.3.2>`
 -  :doc:`25.2.8 release notes <relnotes/25.2.8>`
@ -473,6 +476,9 @@ The release notes summarize what's new or changed in each Mesa release.
   :maxdepth: 1
   :hidden:

+   26.0.2 <relnotes/26.0.2>
+   26.0.1 <relnotes/26.0.1>
+   26.0.0 <relnotes/26.0.0>
   25.3.3 <relnotes/25.3.3>
   25.3.2 <relnotes/25.3.2>
   25.2.8 <relnotes/25.2.8>
--- a/docs/relnotes/26.0.0.rst
+++ b/docs/relnotes/26.0.0.rst
--- a/docs/relnotes/26.0.1.rst
+++ b/docs/relnotes/26.0.1.rst
@ -0,0 +1,247 @@
+Mesa 26.0.1 Release Notes / 2026-02-25
+======================================
+
+Mesa 26.0.1 is a bug fix release which fixes bugs found since the 26.0.0 release.
+
+Mesa 26.0.1 implements the OpenGL 4.6 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.6. OpenGL
+4.6 is **only** available if requested at context creation.
+Compatibility contexts may report a lower version depending on each driver.
+
+Mesa 26.0.1 implements the Vulkan 1.4 API, but the version reported by
+the apiVersion property of the VkPhysicalDeviceProperties struct
+depends on the particular driver being used.
+
+SHA checksums
+-------------
+
+::
+
+    SHA256: bb5104f9f9a46c9b5175c24e601e0ef1ab44ce2d0fdbe81548b59adc8b385dcc  mesa-26.0.1.tar.xz
+    SHA512: d47072257035acfa8a5594c0cda831b4e5178169dea8a06c6657268a441e32271f8798486e837cea23f35ce3f0b4b9520a4ea4ed26b0e1267b02da4c649bc9f9  mesa-26.0.1.tar.xz
+
+
+New features
+------------
+
+- None
+
+
+Bug fixes
+---------
+
+- Missing Haswell case after a097a3d214eda7fb7b9ff63176754b7260e09e03 leads to bogus assert in intel_perf_mdapi.c
+- Question: Does building Lavapipe on Windows require building "microsoft-experimental" as well?
+- [ANV]: Regression in dxvk Greedfall
+- [ANV][BMG] Building Mesa with Clang causes Missing Skin Textures in UE games - Tekken 8
+- [ANV][DG2][Regression]: Flickering water "boxes" in Civilization VII
+- [RADV] Killer7 has a blue tint with RDNA3/4
+- [bisected] Xe3 regression with piglit tess/barrier-patch.shader_test after cmod prop change
+- [radeonsi] Regression: GL_FEEDBACK returns 0.0 for X-coordinates (Legacy GL)
+- anv, bisected: Genshin Impact wrong shadows, flickering grass
+- turnip: llama.cpp: Running test-backend-ops results in segmentation fault
+- venus crashes in vn_CreateDevice() with latest mesa/main [bisected]
+
+
+Changes
+-------
+
+Aitor Camacho (7):
+
+- wsi/metal: Expose additional color spaces if instance extension enabled
+- kk: Fill pipelineUUID
+- kk: Fix shader uint32_t value serialization
+- kk: Correctly release pipeline handles at shader destroy
+- kk: Fix compute pipeline cache
+- kk: Move gfx pipeline data to the info struct within kk_shader
+- kk: Fix graphics pipeline serialization
+
+Alyssa Rosenzweig (1):
+
+- brw: drop buggy SLM optimization
+
+Anna Maniscalco (1):
+
+- freedreno/common: set has_astc_hdr true for a7xx targets
+
+Benjamin Otte (1):
+
+- lavapipe: Fix features for nonsubsampled ycbcr formats
+
+Daniel Schürmann (1):
+
+- nir/clone: Fix cloning indirect call instructions
+
+Danylo Piliaiev (1):
+
+- ir3: Align TCS per-patch output to 64 bytes to prevent stale reads
+
+Emma Anholt (1):
+
+- ir3/ra: Fix DOUBLE_ONLY limit pressure computation.
+
+Eric Engestrom (5):
+
+- docs: add sha sum for 26.0.0
+- .pick_status.json: Update to 03d2cc2b2ae5341409ee1fab74e98134a6df0511
+- bin/gen_release_notes: fix support for python 3.14
+- pick-ui: add \`Backport-to: \*` as a synonym to \`Cc: mesa-stable`
+- .pick_status.json: Mark 7dd7731ac710b0c7213f6bb466b33f62eca80604 as denominated
+
+Faith Ekstrand (6):
+
+- pan/clear: Stop packing undefined bits in colors
+- nir/gather_info: Add support for panfrost tile load/store intrinsics
+- panvk: Create both Z/S descriptors, even for separate Z/S
+- panvk/preload: Stop assuming 32 registers
+- panvk/jm: Refactor BeginRendering()
+- panvk: Also load output attachments with LOAD_OP_NONE+STORE_OP_NONE
+
+Frank Binns (2):
+
+- pvr/ci: move some timing out tests from fails to skips
+- pvr: Fix alloc callbacks usage when freeing frame buffers
+
+Ian Romanick (8):
+
+- spirv: Use STACK_ARRAY instead of NIR_VLA
+- nir: Use STACK_ARRAY instead of NIR_VLA
+- brw: Call nir_opt_algebraic_late in brw_nir_create_raygen_trampoline
+- brw: Call nir_opt_algebraic_late later in brw_postprocess_nir_opts
+- elk: Call nir_opt_algebraic_late in elk_postprocess_nir
+- brw/cmod: Don't propagate from CMP to ADD if there is a write between
+- elk/cmod: Don't propagate from CMP to possible Inf + (-Inf)
+- elk/cmod: Don't propagate from CMP to ADD if there is a write between
+
+Janne Grunau (3):
+
+- asahi: Use GPU for buffer copies in resource_copy_region()
+- asahi: Implement clear_buffer using libagx_fill*
+- hk: Use aligned vector fill in hk_CmdFillBuffer if possible
+
+Jarred Davies (2):
+
+- pvr: Fix allocating the required scratch buffer space for tile buffers
+- pvr: Add missing support for tile buffers to SPM EOT programs
+
+Jesse Natalie (1):
+
+- meson: Include DirectX-Headers dependency for all VK Windows builds
+
+Jianxun Zhang (1):
+
+- anv: Limit modifier disabling workaround to specific GTK versions
+
+José Roberto de Souza (1):
+
+- intel/perf: Add HSW verx10 to intel_perf_query_result_write_mdapi()
+
+Juston Li (1):
+
+- anv: set missing protected bit for protected depth/stencil surfaces
+
+Konstantin Seurer (2):
+
+- radv: Fix setting the viewport for depth stencil FS resolves
+- vulkan/cmd_queue: Fixup stride for multi draws
+
+Lars-Ivar Hesselberg Simonsen (2):
+
+- panvk: Fix dcd_flags1 dirty bit
+- pan/genxml/v13: Fix HSR Prepass typo
+
+Leon Perianu (1):
+
+- pvr: fix format table properties duplicate
+
+Lionel Landwerlin (8):
+
+- anv: flush render caches on first pipeline select
+- anv: fix nested command buffer relocations
+- anv: add missing constant cache invalidation for descriptor buffers
+- isl: fix 32bit math with 4GB buffer size
+- anv: apply the same ccs disabling for Xe3 than Xe2
+- anv: disable ccs modifier reporting when ccs modifiers are disabled
+- anv: dirty descriptors after blorp operations
+- anv: remove snprintf for aux op transition
+
+Mary Guillemard (1):
+
+- hk: Fix crash in hk_handle_passthrough_gs
+
+Matt Turner (4):
+
+- brw/cse: fix \`operands_match` corrupting non-IMM register data
+- brw/cse: use copies in \`operands_match` instead of in-place modification
+- elk/cse: fix \`operands_match` corrupting non-IMM register data
+- elk/cse: use copies in \`operands_match` instead of in-place modification
+
+Mike Blumenkrantz (2):
+
+- zink: fix broken compiler assert
+- zink: only do pre-sync transfer barrier after a renderpass
+
+Natalie Vock (3):
+
+- radv/rt: Only use ds_bvh_stack_rtn if the stack base is possible to encode
+- radv: Initialize nir_lower_io_to_scalar progress variable
+- radv/nir: Correctly handle workgroup sizes not aligned to 32
+
+Nick Hamilton (5):
+
+- pvr: Fix incorrect subpass merging optimisation
+- pvr: Rename pvr_render_input_attachment
+- pvr: Add missing support for preserve attachments
+- pvr: Update CI fails list after render pass fixes
+- pvr: Add support for fragment pass through shader
+
+Olivia Lee (1):
+
+- hk: fix passthrough GS key invalidation
+
+Pavel Ondračka (2):
+
+- r300: align macro-tiled stride-addressed textures in X
+- mesa: implement FRAMEBUFFER_RENDERABLE internalformat query
+
+Rhys Perry (3):
+
+- aco: fix gfx6-8 store_scratch() with function calls
+- aco: reset all vgpr_used_by_vmem\_ in resolve_all_gfx11
+- aco: resolve hazards before calls
+
+Robert Mader (1):
+
+- lavapipe: enable dmabuf import for planar drm formats
+
+Ryan Zhang (1):
+
+- panvk: guard against NULL pointers to avoid crash
+
+Samuel Pitoiset (5):
+
+- ac,radv,radeonsi: use correct swizzle/pitch for depth-only images with SDMA
+- radv: fix potential corruption after FMASK decompression on GFX6-8
+- radv/meta: fix depth/stencil resolves with different regions
+- ac/nir: fix writemask for dual source blending on GFX11+
+- radv: fix potential GPU hangs with secondaries on transfer queue
+
+Tapani Pälli (1):
+
+- util: bring back fix to avoid strict aliasing bugs in xxhash
+
+Timothy Arceri (2):
+
+- mesa: add _mesa_lookup_state_param_idx() helper
+- st/glsl_to_nir: make sure the variant has the correct locations set
+
+Wei Hao (1):
+
+- radeonsi: fix threaded shader compilation finishing after context is destroyed
+
+Yiwei Zhang (2):
+
+- venus: workaround a gcc-15 dead store elimination (DSE) bug
+- venus: sync protocol for strict aliasing compliance
--- a/docs/relnotes/26.0.2.rst
+++ b/docs/relnotes/26.0.2.rst
@ -0,0 +1,238 @@
+Mesa 26.0.2 Release Notes / 2026-03-12
+======================================
+
+Mesa 26.0.2 is a bug fix release which fixes bugs found since the 26.0.1 release.
+
+Mesa 26.0.2 implements the OpenGL 4.6 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.6. OpenGL
+4.6 is **only** available if requested at context creation.
+Compatibility contexts may report a lower version depending on each driver.
+
+Mesa 26.0.2 implements the Vulkan 1.4 API, but the version reported by
+the apiVersion property of the VkPhysicalDeviceProperties struct
+depends on the particular driver being used.
+
+SHA checksums
+-------------
+
+::
+
+    TBD.
+
+
+New features
+------------
+
+- None
+
+
+Bug fixes
+---------
+
+- 26.0.1 fails to build: \`create_context.c: error: 'struct glx_screen' has no member named 'frontend_screen'`
+- A770: Counter-Strike 2 visual glitches (regression)
+- Bisected regression: Assertion texObj->pt == view->texture failed.
+- Kodi regression with panthor >= 1.7 after updating to Linux 7.0-rc1
+- MDK2 HD (opengl) has most elements rendered as black
+- Mesa 25.3 amdgpu memory issue
+- OpenGL 4.1 VRAM Memory Leak with setting uniform variables
+- Panfrost Bifrost compiler assertion failure: wrong vectorization in bi_alu_src_index (Mesa 26.0.0)
+- RADV: RDNA4 visual corruption in DX11 (DXVK) – Mafia III character model glitches, AMDVLK renders correctly (9070XT)
+- [radeonsi] Regression: GL_FEEDBACK returns 0.0 for X-coordinates (Legacy GL)
+- glsl: spec\@glsl-es-1.00\@linker\@glsl-mismatched-uniform-precision-unused broken
+- ir3: ir3_get_predicate() vs &ctx->build
+- r300 , regression , bisected : Glitches with Sauerbraten
+- r300: HiZ related dEQP failures
+
+
+Changes
+-------
+
+Anna Maniscalco (1):
+
+- zink: don't care about generated gs output primitive
+
+Benjamin Cheng (1):
+
+- radeonsi/vcn: Use full pitch for pre-encode input
+
+Boris Brezillon (1):
+
+- pan/kmod: Allow mmap() on foreign buffers
+
+Caio Oliveira (4):
+
+- spirv: Refactor ALU opcode translation to take bit sizes
+- spirv: Pull constant source fixup to the existing loop
+- spirv: Fix spec constant to handle Select for non-native floats
+- nir: Fix constant folding for iadd_sat
+
+Christoph Pillmayer (2):
+
+- pan/bi: Fix coupling spill placement
+- pan/bi: Move FAUs to memory for memory phis
+
+Connor Abbott (4):
+
+- tu: Use HW offset 0 in 3d loads/clears with FDM
+- ir3: Fix constlen trimming when more than one stage is trimmed
+- tu: Set polygon mode when blitting
+- tu: Fix setting will_be_resolved with MSRTSS
+
+Danylo Piliaiev (2):
+
+- tu: Store gmem attachments after custom resolve in dyn RP
+- tu: Don't read .patch_input_gmem of unused attachment
+
+David Rosca (1):
+
+- vl: Also disable MPEG2 Main profile when mpeg12 decode is disabled
+
+Eric Engestrom (3):
+
+- docs: add sha sum for 26.0.1
+- fixup! docs: add release notes for 26.0.1
+- .pick_status.json: Update to 73dba1e15173ff6109925de9615f9d9f5cccc2be
+
+Eric R. Smith (1):
+
+- pco: fix a typo in the check for optimization looping
+
+Erik Faye-Lund (1):
+
+- gallium/dri: set LIBVA_DRIVERS_PATH in devenv
+
+Faith Ekstrand (3):
+
+- etnaviv: Call lower_bool_to_int32 not to_bitsize
+- nir/lower_bool_to_bitsize: Make all bN_csel sources match
+- pan/bi: Be more careful about bit sizes in b2f lowering
+
+Georg Lehmann (3):
+
+- ci: disable debian-ppc64el and debian-s390x
+- aco/insert_fp_mode: don't skip setting round for fract
+- nir/opt_algebraic: fix frsq clamp pattern
+
+Ian Romanick (5):
+
+- brw: Don't mark_invalid in update_for_reads for non-VGRF destination
+- brw: Use brw_reg_is_arf in update_for_reads
+- brw: Also check for ADDRESS file in update_for_reads
+- brw/algebraic: Don't optimize SEL.L.SAT or SEL.G.SAT
+- elk/algebraic: Don't optimize SEL.L.SAT or SEL.G.SAT
+
+Icenowy Zheng (1):
+
+- pvr: only specially handle gfx subcmd for BeginQuery
+
+Iván Briano (1):
+
+- anv: don't try to fast clear D/S with multiview
+
+Jesse Natalie (1):
+
+- d3d12: Fix importing external resources
+
+Job Noorman (2):
+
+- ir3: update context builder after ir3_get_predicate
+- ir3: don't predicate vote_all/vote_any
+
+Jose Maria Casanova Crespo (3):
+
+- v3d: flush write jobs before BO replacement in DISCARD_WHOLE path
+- vc4: flush write jobs before BO replacement in DISCARD_WHOLE path
+- v3d: reject fast TLB blit when RT formats don't match
+
+Karol Herbst (2):
+
+- nir: fix nir_alu_type_range_contains_type_range for fp16 to int
+- nir: fix nir_round_int_to_float for fp16
+
+Lionel Landwerlin (2):
+
+- anv: add missing handling for attachment locations in secondaries
+- anv: dirty all push constant stages in simple shader
+
+Lucas Fryzek (5):
+
+- drisw: Properly mark shmid as -1 when alloc fails
+- x11: Add helper util to check for xshm support
+- egl/dri: Check that xshm can be attached
+- glx: Check that xshm can be attached
+- vulkan/wsi: Check that xshm can be attached
+
+Luigi Santivetti (1):
+
+- zink: fix format conversion logic for the alpha emulation case
+
+Marek Olšák (1):
+
+- ac: set the correct number of Z planes for ALLOW_EXPCLEAR
+
+Mary Guillemard (1):
+
+- vulkan: Do not override the shader_flags in case of no task shader
+
+Mel Henning (1):
+
+- driconf: force_vk_vendor on No Man's Sky + NVK
+
+Mike Blumenkrantz (4):
+
+- zink: add TRANSFER_WRITE -> HOST_READ sync to end of batch
+- st/bitmap: only release YUV samplerviews
+- radv: fix multiview fast clears
+- egl/device: fix the fix for explicit sw rejection in non-sw EGL_PLATFORM=device
+
+Patrick Lerda (1):
+
+- r600: fix cs atomic operations when the shader is called multiple times
+
+Pavel Ondračka (3):
+
+- r300: copy target when merging alpha output instruction
+- r300: disable HiZ for PIPE_FUNC_ALWAYS
+- r300: disable clip-discard watermark for triangles
+
+Pierre-Eric Pelloux-Prayer (2):
+
+- frontends/va: fix undefined ref error
+- mesa: don't wraparound st_context::work_counter
+
+Rhys Perry (2):
+
+- aco: perform dce for blocks skipped for process_block()
+- nir/range_analysis: set deleted key
+
+Sagar Ghuge (1):
+
+- anv: Fix Wa_14021821874, Wa_14018813551, Wa_14026600921
+
+Samuel Pitoiset (4):
+
+- radv: fix copying images with different swizzle modes on SDMA7
+- radv: fix a GPU hang with PS epilogs and secondary command buffers
+- radv: fix local invocation index for mesh/task and quad derivatives on GFX12
+- radv: fix missing L2 cache invalidation with streamout on GFX12
+
+Tapani Pälli (2):
+
+- intel/dev: update mesa_defs.json from workaround database
+- anv: add handling for Wa_14026600921
+
+Timothy Arceri (5):
+
+- glsl: relax precision matching on unused uniforms ES
+- glsl: add workaround for MDK2 HD
+- mesa/st: use same path for setting state ref locations
+- st/glsl_to_nir: update state var locations earlier
+- glx: guard glx_screen frontend_screen member
+
+Yiwei Zhang (2):
+
+- pan: fix to not clear out of bitset range
+- lvp: avoid advertising dmabuf support for kms_swrast
--- a/docs/relnotes/new_features.txt
+++ b/docs/relnotes/new_features.txt
@ -1,32 +0,0 @@
-VK_KHR_relaxed_block_layout on pvr
-VK_KHR_storage_buffer_storage_class on pvr
-VK_EXT_external_memory_acquire_unmodified on panvk
-VK_EXT_discard_rectangles on NVK
-VK_KHR_present_id on HoneyKrisp
-VK_KHR_present_id2 on HoneyKrisp
-VK_KHR_present_wait on HoneyKrisp
-VK_KHR_present_wait2 on HoneyKrisp
-VK_KHR_maintenance10 on ANV, NVK, RADV
-VK_EXT_shader_uniform_buffer_unsized_array on ANV, HK, NVK, RADV
-VK_EXT_device_memory_report on panvk
-VK_VALVE_video_encode_rgb_conversion on radv
-VK_EXT_custom_resolve on RADV
-GL_EXT_shader_pixel_local_storage on Panfrost v6+
-VK_EXT_image_drm_format_modifier on panvk/v7
-VK_KHR_sampler_ycbcr_conversion on panvk/v7
-sparseResidencyImage2D on panvk v10+
-sparseResidencyStandard2DBlockShape on panvk v10+
-VK_KHR_surface_maintenance1 promotion everywhere EXT is exposed
-VK_KHR_swapchain_maintenance1 promotion everywhere EXT is exposed
-VK_KHR_dynamic_rendering on PowerVR
-VK_EXT_multisampled_render_to_single_sampled on panvk
-VK_KHR_pipeline_binary on HoneyKrisp
-VK_KHR_incremental_present on pvr
-VK_KHR_xcb_surface on pvr
-VK_KHR_xlib_surface on pvr
-VK_KHR_robustness2 on panvk v10+
-VK_KHR_robustness2 on HoneyKrisp
-VK_KHR_robustness2 on hasvk
-VK_KHR_robustness2 on NVK
-VK_KHR_robustness2 on Turnip
-VK_KHR_robustness2 on lavapipe
--- a/docs/submittingpatches.rst
+++ b/docs/submittingpatches.rst
@ -197,6 +197,9 @@ following example::
 This will backport the commit to the 21.0 branch, as well as any more recent
 stable branch. Multiple ``Backport-to:`` lines are allowed, but only the
 lowest number mentioned actually matters, so for clarity, please only use one.
+You can also use the special ``Backport-to: *`` tag, which nominates the commit
+for backporting to every active stable branch, making it a synonym for the
+``Cc: mesa-stable`` tag described below.

 The last option is deprecated and mostly here for historical reasons
 dating back to when patch submission was done via emails: using a ``Cc:``
--- a/meson.build
+++ b/meson.build
@ -642,7 +642,7 @@ if with_dri
 endif

 dep_dxheaders = null_dep
-if with_gallium_d3d12 or with_microsoft_clc or with_microsoft_vk or with_gfxstream_vk and host_machine.system() == 'windows'
+if with_gallium_d3d12 or with_microsoft_clc or with_microsoft_vk or (with_any_vk and host_machine.system() == 'windows')
  dep_dxheaders = dependency('directx-headers', required : false)
  if not dep_dxheaders.found()
    dep_dxheaders = dependency('DirectX-Headers',
@ -1931,7 +1931,6 @@ dep_spirv_tools = dependency(
  'SPIRV-Tools',
  required : with_spirv_tools,
  version : '>= 2024.1',
-  static : host_machine.system() == 'darwin',
 )
 if dep_spirv_tools.found()
  pre_args += '-DHAVE_SPIRV_TOOLS'
--- a/src/amd/ci/radeonsi-mendocino-fails.txt
+++ b/src/amd/ci/radeonsi-mendocino-fails.txt
@ -401,7 +401,6 @@ spec@egl 1.4@eglterminate then unbind context,Fail
 spec@egl_khr_surfaceless_context@viewport,Fail
 spec@egl_mesa_configless_context@basic,Fail
 spec@ext_external_objects@vk-ping-pong-single-sem,Crash
-spec@glsl-es-1.00@linker@glsl-mismatched-uniform-precision-unused,Fail
 spec@glsl-es-3.00@execution@built-in-functions@fs-packhalf2x16,Fail
 spec@glsl-es-3.00@execution@built-in-functions@vs-packhalf2x16,Fail
 spec@khr_texture_compression_astc@miptree-gles srgb-fp,Fail
--- a/src/amd/ci/radeonsi-raven-fails.txt
+++ b/src/amd/ci/radeonsi-raven-fails.txt
@ -71,5 +71,4 @@ program@run kernel with max work item sizes,Fail
 # uprev Piglit in Mesa
 spec@ext_external_objects@vk-semaphores,Crash
 spec@ext_external_objects@vk-semaphores-2,Crash
-spec@glsl-es-1.00@linker@glsl-mismatched-uniform-precision-unused,Fail

--- a/src/amd/ci/radeonsi-stoney-fails.txt
+++ b/src/amd/ci/radeonsi-stoney-fails.txt
@ -121,7 +121,6 @@ spec@ext_texture_srgb@texwrap formats-s3tc bordercolor-swizzled@GL_COMPRESSED_SR
 spec@ext_texture_srgb@texwrap formats-s3tc bordercolor-swizzled@GL_COMPRESSED_SRGB_S3TC_DXT1_EXT- swizzled- border color only,Fail
 spec@glsl-1.50@execution@geometry@tri-strip-ordering-with-prim-restart gl_triangle_strip_adjacency ffs,Fail
 spec@glsl-1.50@execution@geometry@tri-strip-ordering-with-prim-restart gl_triangle_strip_adjacency other,Fail
-spec@glsl-es-1.00@linker@glsl-mismatched-uniform-precision-unused,Fail
 spec@glsl-es-3.00@execution@built-in-functions@fs-packhalf2x16,Fail
 spec@glsl-es-3.00@execution@built-in-functions@vs-packhalf2x16,Fail
 spec@khr_texture_compression_astc@miptree-gl srgb-fp,Fail
--- a/src/amd/ci/radeonsi-vangogh-fails.txt
+++ b/src/amd/ci/radeonsi-vangogh-fails.txt
@ -14,7 +14,6 @@ spec@egl_khr_surfaceless_context@viewport,Fail
 spec@ext_external_objects@vk-image-display,Crash
 spec@ext_external_objects@vk-semaphores,Crash
 spec@ext_external_objects@vk-semaphores-2,Crash
-spec@glsl-es-1.00@linker@glsl-mismatched-uniform-precision-unused,Fail
 spec@glsl-es-3.00@execution@built-in-functions@fs-packhalf2x16,Fail
 spec@glsl-es-3.00@execution@built-in-functions@vs-packhalf2x16,Fail
 spec@khr_texture_compression_astc@miptree-gles srgb-fp,Fail
--- a/src/amd/common/ac_cmdbuf_sdma.c
+++ b/src/amd/common/ac_cmdbuf_sdma.c
@ -222,10 +222,12 @@ static uint32_t
 ac_sdma_get_tiled_info_dword(const struct radeon_info *info,
                             const struct ac_sdma_surf_tiled *tiled)
 {
-   const uint32_t swizzle_mode = tiled->surf->has_stencil ? tiled->surf->u.gfx9.zs.stencil_swizzle_mode
-                                                          : tiled->surf->u.gfx9.swizzle_mode;
-   const uint16_t epitch = tiled->surf->has_stencil ? tiled->surf->u.gfx9.zs.stencil_epitch
-                                                    : tiled->surf->u.gfx9.epitch;
+   const uint32_t swizzle_mode =
+      tiled->is_stencil ? tiled->surf->u.gfx9.zs.stencil_swizzle_mode
+                        : tiled->surf->u.gfx9.swizzle_mode;
+   const uint16_t epitch =
+      tiled->is_stencil ? tiled->surf->u.gfx9.zs.stencil_epitch
+                        : tiled->surf->u.gfx9.epitch;
   const enum gfx9_resource_type dimension =
      ac_sdma_get_tiled_resource_dim(info->sdma_ip_version, tiled);
   const uint32_t mip_max = MAX2(tiled->num_levels, 1);
--- a/src/amd/common/ac_cmdbuf_sdma.h
+++ b/src/amd/common/ac_cmdbuf_sdma.h
@ -61,6 +61,7 @@ struct ac_sdma_surf_tiled {
   uint64_t va;
   enum pipe_format format;
   uint32_t bpp;
+   bool is_stencil;

   struct {
      uint32_t x;
--- a/src/amd/common/ac_descriptors.c
+++ b/src/amd/common/ac_descriptors.c
@ -1055,8 +1055,15 @@ ac_init_ds_surface(const struct radeon_info *info, const struct ac_ds_state *sta

 static unsigned
 ac_get_decompress_on_z_planes(const struct radeon_info *info, enum pipe_format format, uint8_t log_num_samples,
-                              bool htile_stencil_disabled, bool no_d16_compression)
+                              bool tc_compat_htile_enabled, bool htile_stencil_disabled, bool no_d16_compression,
+                              bool z_allow_expclear)
 {
+   if (info->gfx_level < GFX8)
+      return 0;
+
+   if (!tc_compat_htile_enabled)
+      return z_allow_expclear ? 15 : 0;
+
   uint32_t max_zplanes = 0;

   if (info->gfx_level >= GFX9) {
@ -1073,6 +1080,7 @@ ac_get_decompress_on_z_planes(const struct radeon_info *info, enum pipe_format f
         max_zplanes = 1;

      max_zplanes++;
+      assert(max_zplanes != 1); /* 1 is invalid and can cause corruption on gfx11-11.5 */
   } else {
      if (format == PIPE_FORMAT_Z16_UNORM && no_d16_compression) {
         /* Do not enable Z plane compression for 16-bit depth
@ -1093,6 +1101,7 @@ ac_get_decompress_on_z_planes(const struct radeon_info *info, enum pipe_format f
      }
   }

+   assert(max_zplanes != 10 && max_zplanes != 13); /* disallowed values */
   return max_zplanes;
 }

@ -1115,14 +1124,18 @@ ac_set_mutable_ds_surface_fields(const struct radeon_info *info, const struct ac
      log_num_samples = G_028040_NUM_SAMPLES(ds->db_z_info);
   }

+   bool z_allow_expclear = info->gfx_level <= GFX11_5 &&
+                           G_028038_ALLOW_EXPCLEAR(ds->db_z_info);
+
   const uint32_t max_zplanes =
      ac_get_decompress_on_z_planes(info, state->format, log_num_samples,
-                                    tile_stencil_disable, state->no_d16_compression);
+                                    state->tc_compat_htile_enabled, tile_stencil_disable,
+                                    state->no_d16_compression, z_allow_expclear);

   if (info->gfx_level >= GFX9) {
-      if (state->tc_compat_htile_enabled) {
-         ds->db_z_info |= S_028038_DECOMPRESS_ON_N_ZPLANES(max_zplanes);
+      ds->db_z_info |= S_028038_DECOMPRESS_ON_N_ZPLANES(max_zplanes);

+      if (state->tc_compat_htile_enabled) {
         if (info->gfx_level >= GFX10) {
            const bool iterate256 = log_num_samples >= 1;

@ -1138,12 +1151,13 @@ ac_set_mutable_ds_surface_fields(const struct radeon_info *info, const struct ac

      ds->db_z_info |= S_028038_ZRANGE_PRECISION(state->zrange_precision);
   } else {
-      if (state->tc_compat_htile_enabled) {
-         ds->u.gfx6.db_htile_surface |= S_028ABC_TC_COMPATIBLE(1);
+      if (info->gfx_level >= GFX8)
         ds->db_z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(max_zplanes);
-      } else {
+
+      if (state->tc_compat_htile_enabled)
+         ds->u.gfx6.db_htile_surface |= S_028ABC_TC_COMPATIBLE(1);
+      else
         ds->u.gfx6.db_depth_info |= S_02803C_ADDR5_SWIZZLE_MASK(1);
-      }

      ds->db_z_info |= S_028040_ZRANGE_PRECISION(state->zrange_precision);
   }
--- a/src/amd/common/ac_sqtt.c
+++ b/src/amd/common/ac_sqtt.c
@ -49,6 +49,8 @@ ac_sqtt_get_data_va(const struct radeon_info *rad_info, const struct ac_sqtt *da
 void
 ac_sqtt_init(struct ac_sqtt *data)
 {
+   simple_mtx_init(&data->lock, mtx_plain);
+
   list_inithead(&data->rgp_pso_correlation.record);
   simple_mtx_init(&data->rgp_pso_correlation.lock, mtx_plain);

@ -71,6 +73,8 @@ ac_sqtt_init(struct ac_sqtt *data)
 void
 ac_sqtt_finish(struct ac_sqtt *data)
 {
+   simple_mtx_destroy(&data->lock);
+
   assert(data->rgp_pso_correlation.record_count == 0);
   simple_mtx_destroy(&data->rgp_pso_correlation.lock);

--- a/src/amd/common/ac_sqtt.h
+++ b/src/amd/common/ac_sqtt.h
@ -15,6 +15,7 @@
 #include "ac_pm4.h"
 #include "ac_rgp.h"
 #include "amd_family.h"
+#include "util/simple_mtx.h"

 #define SQTT_BUFFER_ALIGN_SHIFT 12

@ -61,6 +62,8 @@ struct ac_sqtt {
   struct rgp_clock_calibration rgp_clock_calibration;

   struct hash_table_u64 *pipeline_bos;
+
+   simple_mtx_t lock;
 };

 struct ac_sqtt_data_info {
--- a/src/amd/common/nir/ac_nir_lower_ps_late.c
+++ b/src/amd/common/nir/ac_nir_lower_ps_late.c
@ -443,10 +443,14 @@ emit_ps_color_export(nir_builder *b, lower_ps_state *s, unsigned output_index, u
   }
   }

-   s->exp[s->exp_num++] = nir_export_amd(b, nir_vec(b, outputs, 4),
-                                         .base = target,
-                                         .write_mask = write_mask,
-                                         .flags = flags);
+   nir_intrinsic_instr *exp = nir_export_amd(b, nir_vec(b, outputs, 4),
+                                             .base = target,
+                                             .flags = flags);
+
+   /* Set the writemask explicitly because write_mask=0 means full write mask. */
+   nir_intrinsic_set_write_mask(exp, write_mask);
+
+   s->exp[s->exp_num++] = exp;
   return true;
 }

@ -483,7 +487,7 @@ emit_ps_dual_src_blend_swizzle(nir_builder *b, lower_ps_state *s, unsigned first

   uint32_t mrt0_write_mask = nir_intrinsic_write_mask(mrt0_exp);
   uint32_t mrt1_write_mask = nir_intrinsic_write_mask(mrt1_exp);
-   uint32_t write_mask = mrt0_write_mask & mrt1_write_mask;
+   uint32_t write_mask = mrt0_write_mask | mrt1_write_mask;

   nir_def *mrt0_arg = mrt0_exp->src[0].ssa;
   nir_def *mrt1_arg = mrt1_exp->src[0].ssa;
--- a/src/amd/compiler/README-ISA.md
+++ b/src/amd/compiler/README-ISA.md
@ -216,6 +216,11 @@ the correct layout is:
 VOP2 `v_pk_fmac_f16`. But like all other packed math opcodes, DPP does not function in practice.
 RDNA1 and RDNA2 support `v_pk_fmac_f16_dpp`.

+## DPP with integer `subrev` and shifts
+
+No documentation mentions this, but DPP is seemingly applied to src1 instead of src0 for
+integer reverse subtract and shift opcodes.
+
 ## ds_swizzle_b32 rotate/fft modes

 These are first mentioned in the GFX9 (Vega) ISA doc, information from the LLVM bug tracker
--- a/src/amd/compiler/aco_insert_NOPs.cpp
+++ b/src/amd/compiler/aco_insert_NOPs.cpp
@ -1867,6 +1867,8 @@ resolve_all_gfx11(State& state, NOP_ctx_gfx11& ctx,
       ctx.vgpr_used_by_vmem_bvh.any()) {
      waitcnt_depctr &= 0xffe3;
      ctx.vgpr_used_by_vmem_load.reset();
+      ctx.vgpr_used_by_vmem_sample.reset();
+      ctx.vgpr_used_by_vmem_bvh.reset();
      ctx.vgpr_used_by_vmem_store.reset();
      ctx.vgpr_used_by_ds.reset();
   }
@ -1912,7 +1914,9 @@ handle_block(Program* program, Ctx& ctx, Block& block)
      Handle(state, ctx, instr, block.instructions);

      /* Resolve all possible hazards (we don't know what s_setpc_b64 jumps to). */
-      if (instr->opcode == aco_opcode::s_setpc_b64) {
+      if (instr->opcode == aco_opcode::s_setpc_b64 || instr->opcode == aco_opcode::s_swappc_b64 ||
+          instr->opcode == aco_opcode::s_call_b64) {
+         found_end |= instr->opcode == aco_opcode::s_setpc_b64;
         block.instructions.emplace_back(std::move(instr));

         std::vector<aco_ptr<Instruction>> resolve_instrs;
@ -1920,8 +1924,6 @@ handle_block(Program* program, Ctx& ctx, Block& block)
         block.instructions.insert(std::prev(block.instructions.end()),
                                   std::move_iterator(resolve_instrs.begin()),
                                   std::move_iterator(resolve_instrs.end()));
-
-         found_end = true;
         continue;
      }

--- a/src/amd/compiler/aco_insert_exec_mask.cpp
+++ b/src/amd/compiler/aco_insert_exec_mask.cpp
@ -484,10 +484,17 @@ process_instructions(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instructio
         Operand exit_cond = Operand(exec, bld.lm);

         if (state == Exact) {
-            assert(info.exec.size() == 1);
-            bld.sop2(Builder::s_andn2, Definition(exec, bld.lm), bld.def(s1, scc), info.exec[0].op,
-                     src);
-            info.exec[0].op = Operand(exec, bld.lm);
+            bld.sop2(Builder::s_andn2, Definition(exec, bld.lm), bld.def(s1, scc),
+                     info.exec.back().op, src);
+            info.exec.back().op = Operand(exec, bld.lm);
+
+            /* Although this is in uniform CF, it might be a loop without back-edge.
+             * Update the loop restore mask as well.
+             */
+            for (unsigned i = 0; i < info.exec.size() - 1; i++) {
+               assert(info.exec[i + 1].type & mask_type_loop);
+               info.exec[i].op = bld.copy(bld.def(bld.lm), Operand(exec, bld.lm));
+            }
         } else {
            Temp cond = bld.tmp(s1);
            info.exec[0].op = bld.sop2(Builder::s_andn2, bld.def(bld.lm), Definition(cond, scc),
--- a/src/amd/compiler/aco_insert_fp_mode.cpp
+++ b/src/amd/compiler/aco_insert_fp_mode.cpp
@ -233,9 +233,6 @@ instr_ignores_round_mode(const Instruction* instr)
   case aco_opcode::v_rndne_f64:
   case aco_opcode::v_rndne_f32:
   case aco_opcode::v_rndne_f16:
-   case aco_opcode::v_fract_f64:
-   case aco_opcode::v_fract_f32:
-   case aco_opcode::v_fract_f16:
   case aco_opcode::s_min_f32:
   case aco_opcode::s_min_f16:
   case aco_opcode::s_max_f32:
@ -454,16 +451,16 @@ emit_set_mode_block(fp_mode_ctx* ctx, Block* block)
      for (uint32_t pred : block->linear_preds)
         max_pred = MAX2(max_pred, pred);

-      assert(max_pred != 0);
-
-      mode_mask to_set = 0;
-      /* Check if the any mode was changed during the loop. */
-      u_foreach_bit (i, fp_state.required) {
-         if (ctx->last_set[i] <= max_pred)
-            to_set |= BITFIELD_BIT(i);
+      if (max_pred >= block->index) {
+         mode_mask to_set = 0;
+         /* Check if any mode was changed during the loop. */
+         u_foreach_bit (i, fp_state.required) {
+            if (ctx->last_set[i] <= max_pred)
+               to_set |= BITFIELD_BIT(i);
+         }
+         if (to_set)
+            set_mode(ctx, block, fp_state, 0, to_set);
      }
-      if (to_set)
-         set_mode(ctx, block, fp_state, 0, to_set);
   }

   ctx->block_states[block->index] = fp_state;
--- a/src/amd/compiler/aco_ir.cpp
+++ b/src/amd/compiler/aco_ir.cpp
@ -391,6 +391,65 @@ convert_to_SDWA(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr)
   return tmp;
 }

+/* Returns whether "opcode" can be used with DPP on "gfx_level".
+ *
+ * Only the opcode is inspected here. can_use_DPP() does its instruction-level checks first and
+ * then defers to this function, passing instr->isVOP3P() as "vop3p".
+ *
+ * - The reverse subtract/shift group and the second "return false" group never support DPP.
+ * - v_pk_fmac_f16 supports DPP only before GFX11.
+ * - The explicitly listed fma_mix/dot2 opcodes support DPP on GFX11 and newer.
+ * - Every other opcode supports DPP unless "vop3p" is set.
+ */
+bool
+opcode_supports_dpp(amd_gfx_level gfx_level, aco_opcode opcode, bool vop3p)
+{
+   switch (opcode) {
+   /* reverse integer subtract and shift seem to apply dpp to src1 instead of src0 */
+   case aco_opcode::v_subrev_co_u32:
+   case aco_opcode::v_subrev_co_u32_e64:
+   case aco_opcode::v_subbrev_co_u32:
+   case aco_opcode::v_subrev_u16:
+   case aco_opcode::v_subrev_u32:
+   case aco_opcode::v_ashrrev_i32:
+   case aco_opcode::v_lshrrev_b32:
+   case aco_opcode::v_lshlrev_b32:
+   case aco_opcode::v_ashrrev_i16:
+   case aco_opcode::v_lshrrev_b16:
+   case aco_opcode::v_lshlrev_b16:
+   case aco_opcode::v_ashrrev_i16_e64:
+   case aco_opcode::v_lshrrev_b16_e64:
+   case aco_opcode::v_lshlrev_b16_e64: return false;
+   case aco_opcode::v_pk_fmac_f16: return gfx_level < GFX11;
+   /* there are more cases but those all take 64-bit inputs */
+   case aco_opcode::v_madmk_f32:
+   case aco_opcode::v_madak_f32:
+   case aco_opcode::v_madmk_f16:
+   case aco_opcode::v_madak_f16:
+   case aco_opcode::v_fmamk_f32:
+   case aco_opcode::v_fmaak_f32:
+   case aco_opcode::v_fmamk_f16:
+   case aco_opcode::v_fmaak_f16:
+   case aco_opcode::v_readfirstlane_b32:
+   case aco_opcode::v_cvt_f64_i32:
+   case aco_opcode::v_cvt_f64_f32:
+   case aco_opcode::v_cvt_f64_u32:
+   case aco_opcode::v_mul_lo_u32:
+   case aco_opcode::v_mul_lo_i32:
+   case aco_opcode::v_mul_hi_u32:
+   case aco_opcode::v_mul_hi_i32:
+   case aco_opcode::v_qsad_pk_u16_u8:
+   case aco_opcode::v_mqsad_pk_u16_u8:
+   case aco_opcode::v_mqsad_u32_u8:
+   case aco_opcode::v_mad_u64_u32:
+   case aco_opcode::v_mad_i64_i32:
+   case aco_opcode::v_permlane16_b32:
+   case aco_opcode::v_permlanex16_b32:
+   case aco_opcode::v_permlane64_b32:
+   case aco_opcode::v_readlane_b32_e64:
+   case aco_opcode::v_writelane_b32_e64: return false;
+   /* simpler than listing all VOP3P opcodes which do not support DPP */
+   case aco_opcode::v_fma_mix_f32:
+   case aco_opcode::v_fma_mixlo_f16:
+   case aco_opcode::v_fma_mixhi_f16:
+   case aco_opcode::p_v_fma_mixlo_f16_rtz:
+   case aco_opcode::p_v_fma_mixhi_f16_rtz:
+   case aco_opcode::v_dot2_f32_f16:
+   case aco_opcode::v_dot2_f32_bf16: return gfx_level >= GFX11;
+   default: return !vop3p;
+   }
+}
+
 bool
 can_use_DPP(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool dpp8)
 {
@ -433,41 +492,7 @@ can_use_DPP(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool dpp
   if (instr->writes_exec())
      return false;

-   /* simpler than listing all VOP3P opcodes which do not support DPP */
-   if (instr->isVOP3P()) {
-      return instr->opcode == aco_opcode::v_fma_mix_f32 ||
-             instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
-             instr->opcode == aco_opcode::v_fma_mixhi_f16 ||
-             instr->opcode == aco_opcode::p_v_fma_mixlo_f16_rtz ||
-             instr->opcode == aco_opcode::p_v_fma_mixhi_f16_rtz ||
-             instr->opcode == aco_opcode::v_dot2_f32_f16 ||
-             instr->opcode == aco_opcode::v_dot2_f32_bf16;
-   }
-
-   if (instr->opcode == aco_opcode::v_pk_fmac_f16)
-      return gfx_level < GFX11;
-
-   /* there are more cases but those all take 64-bit inputs */
-   return instr->opcode != aco_opcode::v_madmk_f32 && instr->opcode != aco_opcode::v_madak_f32 &&
-          instr->opcode != aco_opcode::v_madmk_f16 && instr->opcode != aco_opcode::v_madak_f16 &&
-          instr->opcode != aco_opcode::v_fmamk_f32 && instr->opcode != aco_opcode::v_fmaak_f32 &&
-          instr->opcode != aco_opcode::v_fmamk_f16 && instr->opcode != aco_opcode::v_fmaak_f16 &&
-          instr->opcode != aco_opcode::v_readfirstlane_b32 &&
-          instr->opcode != aco_opcode::v_cvt_f64_i32 &&
-          instr->opcode != aco_opcode::v_cvt_f64_f32 &&
-          instr->opcode != aco_opcode::v_cvt_f64_u32 && instr->opcode != aco_opcode::v_mul_lo_u32 &&
-          instr->opcode != aco_opcode::v_mul_lo_i32 && instr->opcode != aco_opcode::v_mul_hi_u32 &&
-          instr->opcode != aco_opcode::v_mul_hi_i32 &&
-          instr->opcode != aco_opcode::v_qsad_pk_u16_u8 &&
-          instr->opcode != aco_opcode::v_mqsad_pk_u16_u8 &&
-          instr->opcode != aco_opcode::v_mqsad_u32_u8 &&
-          instr->opcode != aco_opcode::v_mad_u64_u32 &&
-          instr->opcode != aco_opcode::v_mad_i64_i32 &&
-          instr->opcode != aco_opcode::v_permlane16_b32 &&
-          instr->opcode != aco_opcode::v_permlanex16_b32 &&
-          instr->opcode != aco_opcode::v_permlane64_b32 &&
-          instr->opcode != aco_opcode::v_readlane_b32_e64 &&
-          instr->opcode != aco_opcode::v_writelane_b32_e64;
+   return opcode_supports_dpp(gfx_level, instr->opcode, instr->isVOP3P());
 }

 aco_ptr<Instruction>
@ -889,7 +914,9 @@ needs_exec_mask(const Instruction* instr)
   if (instr->isSALU() || instr->isBranch() || instr->isSMEM() || instr->isBarrier())
      return instr->opcode == aco_opcode::s_cbranch_execz ||
             instr->opcode == aco_opcode::s_cbranch_execnz ||
-             instr->opcode == aco_opcode::s_setpc_b64 || instr->reads_exec();
+             instr->opcode == aco_opcode::s_setpc_b64 ||
+             instr->opcode == aco_opcode::s_swappc_b64 || instr->opcode == aco_opcode::s_call_b64 ||
+             instr->reads_exec();

   if (instr->isPseudo()) {
      switch (instr->opcode) {
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@ -2040,6 +2040,8 @@ bool can_use_opsel(amd_gfx_level gfx_level, aco_opcode op, int idx);
 bool instr_is_16bit(amd_gfx_level gfx_level, aco_opcode op);
 uint8_t get_gfx11_true16_mask(aco_opcode op);
 bool can_use_SDWA(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool pre_ra);
+/* opcode-level DPP check; can_use_DPP() passes instr->isVOP3P() as "vop3p" */
+bool opcode_supports_dpp(amd_gfx_level gfx_level, aco_opcode opcode, bool vop3p);
 bool can_use_DPP(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, bool dpp8);
 bool can_write_m0(const aco_ptr<Instruction>& instr);
 /* updates "instr" and returns the old instruction (or NULL if no update was needed) */
--- a/src/amd/compiler/aco_lower_branches.cpp
+++ b/src/amd/compiler/aco_lower_branches.cpp
@ -298,7 +298,9 @@ eliminate_useless_exec_writes_in_block(branch_ctx& ctx, Block& block)

      /* blocks_incoming_exec_used is initialized to true, so this is correct even for loops. */
      if (instr->opcode == aco_opcode::s_cbranch_scc0 ||
-          instr->opcode == aco_opcode::s_cbranch_scc1) {
+          instr->opcode == aco_opcode::s_cbranch_scc1 ||
+          instr->opcode == aco_opcode::s_cbranch_vccz ||
+          instr->opcode == aco_opcode::s_cbranch_vccnz) {
         exec_write_used |= ctx.blocks_incoming_exec_used[instr->salu().imm];
      }

--- a/src/amd/compiler/aco_nir_call_attribs.h
+++ b/src/amd/compiler/aco_nir_call_attribs.h
@ -22,7 +22,13 @@ enum aco_nir_function_attribs {
 };

 enum aco_nir_parameter_attribs {
-   /* Parameter value is not used by any callee and does not need to be preserved */
+   /* This parameter's value may not be preserved across a callee. Unlike return parameters, the
+    * parameter's value is undefined on return. Callers must back up values of discardable
+    * parameters separately.
+    * Mostly used for tail calls, where parameters to the tail callee have different values than
+    * for the caller. In that case, on function return, the parameters will have been overwritten
+    * with the tail callee parameter values.
+    */
   ACO_NIR_PARAM_ATTRIB_DISCARDABLE = 0x1,
 };

--- a/src/amd/compiler/aco_opt_value_numbering.cpp
+++ b/src/amd/compiler/aco_opt_value_numbering.cpp
@ -427,6 +427,21 @@ process_block(vn_ctx& ctx, Block& block)
   block.instructions = std::move(new_instructions);
 }

+/* Removes dead instructions from "block"; no value numbering is done here.
+ *
+ * Every instruction for which is_dead(ctx.uses, instr) returns true is dropped. The remaining
+ * instructions are moved, in their original order, into a new vector that then replaces
+ * block.instructions. value_numbering() calls this for blocks whose logical_idom is -1.
+ */
+void
+dce_instructions(vn_ctx& ctx, Block& block)
+{
+   std::vector<aco_ptr<Instruction>> new_instructions;
+   new_instructions.reserve(block.instructions.size());
+
+   for (aco_ptr<Instruction>& instr : block.instructions) {
+      if (is_dead(ctx.uses, instr.get()))
+         continue;
+      new_instructions.emplace_back(std::move(instr));
+   }
+
+   block.instructions = std::move(new_instructions);
+}
+
 void
 rename_phi_operands(Block& block, aco::unordered_map<uint32_t, Temp>& renames)
 {
@ -467,10 +482,12 @@ value_numbering(Program* program)
      if (block.logical_idom == (int)block.index)
         ctx.expr_values.clear();

-      if (block.logical_idom != -1)
+      if (block.logical_idom != -1) {
         process_block(ctx, block);
-      else
+      } else {
+         dce_instructions(ctx, block);
         rename_phi_operands(block, ctx.renames);
+      }

      /* increment exec_id when entering nested control flow */
      if (block.kind & block_kind_branch || block.kind & block_kind_loop_preheader ||
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@ -1190,7 +1190,7 @@ alu_opt_gather_info(opt_ctx& ctx, Instruction* instr, alu_opt_info& info)
      info.operands.push_back({instr->operands[0]});
      if (instr->definitions[0].regClass() == s1) {
         info.defs.push_back(instr->definitions[1]);
-         info.opcode = aco_opcode::v_lshl_b32;
+         info.opcode = aco_opcode::s_lshl_b32;
         info.format = Format::SOP2;
         std::swap(info.operands[0], info.operands[1]);
      } else {
--- a/src/amd/compiler/aco_optimizer_postRA.cpp
+++ b/src/amd/compiler/aco_optimizer_postRA.cpp
@ -142,6 +142,10 @@ save_reg_writes(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
      ctx.instr_idx_by_regs[ctx.current_block->index][instr->pseudo().scratch_sgpr] =
         overwritten_unknown_instr;
   }
+   if (instr->isCall()) {
+      std::fill(ctx.instr_idx_by_regs[ctx.current_block->index].begin(),
+                ctx.instr_idx_by_regs[ctx.current_block->index].end(), overwritten_unknown_instr);
+   }
 }

 Idx
@ -862,6 +866,8 @@ instr_overwrites(Instruction* instr, PhysReg reg, unsigned size)
      if (scratch_reg >= reg && reg + size > scratch_reg)
         return true;
   }
+   if (instr->isCall())
+      return true;
   return false;
 }

--- a/src/amd/compiler/instruction_selection/aco_isel_helpers.cpp
+++ b/src/amd/compiler/instruction_selection/aco_isel_helpers.cpp
@ -672,7 +672,7 @@ build_end_with_regs(isel_context* ctx, std::vector<Operand>& regs)
 Instruction*
 add_startpgm(struct isel_context* ctx, bool is_callee)
 {
-   ctx->program->scratch_arg_size += ctx->callee_info.scratch_param_size;
+   ctx->program->scratch_arg_size += ctx->callee_info.scratch_param_size * ctx->program->wave_size;

   unsigned def_count = 0;
   for (unsigned i = 0; i < ctx->args->arg_count; i++) {
@ -1034,8 +1034,7 @@ find_param_regs(Program* program, const ABI& abi, callee_info& info,

            param_demand += Temp(0, it2->rc);

-            it2->dst_info->needs_explicit_preservation =
-               regs == clobbered_regs && !it2->dst_info->discardable;
+            it2->dst_info->needs_explicit_preservation = regs == clobbered_regs;
            it2->dst_info->def.setPrecolored(*next_reg);
            for (unsigned i = 0; i < it2->rc.size(); ++i)
               BITSET_CLEAR(regs, next_reg->reg() + i);
@ -1051,8 +1050,7 @@ find_param_regs(Program* program, const ABI& abi, callee_info& info,
            next_reg = next_reg->advance(required_padding * 4);
      }
      if (next_reg) {
-         params.back().dst_info->needs_explicit_preservation =
-            regs == clobbered_regs && !params.back().dst_info->discardable;
+         params.back().dst_info->needs_explicit_preservation = regs == clobbered_regs;
         param_demand += Temp(0, params.back().rc);
         params.back().dst_info->def.setPrecolored(*next_reg);
         BITSET_CLEAR_COUNT(regs, next_reg->reg(), params.back().rc.size());
--- a/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp
+++ b/src/amd/compiler/instruction_selection/aco_select_nir_intrinsics.cpp
@ -3392,7 +3392,10 @@ visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
      offset = as_vgpr(ctx, offset);
      for (unsigned i = 0; i < write_count; i++) {
         aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
-         Instruction* mubuf = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offsets.back(),
+         Operand soffset = Operand::c32(0);
+         if (!ctx->program->scratch_offsets.empty())
+            soffset = Operand(ctx->program->scratch_offsets.back());
+         Instruction* mubuf = bld.mubuf(op, rsrc, offset, soffset,
                                        write_datas[i], offsets[i], true);
         mubuf->mubuf().sync = memory_sync_info(storage_scratch, semantic_private);
         enum ac_access_type type =
--- a/src/amd/vulkan/bvh/encode.comp
+++ b/src/amd/vulkan/bvh/encode.comp
@ -145,7 +145,7 @@ main()
                                             ir_id_to_offset(children[i]))).aabb;

            float surface_area = aabb_surface_area(bounds);
-            if (surface_area > largest_surface_area) {
+            if (surface_area > largest_surface_area || collapsed_child_index == -1) {
               largest_surface_area = surface_area;
               collapsed_child_index = i;
            }
--- a/src/amd/vulkan/layers/radv_sqtt_layer.c
+++ b/src/amd/vulkan/layers/radv_sqtt_layer.c
@ -778,8 +778,11 @@ sqtt_QueueSubmit2(VkQueue _queue, uint32_t submitCount, const VkSubmitInfo2 *pSu
   if (queue->sqtt_present)
      return radv_sqtt_wsi_submit(_queue, submitCount, pSubmits, _fence);

-   if (instance->vk.trace_per_submit)
+   if (instance->vk.trace_per_submit) {
+      /* Make sure to lock in case of multithreaded submissions. */
+      simple_mtx_lock(&device->sqtt.lock);
      radv_sqtt_start_capturing(queue);
+   }

   for (uint32_t i = 0; i < submitCount; i++) {
      const VkSubmitInfo2 *pSubmit = &pSubmits[i];
@ -863,12 +866,17 @@ sqtt_QueueSubmit2(VkQueue _queue, uint32_t submitCount, const VkSubmitInfo2 *pSu
                 "radv: Failed to capture RGP for this submit because the buffer is too small and auto-resizing "
                 "is disabled. See RADV_THREAD_TRACE_BUFFER_SIZE for increasing the size.\n");
      }
+      simple_mtx_unlock(&device->sqtt.lock);
   }

   return result;

 fail:
   FREE(new_cmdbufs);
+
+   if (instance->vk.trace_per_submit) {
+      simple_mtx_unlock(&device->sqtt.lock);
+   }
   return result;
 }

--- a/src/amd/vulkan/layers/radv_strange_brigade.c
+++ b/src/amd/vulkan/layers/radv_strange_brigade.c
@ -0,0 +1,31 @@
+/*
+ * Copyright © 2026 Valve Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "radv_cmd_buffer.h"
+#include "radv_device.h"
+#include "radv_entrypoints.h"
+
+/* Strange Brigade workaround for vkCmdPipelineBarrier2.
+ *
+ * Scans pDependencyInfo->pImageMemoryBarriers for the first barrier that transitions to
+ * VK_IMAGE_LAYOUT_PRESENT_SRC_KHR with srcAccessMask equal to COLOR_ATTACHMENT_READ, rewrites
+ * that mask to COLOR_ATTACHMENT_WRITE and then forwards the call through
+ * device->layer_dispatch.app.
+ */
+VKAPI_ATTR void VKAPI_CALL
+strange_brigade_CmdPipelineBarrier2(VkCommandBuffer commandBuffer, const VkDependencyInfo *pDependencyInfo)
+{
+   VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
+
+   for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++) {
+      /* The barrier is patched in place, which is why constness is cast away here. */
+      VkImageMemoryBarrier2 *barrier = (VkImageMemoryBarrier2 *)&pDependencyInfo->pImageMemoryBarriers[i];
+
+      /* srcAccessMask is a VkAccessFlags2, so use the synchronization2 (VK_ACCESS_2_*) bits
+       * instead of the legacy VkAccessFlagBits ones; the bit values are the same.
+       */
+      if (barrier->newLayout == VK_IMAGE_LAYOUT_PRESENT_SRC_KHR &&
+          barrier->srcAccessMask == VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT) {
+         /* This game has a broken barrier right before present that causes rendering issues. Fix it
+          * by modifying the src access mask.
+          */
+         barrier->srcAccessMask = VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT;
+         /* Only the first matching barrier is patched. */
+         break;
+      }
+   }
+
+   device->layer_dispatch.app.CmdPipelineBarrier2(commandBuffer, pDependencyInfo);
+}
--- a/src/amd/vulkan/meson.build
+++ b/src/amd/vulkan/meson.build
@ -22,6 +22,7 @@ radv_entrypoints_gen_command += [
  '--device-prefix', 'rage2',
  '--device-prefix', 'quantic_dream',
  '--device-prefix', 'no_mans_sky',
+  '--device-prefix', 'strange_brigade',

  # Command buffer annotation layer entrypoints
  '--device-prefix', 'annotate',
@ -42,6 +43,7 @@ libradv_files = files(
  'layers/radv_rage2.c',
  'layers/radv_quantic_dream.c',
  'layers/radv_no_mans_sky.c',
+  'layers/radv_strange_brigade.c',
  'layers/radv_rmv_layer.c',
  'layers/radv_rra_layer.c',
  'layers/radv_sqtt_layer.c',
--- a/src/amd/vulkan/meta/radv_meta.h
+++ b/src/amd/vulkan/meta/radv_meta.h
@ -97,6 +97,7 @@ enum radv_meta_object_key_type {
   RADV_META_OBJECT_KEY_CLEAR_HIZ,
   RADV_META_OBJECT_KEY_FAST_CLEAR_ELIMINATE,
   RADV_META_OBJECT_KEY_DCC_DECOMPRESS,
+   RADV_META_OBJECT_KEY_DCC_DECOMPRESS_CS,
   RADV_META_OBJECT_KEY_DCC_RETILE,
   RADV_META_OBJECT_KEY_HTILE_EXPAND_GFX,
   RADV_META_OBJECT_KEY_HTILE_EXPAND_CS,
--- a/src/amd/vulkan/meta/radv_meta_clear.c
+++ b/src/amd/vulkan/meta/radv_meta_clear.c
@ -1475,7 +1475,8 @@ radv_can_fast_clear_color(struct radv_cmd_buffer *cmd_buffer, const struct radv_
 static void
 radv_fast_clear_color(struct radv_cmd_buffer *cmd_buffer, const struct radv_image_view *iview,
                      const VkClearAttachment *clear_att, const VkClearRect *clear_rect,
-                      enum radv_cmd_flush_bits *pre_flush, enum radv_cmd_flush_bits *post_flush)
+                      enum radv_cmd_flush_bits *pre_flush, enum radv_cmd_flush_bits *post_flush,
+                      uint32_t view_mask)
 {
   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
   const struct radv_physical_device *pdev = radv_device_physical(device);
@ -1488,7 +1489,8 @@ radv_fast_clear_color(struct radv_cmd_buffer *cmd_buffer, const struct radv_imag
      .baseMipLevel = iview->vk.base_mip_level,
      .levelCount = iview->vk.level_count,
      .baseArrayLayer = iview->vk.base_array_layer + clear_rect->baseArrayLayer,
-      .layerCount = clear_rect->layerCount,
+      /* radv_can_fast_clear_color blocks multiview fast clears unless the viewmask contains all layers */
+      .layerCount = view_mask ? iview->vk.layer_count : clear_rect->layerCount,
   };

   if (pre_flush) {
@ -1575,7 +1577,7 @@ emit_clear(struct radv_cmd_buffer *cmd_buffer, const VkClearAttachment *clear_at

      if (radv_can_fast_clear_color(cmd_buffer, color_att->iview, color_att->layout, clear_rect, clear_value,
                                    view_mask)) {
-         radv_fast_clear_color(cmd_buffer, color_att->iview, clear_att, clear_rect, pre_flush, post_flush);
+         radv_fast_clear_color(cmd_buffer, color_att->iview, clear_att, clear_rect, pre_flush, post_flush, view_mask);
      } else {
         emit_color_clear(cmd_buffer, clear_att, clear_rect, view_mask);
      }
@ -1877,7 +1879,7 @@ radv_fast_clear_range(struct radv_cmd_buffer *cmd_buffer, struct radv_image *ima

   if (vk_format_is_color(format)) {
      if (radv_can_fast_clear_color(cmd_buffer, &iview, image_layout, &clear_rect, clear_att.clearValue.color, 0)) {
-         radv_fast_clear_color(cmd_buffer, &iview, &clear_att, &clear_rect, NULL, NULL);
+         radv_fast_clear_color(cmd_buffer, &iview, &clear_att, &clear_rect, NULL, NULL, 0);
         fast_cleared = true;
      }
   } else {
--- a/src/amd/vulkan/meta/radv_meta_copy.c
+++ b/src/amd/vulkan/meta/radv_meta_copy.c
@ -144,6 +144,40 @@ gfx_or_compute_copy_memory_to_image(struct radv_cmd_buffer *cmd_buffer, uint64_t
                  (use_compute ? RADV_META_SAVE_COMPUTE_PIPELINE : RADV_META_SAVE_GRAPHICS_PIPELINE) |
                     RADV_META_SAVE_CONSTANTS | RADV_META_SAVE_DESCRIPTORS);

+   if (use_compute) {
+      /* For partial copies, HTILE is decompressed before because image stores don't write the
+       * uncompressed DWORD to HTILE. And then it's needed to re-initialize HTILE to its
+       * uncompressed state after the copy.
+       */
+      const bool is_partial_copy = region->imageOffset.x || region->imageOffset.y || region->imageOffset.z ||
+                                   region->imageExtent.width != image->vk.extent.width ||
+                                   region->imageExtent.height != image->vk.extent.height ||
+                                   region->imageExtent.depth != image->vk.extent.depth;
+
+      uint32_t queue_mask = radv_image_queue_family_mask(image, cmd_buffer->qf, cmd_buffer->qf);
+
+      if (radv_layout_is_htile_compressed(device, image, region->imageSubresource.mipLevel, layout, queue_mask) &&
+          is_partial_copy) {
+         radv_describe_barrier_start(cmd_buffer, RGP_BARRIER_UNKNOWN_REASON);
+
+         u_foreach_bit (i, region->imageSubresource.aspectMask) {
+            unsigned aspect_mask = 1u << i;
+            radv_expand_depth_stencil(
+               cmd_buffer, image,
+               &(VkImageSubresourceRange){
+                  .aspectMask = aspect_mask,
+                  .baseMipLevel = region->imageSubresource.mipLevel,
+                  .levelCount = 1,
+                  .baseArrayLayer = region->imageSubresource.baseArrayLayer,
+                  .layerCount = vk_image_subresource_layer_count(&image->vk, &region->imageSubresource),
+               },
+               NULL);
+         }
+
+         radv_describe_barrier_end(cmd_buffer);
+      }
+   }
+
   /**
    * From the Vulkan 1.0.6 spec: 18.3 Copying Data Between Images
    *    extent is the size in texels of the source image to copy in width,
@ -222,6 +256,27 @@ gfx_or_compute_copy_memory_to_image(struct radv_cmd_buffer *cmd_buffer, uint64_t
         slice_array++;
   }

+   if (use_compute) {
+      /* Fixup HTILE after a copy on compute. */
+      uint32_t queue_mask = radv_image_queue_family_mask(image, cmd_buffer->qf, cmd_buffer->qf);
+
+      if (radv_layout_is_htile_compressed(device, image, region->imageSubresource.mipLevel, layout, queue_mask)) {
+         cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_VCACHE;
+
+         VkImageSubresourceRange range = {
+            .aspectMask = region->imageSubresource.aspectMask,
+            .baseMipLevel = region->imageSubresource.mipLevel,
+            .levelCount = 1,
+            .baseArrayLayer = region->imageSubresource.baseArrayLayer,
+            .layerCount = vk_image_subresource_layer_count(&image->vk, &region->imageSubresource),
+         };
+
+         uint32_t htile_value = radv_get_htile_initial_value(device, image);
+
+         cmd_buffer->state.flush_bits |= radv_clear_htile(cmd_buffer, image, &range, htile_value, false);
+      }
+   }
+
   radv_meta_restore(&saved_state, cmd_buffer);
 }

--- a/src/amd/vulkan/meta/radv_meta_fast_clear.c
+++ b/src/amd/vulkan/meta/radv_meta_fast_clear.c
@ -8,6 +8,7 @@
 #include <stdbool.h>

 #include "nir/radv_meta_nir.h"
+#include "radv_cs.h"
 #include "radv_meta.h"

 enum radv_color_op {
@ -19,7 +20,7 @@ enum radv_color_op {
 static VkResult
 get_dcc_decompress_compute_pipeline(struct radv_device *device, VkPipeline *pipeline_out, VkPipelineLayout *layout_out)
 {
-   enum radv_meta_object_key_type key = RADV_META_OBJECT_KEY_DCC_DECOMPRESS;
+   enum radv_meta_object_key_type key = RADV_META_OBJECT_KEY_DCC_DECOMPRESS_CS;
   VkResult result;

   const VkDescriptorSetLayoutBinding bindings[] = {
@ -241,6 +242,7 @@ radv_process_color_image_layer(struct radv_cmd_buffer *cmd_buffer, struct radv_i
                               const VkImageSubresourceRange *range, int level, int layer, enum radv_color_op op)
 {
   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
+   const struct radv_physical_device *pdev = radv_device_physical(device);
   struct radv_image_view iview;
   uint32_t width, height;

@ -303,9 +305,23 @@ radv_process_color_image_layer(struct radv_cmd_buffer *cmd_buffer, struct radv_i

   radv_CmdDraw(radv_cmd_buffer_to_handle(cmd_buffer), 3, 1, 0, 0);

-   if (op == FMASK_DECOMPRESS || op == DCC_DECOMPRESS)
+   if (op == FMASK_DECOMPRESS || op == DCC_DECOMPRESS) {
+      /* On GFX6-8, the CB FMASK cache writes corrupted data if cache lines are flushed after their
+       * context has been retired. To avoid this, we must flush the CB metadata caches immediately
+       * after every FMASK decompress.
+       *
+       * PAL only applies this workaround on GFX6 but GFX7-8 are also affected and that matches
+       * RadeonSI.
+       */
+      if (pdev->info.gfx_level <= GFX8 && op == FMASK_DECOMPRESS) {
+         radeon_begin(cmd_buffer->cs);
+         radeon_event_write(V_028A90_FLUSH_AND_INV_CB_META);
+         radeon_end();
+      }
+
      cmd_buffer->state.flush_bits |= radv_src_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
                                                            VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT, 0, image, range);
+   }

   const VkRenderingEndInfoKHR end_info = {
      .sType = VK_STRUCTURE_TYPE_RENDERING_END_INFO_KHR,
--- a/src/amd/vulkan/meta/radv_meta_resolve_cs.c
+++ b/src/amd/vulkan/meta/radv_meta_resolve_cs.c
@ -467,7 +467,9 @@ radv_meta_resolve_depth_stencil_cs(struct radv_cmd_buffer *cmd_buffer, struct ra

   radv_CmdBindPipeline(radv_cmd_buffer_to_handle(cmd_buffer), VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);

-   const uint32_t push_constants[2] = {region->srcOffset.x, region->srcOffset.y};
+   const uint32_t push_constants[5] = {
+      region->srcOffset.x, region->srcOffset.y, region->dstOffset.x, region->dstOffset.y, region->dstOffset.z,
+   };

   const VkPushConstantsInfoKHR pc_info = {
      .sType = VK_STRUCTURE_TYPE_PUSH_CONSTANTS_INFO_KHR,
--- a/src/amd/vulkan/meta/radv_meta_resolve_fs.c
+++ b/src/amd/vulkan/meta/radv_meta_resolve_fs.c
@ -669,8 +669,8 @@ radv_meta_resolve_depth_stencil_fs(struct radv_cmd_buffer *cmd_buffer, struct ra

   radv_CmdSetViewport(radv_cmd_buffer_to_handle(cmd_buffer), 0, 1,
                       &(VkViewport){
-                          .x = region->srcOffset.x,
-                          .y = region->srcOffset.y,
+                          .x = region->dstOffset.x,
+                          .y = region->dstOffset.y,
                          .width = region->extent.width,
                          .height = region->extent.height,
                          .minDepth = 0.0f,
@ -679,6 +679,22 @@ radv_meta_resolve_depth_stencil_fs(struct radv_cmd_buffer *cmd_buffer, struct ra

   radv_CmdSetScissor(radv_cmd_buffer_to_handle(cmd_buffer), 0, 1, &resolve_area);

+   const uint32_t push_constants[2] = {
+      region->srcOffset.x - region->dstOffset.x,
+      region->srcOffset.y - region->dstOffset.y,
+   };
+
+   const VkPushConstantsInfoKHR push_constants_info = {
+      .sType = VK_STRUCTURE_TYPE_PUSH_CONSTANTS_INFO,
+      .layout = layout,
+      .stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT,
+      .offset = 0,
+      .size = sizeof(push_constants),
+      .pValues = push_constants,
+   };
+
+   radv_CmdPushConstants2(radv_cmd_buffer_to_handle(cmd_buffer), &push_constants_info);
+
   radv_CmdDraw(radv_cmd_buffer_to_handle(cmd_buffer), 3, 1, 0, 0);

   const VkRenderingEndInfoKHR end_info = {
--- a/src/amd/vulkan/nir/radv_meta_nir.c
+++ b/src/amd/vulkan/nir/radv_meta_nir.c
@ -1395,19 +1395,21 @@ radv_meta_nir_build_depth_stencil_resolve_compute_shader(struct radv_device *dev

   nir_def *global_id = radv_meta_nir_get_global_ids(&b, 3);

-   nir_def *offset = nir_load_push_constant(&b, 2, 32, nir_imm_int(&b, 0), .range = 8);
+   nir_def *src_offset = nir_load_push_constant(&b, 2, 32, nir_imm_int(&b, 0), .range = 8);
+   nir_def *dst_offset = nir_load_push_constant(&b, 3, 32, nir_imm_int(&b, 8), .range = 20);

-   nir_def *resolve_coord = nir_iadd(&b, nir_trim_vector(&b, global_id, 2), offset);
+   nir_def *src_coord = nir_iadd(&b, nir_trim_vector(&b, global_id, 2), src_offset);
+   nir_def *dst_coord = nir_iadd(&b, global_id, dst_offset);

-   nir_def *img_coord =
-      nir_vec3(&b, nir_channel(&b, resolve_coord, 0), nir_channel(&b, resolve_coord, 1), nir_channel(&b, global_id, 2));
+   nir_def *src_img_coord =
+      nir_vec3(&b, nir_channel(&b, src_coord, 0), nir_channel(&b, src_coord, 1), nir_channel(&b, global_id, 2));

   nir_deref_instr *input_img_deref = nir_build_deref_var(&b, input_img);
-   nir_def *outval = nir_txf_ms(&b, img_coord, nir_imm_int(&b, 0), .texture_deref = input_img_deref);
+   nir_def *outval = nir_txf_ms(&b, src_img_coord, nir_imm_int(&b, 0), .texture_deref = input_img_deref);

   if (resolve_mode != VK_RESOLVE_MODE_SAMPLE_ZERO_BIT) {
      for (int i = 1; i < samples; i++) {
-         nir_def *si = nir_txf_ms(&b, img_coord, nir_imm_int(&b, i), .texture_deref = input_img_deref);
+         nir_def *si = nir_txf_ms(&b, src_img_coord, nir_imm_int(&b, i), .texture_deref = input_img_deref);

         switch (resolve_mode) {
         case VK_RESOLVE_MODE_AVERAGE_BIT:
@ -1435,8 +1437,8 @@ radv_meta_nir_build_depth_stencil_resolve_compute_shader(struct radv_device *dev
         outval = nir_fdiv_imm(&b, outval, samples);
   }

-   nir_def *coord = nir_vec4(&b, nir_channel(&b, img_coord, 0), nir_channel(&b, img_coord, 1),
-                             nir_channel(&b, img_coord, 2), nir_undef(&b, 1, 32));
+   nir_def *coord = nir_vec4(&b, nir_channel(&b, dst_coord, 0), nir_channel(&b, dst_coord, 1),
+                             nir_channel(&b, dst_coord, 2), nir_undef(&b, 1, 32));
   nir_image_deref_store(&b, &nir_build_deref_var(&b, output_img)->def, coord, nir_undef(&b, 1, 32), outval,
                         nir_imm_int(&b, 0), .image_dim = GLSL_SAMPLER_DIM_2D, .image_array = true);
   return b.shader;
@ -1495,10 +1497,11 @@ radv_meta_nir_build_depth_stencil_resolve_fragment_shader(struct radv_device *de
   fs_out->data.location = index == RADV_META_DEPTH_RESOLVE ? FRAG_RESULT_DEPTH : FRAG_RESULT_STENCIL;

   nir_def *pos_in = nir_trim_vector(&b, nir_load_frag_coord(&b), 2);
+   nir_def *src_offset = nir_load_push_constant(&b, 2, 32, nir_imm_int(&b, 0), .range = 8);

   nir_def *pos_int = nir_f2i32(&b, pos_in);

-   nir_def *img_coord = nir_trim_vector(&b, pos_int, 2);
+   nir_def *img_coord = nir_trim_vector(&b, nir_iadd(&b, pos_int, src_offset), 2);

   nir_deref_instr *input_img_deref = nir_build_deref_var(&b, input_img);
   nir_def *outval = nir_txf_ms(&b, img_coord, nir_imm_int(&b, 0), .texture_deref = input_img_deref);
--- a/src/amd/vulkan/nir/radv_nir_lower_call_abi.c
+++ b/src/amd/vulkan/nir/radv_nir_lower_call_abi.c
@ -114,11 +114,32 @@ gather_tail_call_instrs_block(nir_function *caller, const struct nir_block *bloc
      if (call->callee->num_params != caller->num_params)
         return;

-      for (unsigned i = 0; i < call->num_params; ++i) {
+      for (unsigned i = 0; i < call->callee->num_params; ++i) {
         if (call->callee->params[i].is_return != caller->params[i].is_return)
            return;
+         if ((call->callee->params[i].driver_attributes & ACO_NIR_PARAM_ATTRIB_DISCARDABLE) &&
+             !(caller->params[i].driver_attributes & ACO_NIR_PARAM_ATTRIB_DISCARDABLE))
+            return;
+         bool has_preserved_regs =
+            (caller->driver_attributes & ACO_NIR_FUNCTION_ATTRIB_ABI_MASK) == ACO_NIR_CALL_ABI_AHIT_ISEC;
+         if (has_preserved_regs && ((call->callee->params[i].driver_attributes & ACO_NIR_PARAM_ATTRIB_DISCARDABLE) !=
+                                    (caller->params[i].driver_attributes & ACO_NIR_PARAM_ATTRIB_DISCARDABLE)))
+            return;
+         if (call->callee->params[i].is_uniform != caller->params[i].is_uniform)
+            return;
+         if (call->callee->params[i].bit_size != caller->params[i].bit_size)
+            return;
+         if (call->callee->params[i].num_components != caller->params[i].num_components)
+            return;
+      }
+
+      /* The call instruction itself has not been lowered to the new signature yet, so do this in a separate loop and
+       * adjust parameter indices for the caller.
+       */
+      for (unsigned i = 0; i < call->num_params; ++i) {
+         unsigned caller_param_idx = i + ACO_NIR_CALL_SYSTEM_ARG_COUNT;
         /* We can only do tail calls if the caller returns exactly the callee return values */
-         if (caller->params[i].is_return) {
+         if (caller->params[caller_param_idx].is_return) {
            assert(nir_def_as_deref_or_null(call->params[i].ssa));
            nir_deref_instr *deref_root = nir_def_as_deref(call->params[i].ssa);
            while (nir_deref_instr_parent(deref_root))
@ -129,16 +150,18 @@ gather_tail_call_instrs_block(nir_function *caller, const struct nir_block *bloc
            nir_intrinsic_instr *intrin = nir_def_as_intrinsic_or_null(deref_root->parent.ssa);
            if (!intrin || intrin->intrinsic != nir_intrinsic_load_param)
               return;
-            /* The call parameters aren't lowered at this point, we need to add the call arg count here */
-            if (nir_intrinsic_param_idx(intrin) != i + ACO_NIR_CALL_SYSTEM_ARG_COUNT)
+            if (nir_intrinsic_param_idx(intrin) != caller_param_idx)
+               return;
+         } else if (!(caller->params[caller_param_idx].driver_attributes & ACO_NIR_PARAM_ATTRIB_DISCARDABLE)) {
+            /* If the parameter is not marked as discardable, then we have to preserve the caller's value. Passing
+             * a modified value to a tail call leaves us unable to restore the original value, so bail out if we have
+             * modified parameters.
+             */
+            nir_intrinsic_instr *intrin = nir_def_as_intrinsic_or_null(call->params[i].ssa);
+            if (!intrin || intrin->intrinsic != nir_intrinsic_load_param ||
+                nir_intrinsic_param_idx(intrin) != caller_param_idx)
               return;
         }
-         if (call->callee->params[i].is_uniform != caller->params[i].is_uniform)
-            return;
-         if (call->callee->params[i].bit_size != caller->params[i].bit_size)
-            return;
-         if (call->callee->params[i].num_components != caller->params[i].num_components)
-            return;
      }

      _mesa_set_add(tail_calls, instr);
--- a/src/amd/vulkan/nir/radv_nir_lower_ray_queries.c
+++ b/src/amd/vulkan/nir/radv_nir_lower_ray_queries.c
@ -144,6 +144,7 @@ radv_get_ray_query_type()
 struct ray_query_vars {
   nir_variable *var;

+   bool use_bvh_stack_rtn;
   bool shared_stack;
   uint32_t shared_base;
   uint32_t stack_entries;
@ -162,13 +163,21 @@ init_ray_query_vars(nir_shader *shader, const glsl_type *opaque_type, struct ray
   uint32_t shared_stack_entries = shader->info.ray_queries == 1 ? 16 : 8;
   /* ds_bvh_stack* instructions use a fixed stride of 32 dwords. */
   if (radv_use_bvh_stack_rtn(pdev))
-      workgroup_size = MAX2(workgroup_size, 32);
+      workgroup_size = align(workgroup_size, 32);
   uint32_t shared_stack_size = workgroup_size * shared_stack_entries * 4;
   uint32_t shared_offset = align(shader->info.shared_size, 4);
+
   if (shader->info.stage != MESA_SHADER_COMPUTE || glsl_type_is_array(opaque_type) ||
       shared_offset + shared_stack_size > pdev->max_shared_size) {
      dst->stack_entries = MAX_SCRATCH_STACK_ENTRY_COUNT;
   } else {
+      if (radv_use_bvh_stack_rtn(pdev)) {
+         /* The hardware ds_bvh_stack_rtn address can only encode a stack base up to 8191 dwords. */
+         uint32_t num_wave32_groups = workgroup_size / 32;
+         uint32_t max_group_stack_base = (num_wave32_groups - 1) * 32 * shared_stack_entries;
+         uint32_t max_stack_base = (shared_offset / 4) + max_group_stack_base;
+         dst->use_bvh_stack_rtn = max_stack_base < 8192;
+      }
      dst->shared_stack = true;
      dst->shared_base = shared_offset;
      dst->stack_entries = shared_stack_entries;
@ -303,7 +312,7 @@ lower_rq_initialize(nir_builder *b, nir_intrinsic_instr *instr, struct ray_query

   if (vars->shared_stack) {
      nir_def *stack_idx = nir_load_local_invocation_index(b);
-      if (radv_use_bvh_stack_rtn(pdev)) {
+      if (vars->use_bvh_stack_rtn) {
         uint32_t workgroup_size =
            b->shader->info.workgroup_size[0] * b->shader->info.workgroup_size[1] * b->shader->info.workgroup_size[2];
         nir_def *addr =
@ -563,7 +572,7 @@ lower_rq_proceed(nir_builder *b, nir_intrinsic_instr *instr, struct ray_query_va
   };

   if (vars->shared_stack) {
-      args.use_bvh_stack_rtn = radv_use_bvh_stack_rtn(pdev);
+      args.use_bvh_stack_rtn = vars->use_bvh_stack_rtn;
      if (args.use_bvh_stack_rtn) {
         args.stack_stride = 1;
         args.stack_base = 0;
--- a/src/amd/vulkan/nir/radv_nir_rt_stage_functions.c
+++ b/src/amd/vulkan/nir/radv_nir_rt_stage_functions.c
@ -39,7 +39,7 @@ radv_nir_init_traversal_params(nir_function *function, unsigned payload_size)
   function->params = rzalloc_array_size(function->shader, sizeof(nir_parameter), function->num_params);
   radv_nir_init_common_rt_params(function);
   radv_nir_param_from_type(function->params + TRAVERSAL_ARG_TRAVERSAL_ADDR, glsl_uint64_t_type(), true, 0);
-   radv_nir_param_from_type(function->params + TRAVERSAL_ARG_SHADER_RECORD_PTR, glsl_uint64_t_type(), false, 0);
+   radv_nir_param_from_type(function->params + TRAVERSAL_ARG_SHADER_RECORD_PTR, glsl_uint64_t_type(), false, ACO_NIR_PARAM_ATTRIB_DISCARDABLE);
   radv_nir_param_from_type(function->params + TRAVERSAL_ARG_ACCEL_STRUCT, glsl_uint64_t_type(), false, 0);
   radv_nir_param_from_type(function->params + TRAVERSAL_ARG_CULL_MASK_AND_FLAGS, glsl_uint_type(), false, 0);
   radv_nir_param_from_type(function->params + TRAVERSAL_ARG_SBT_OFFSET, glsl_uint_type(), false, 0);
@ -49,12 +49,13 @@ radv_nir_init_traversal_params(nir_function *function, unsigned payload_size)
   radv_nir_param_from_type(function->params + TRAVERSAL_ARG_RAY_TMIN, glsl_float_type(), false, 0);
   radv_nir_param_from_type(function->params + TRAVERSAL_ARG_RAY_DIRECTION, glsl_vector_type(GLSL_TYPE_UINT, 3), false,
                            0);
-   radv_nir_param_from_type(function->params + TRAVERSAL_ARG_RAY_TMAX, glsl_float_type(), false, 0);
-   radv_nir_param_from_type(function->params + TRAVERSAL_ARG_PRIMITIVE_ADDR, glsl_uint64_t_type(), false, 0);
-   radv_nir_param_from_type(function->params + TRAVERSAL_ARG_PRIMITIVE_ID, glsl_uint_type(), false, 0);
-   radv_nir_param_from_type(function->params + TRAVERSAL_ARG_INSTANCE_ADDR, glsl_uint64_t_type(), false, 0);
-   radv_nir_param_from_type(function->params + TRAVERSAL_ARG_GEOMETRY_ID_AND_FLAGS, glsl_uint_type(), false, 0);
-   radv_nir_param_from_type(function->params + TRAVERSAL_ARG_HIT_KIND, glsl_uint_type(), false, 0);
+   radv_nir_param_from_type(function->params + TRAVERSAL_ARG_RAY_TMAX, glsl_float_type(), false,
+                            ACO_NIR_PARAM_ATTRIB_DISCARDABLE);
+   radv_nir_param_from_type(function->params + TRAVERSAL_ARG_PRIMITIVE_ADDR, glsl_uint64_t_type(), false, ACO_NIR_PARAM_ATTRIB_DISCARDABLE);
+   radv_nir_param_from_type(function->params + TRAVERSAL_ARG_PRIMITIVE_ID, glsl_uint_type(), false, ACO_NIR_PARAM_ATTRIB_DISCARDABLE);
+   radv_nir_param_from_type(function->params + TRAVERSAL_ARG_INSTANCE_ADDR, glsl_uint64_t_type(), false, ACO_NIR_PARAM_ATTRIB_DISCARDABLE);
+   radv_nir_param_from_type(function->params + TRAVERSAL_ARG_GEOMETRY_ID_AND_FLAGS, glsl_uint_type(), false,  ACO_NIR_PARAM_ATTRIB_DISCARDABLE);
+   radv_nir_param_from_type(function->params + TRAVERSAL_ARG_HIT_KIND, glsl_uint_type(), false, ACO_NIR_PARAM_ATTRIB_DISCARDABLE);
   for (unsigned i = 0; i < DIV_ROUND_UP(payload_size, 4); ++i) {
      radv_nir_return_param_from_type(function->params + TRAVERSAL_ARG_PAYLOAD_BASE + i, glsl_uint_type(), false, 0);
   }
@ -128,15 +129,11 @@ radv_nir_init_rt_function_params(nir_function *function, mesa_shader_stage stage
      radv_nir_init_common_rt_params(function);
      radv_nir_param_from_type(function->params + CHIT_MISS_ARG_TRAVERSAL_ADDR, glsl_uint64_t_type(), true, 0);
      radv_nir_param_from_type(function->params + CHIT_MISS_ARG_SHADER_RECORD_PTR, glsl_uint64_t_type(), false, 0);
-      radv_nir_param_from_type(function->params + CHIT_MISS_ARG_ACCEL_STRUCT, glsl_uint64_t_type(), false,
-                               ACO_NIR_PARAM_ATTRIB_DISCARDABLE);
+      radv_nir_param_from_type(function->params + CHIT_MISS_ARG_ACCEL_STRUCT, glsl_uint64_t_type(), false, 0);
      radv_nir_param_from_type(function->params + CHIT_MISS_ARG_CULL_MASK_AND_FLAGS, glsl_uint_type(), false, 0);
-      radv_nir_param_from_type(function->params + CHIT_MISS_ARG_SBT_OFFSET, glsl_uint_type(), false,
-                               ACO_NIR_PARAM_ATTRIB_DISCARDABLE);
-      radv_nir_param_from_type(function->params + CHIT_MISS_ARG_SBT_STRIDE, glsl_uint_type(), false,
-                               ACO_NIR_PARAM_ATTRIB_DISCARDABLE);
-      radv_nir_param_from_type(function->params + CHIT_MISS_ARG_MISS_INDEX, glsl_uint_type(), false,
-                               ACO_NIR_PARAM_ATTRIB_DISCARDABLE);
+      radv_nir_param_from_type(function->params + CHIT_MISS_ARG_SBT_OFFSET, glsl_uint_type(), false, 0);
+      radv_nir_param_from_type(function->params + CHIT_MISS_ARG_SBT_STRIDE, glsl_uint_type(), false, 0);
+      radv_nir_param_from_type(function->params + CHIT_MISS_ARG_MISS_INDEX, glsl_uint_type(), false, 0);
      radv_nir_param_from_type(function->params + CHIT_MISS_ARG_RAY_ORIGIN, glsl_vector_type(GLSL_TYPE_UINT, 3), false,
                               0);
      radv_nir_param_from_type(function->params + CHIT_MISS_ARG_RAY_TMIN, glsl_float_type(), false, 0);
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@ -9550,9 +9550,9 @@ radv_handle_color_fbfetch_output(struct radv_cmd_buffer *cmd_buffer, uint32_t in
   radv_describe_barrier_start(cmd_buffer, RGP_BARRIER_UNKNOWN_REASON);

   /* Force a transition to FEEDBACK_LOOP_OPTIMAL to decompress DCC. */
-   radv_handle_image_transition(cmd_buffer, att->iview->image, att->layout,
-                                VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT, RADV_QUEUE_GENERAL,
-                                RADV_QUEUE_GENERAL, &range, NULL);
+   radv_handle_rendering_image_transition(
+      cmd_buffer, att->iview, render->layer_count, render->view_mask, att->layout, VK_IMAGE_LAYOUT_UNDEFINED,
+      VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT, VK_IMAGE_LAYOUT_UNDEFINED, NULL);

   radv_describe_barrier_end(cmd_buffer);

@ -9597,9 +9597,10 @@ radv_handle_depth_fbfetch_output(struct radv_cmd_buffer *cmd_buffer)
   radv_describe_barrier_start(cmd_buffer, RGP_BARRIER_UNKNOWN_REASON);

   /* Force a transition to FEEDBACK_LOOP_OPTIMAL to decompress HTILE. */
-   radv_handle_image_transition(cmd_buffer, att->iview->image, att->layout,
-                                VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT, RADV_QUEUE_GENERAL,
-                                RADV_QUEUE_GENERAL, &range, NULL);
+   radv_handle_rendering_image_transition(cmd_buffer, att->iview, render->layer_count, render->view_mask, att->layout,
+                                          att->stencil_layout, VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT,
+                                          VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT,
+                                          render->sample_locations.count > 0 ? &render->sample_locations : NULL);

   radv_describe_barrier_end(cmd_buffer);

@ -9642,16 +9643,19 @@ radv_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCou
   VK_FROM_HANDLE(radv_cmd_buffer, primary, commandBuffer);
   struct radv_device *device = radv_cmd_buffer_device(primary);
   const struct radv_physical_device *pdev = radv_device_physical(device);
+   const bool is_gfx_or_ace = primary->qf == RADV_QUEUE_GENERAL || primary->qf == RADV_QUEUE_COMPUTE;

   assert(commandBufferCount > 0);

-   radv_emit_mip_change_flush_default(primary);
+   if (is_gfx_or_ace) {
+      radv_emit_mip_change_flush_default(primary);

-   /* Emit pending flushes on primary prior to executing secondary */
-   radv_emit_cache_flush(primary);
+      /* Emit pending flushes on primary prior to executing secondary */
+      radv_emit_cache_flush(primary);

-   /* Make sure CP DMA is idle on primary prior to executing secondary. */
-   radv_cp_dma_wait_for_idle(primary);
+      /* Make sure CP DMA is idle on primary prior to executing secondary. */
+      radv_cp_dma_wait_for_idle(primary);
+   }

   for (uint32_t i = 0; i < commandBufferCount; i++) {
      VK_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]);
@ -9694,6 +9698,9 @@ radv_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCou
         if (primary->state.dirty & RADV_CMD_DIRTY_FBFETCH_OUTPUT) {
            radv_handle_fbfetch_output(primary);
            primary->state.dirty &= ~RADV_CMD_DIRTY_FBFETCH_OUTPUT;
+
+            /* Emit pending flushes if a late decompression was performed. */
+            radv_emit_cache_flush(primary);
         }

         if (primary->state.render.active && (primary->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)) {
@ -9769,23 +9776,12 @@ radv_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCou

      device->ws->cs_execute_secondary(primary_cs->b, secondary_cs->b, allow_ib2);

-      /* When the secondary command buffer is compute only we don't
-       * need to re-emit the current graphics pipeline.
-       */
-      if (secondary->state.emitted_graphics_pipeline) {
-         primary->state.emitted_graphics_pipeline = secondary->state.emitted_graphics_pipeline;
-      }
+      primary->state.emitted_graphics_pipeline = secondary->state.emitted_graphics_pipeline;
+      primary->state.emitted_compute_pipeline = secondary->state.emitted_compute_pipeline;
+      primary->state.emitted_rt_pipeline = secondary->state.emitted_rt_pipeline;

-      /* When the secondary command buffer is graphics only we don't
-       * need to re-emit the current compute pipeline.
-       */
-      if (secondary->state.emitted_compute_pipeline) {
-         primary->state.emitted_compute_pipeline = secondary->state.emitted_compute_pipeline;
-      }
-
-      if (secondary->state.emitted_rt_pipeline) {
-         primary->state.emitted_rt_pipeline = secondary->state.emitted_rt_pipeline;
-      }
+      primary->state.ps_epilog = secondary->state.ps_epilog;
+      primary->state.emitted_vs_prolog = secondary->state.emitted_vs_prolog;

      if (secondary->state.last_ia_multi_vgt_param) {
         primary->state.last_ia_multi_vgt_param = secondary->state.last_ia_multi_vgt_param;
@ -15174,10 +15170,19 @@ radv_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstC

   assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);

-   if (pdev->info.gfx_level >= GFX12)
+   if (pdev->info.gfx_level >= GFX12) {
      radv_init_streamout_state(cmd_buffer);
-   else if (!pdev->use_ngg_streamout)
+
+      /* Invalidate L2 in case the buffer filled size needs to be saved because COPY_DATA isn't
+       * coherent with L2.
+       */
+      if (pdev->info.cp_sdma_ge_use_system_memory_scope) {
+         cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_INV_L2;
+         radv_emit_cache_flush(cmd_buffer);
+      }
+   } else if (!pdev->use_ngg_streamout) {
      radv_flush_vgt_streamout(cmd_buffer);
+   }

   ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cs->b, MAX_SO_BUFFERS * 10);

--- a/src/amd/vulkan/radv_debug.c
+++ b/src/amd/vulkan/radv_debug.c
@ -390,8 +390,8 @@ static void
 radv_add_split_disasm(const char *disasm, uint64_t start_addr, unsigned *num, struct radv_shader_inst *instructions)
 {
   struct radv_shader_inst *last_inst = *num ? &instructions[*num - 1] : NULL;
-   char *next;
-   char *repeat = strstr(disasm, "then repeated");
+   const char *next;
+   const char *repeat = strstr(disasm, "then repeated");

   while ((next = strchr(disasm, '\n'))) {
      struct radv_shader_inst *inst = &instructions[*num];
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@ -786,6 +786,8 @@ init_dispatch_tables(struct radv_device *device, struct radv_physical_device *pd
      add_entrypoints(&b, &quantic_dream_device_entrypoints, RADV_APP_DISPATCH_TABLE);
   } else if (!strcmp(instance->drirc.debug.app_layer, "no_mans_sky")) {
      add_entrypoints(&b, &no_mans_sky_device_entrypoints, RADV_APP_DISPATCH_TABLE);
+   } else if (!strcmp(instance->drirc.debug.app_layer, "strange_brigade")) {
+      add_entrypoints(&b, &strange_brigade_device_entrypoints, RADV_APP_DISPATCH_TABLE);
   }

   if (instance->vk.trace_mode & RADV_TRACE_MODE_RGP)
@ -1239,7 +1241,13 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr

   device->ws = pdev->ws;
   device->vk.sync = device->ws->get_sync_provider(device->ws);
-   device->vk.copy_sync_payloads = pdev->ws->copy_sync_payloads;
+
+   /* Disable unordered submits when SQTT queue events are enabled because queue present events
+    * might be missing otherwise.
+    */
+   device->vk.copy_sync_payloads = ((instance->vk.trace_mode & RADV_TRACE_MODE_RGP) && radv_sqtt_queue_events_enabled())
+                                      ? NULL
+                                      : pdev->ws->copy_sync_payloads;

   /* Enable the global BO list by default. */
   /* TODO: Remove the per cmdbuf BO list tracking after few Mesa releases if no blockers. */
--- a/src/amd/vulkan/radv_image_view.c
+++ b/src/amd/vulkan/radv_image_view.c
@ -500,9 +500,9 @@ radv_image_view_init(struct radv_image_view *iview, struct radv_device *device,

   if (!extra_create_info || !extra_create_info->from_client)
      assert(pCreateInfo->flags & VK_IMAGE_VIEW_CREATE_DRIVER_INTERNAL_BIT_MESA);
-   vk_image_view_init(&device->vk, &iview->vk, pCreateInfo);

-   memset(&iview->descriptor, 0, sizeof(iview->descriptor));
+   memset(iview, 0, sizeof(*iview));
+   vk_image_view_init(&device->vk, &iview->vk, pCreateInfo);

   iview->image = image;
   iview->plane_id = radv_plane_from_aspect(pCreateInfo->subresourceRange.aspectMask);
@ -664,13 +664,13 @@ radv_hiz_image_view_init(struct radv_image_view *iview, struct radv_device *devi
   VK_FROM_HANDLE(radv_image, image, pCreateInfo->image);

   assert(pCreateInfo->flags & VK_IMAGE_VIEW_CREATE_DRIVER_INTERNAL_BIT_MESA);
+
+   memset(iview, 0, sizeof(*iview));
   vk_image_view_init(&device->vk, &iview->vk, pCreateInfo);

   assert(vk_format_has_depth(image->vk.format) && vk_format_has_stencil(image->vk.format));
   assert(iview->vk.aspects == VK_IMAGE_ASPECT_DEPTH_BIT);

-   memset(&iview->descriptor, 0, sizeof(iview->descriptor));
-
   iview->image = image;

   const uint32_t type =
--- a/src/amd/vulkan/radv_pipeline_graphics.c
+++ b/src/amd/vulkan/radv_pipeline_graphics.c
@ -1662,7 +1662,7 @@ radv_graphics_shaders_link_varyings(struct radv_shader_stage *stages, enum amd_g

      /* Scalarize all I/O, because nir_opt_varyings and nir_opt_vectorize_io expect all I/O to be scalarized. */
      nir_variable_mode sca_mode = nir_var_shader_in;
-      bool sca_progress;
+      bool sca_progress = false;
      if (s != MESA_SHADER_FRAGMENT)
         sca_mode |= nir_var_shader_out;

--- a/src/amd/vulkan/radv_pipeline_rt.c
+++ b/src/amd/vulkan/radv_pipeline_rt.c
@ -674,7 +674,7 @@ radv_rt_compile_shaders(struct radv_device *device, struct vk_pipeline_cache *ca
   bool can_use_monolithic = !library && pipeline->stage_count < 50;

   for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) {
-      if (rt_stages[i].shader || rt_stages[i].nir)
+      if (rt_stages[i].nir)
         continue;

      int64_t stage_start = os_time_get_nano();
@ -749,7 +749,7 @@ radv_rt_compile_shaders(struct radv_device *device, struct vk_pipeline_cache *ca
   inline_any_hit_shaders |= raygen_lowering_mode == RADV_RT_LOWERING_MODE_MONOLITHIC && !raygen_imported;

   for (uint32_t idx = 0; idx < pCreateInfo->stageCount; idx++) {
-      if (rt_stages[idx].shader || rt_stages[idx].nir)
+      if (rt_stages[idx].nir)
         continue;

      int64_t stage_start = os_time_get_nano();
@ -1462,17 +1462,39 @@ radv_GetRayTracingShaderGroupStackSizeKHR(VkDevice device, VkPipeline _pipeline,
   VK_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);
   struct radv_ray_tracing_pipeline *rt_pipeline = radv_pipeline_to_ray_tracing(pipeline);
   struct radv_ray_tracing_group *rt_group = &rt_pipeline->groups[group];
+
+   struct radv_ray_tracing_stage *shader_stage;
+
   switch (groupShader) {
   case VK_SHADER_GROUP_SHADER_GENERAL_KHR:
   case VK_SHADER_GROUP_SHADER_CLOSEST_HIT_KHR:
-      return rt_pipeline->stages[rt_group->recursive_shader].stack_size;
+      shader_stage = &rt_pipeline->stages[rt_group->recursive_shader];
+      break;
   case VK_SHADER_GROUP_SHADER_ANY_HIT_KHR:
-      return rt_pipeline->stages[rt_group->any_hit_shader].stack_size;
+      /* If the any-hit shader is inlined into an intersection shader, there is no stack specific to the any-hit shader
+       * and all stack will be allocated for the intersection shader instead.
+       */
+      if (rt_group->intersection_shader != VK_SHADER_UNUSED_KHR)
+         return 0;
+      shader_stage = &rt_pipeline->stages[rt_group->any_hit_shader];
+      break;
   case VK_SHADER_GROUP_SHADER_INTERSECTION_KHR:
-      return rt_pipeline->stages[rt_group->intersection_shader].stack_size;
+      shader_stage = &rt_pipeline->stages[rt_group->intersection_shader];
+      break;
   default:
      return 0;
   }
+
+   uint32_t stack_size = shader_stage->stack_size;
+   /* Applications need to allocate stack for the traversal shader, too. The API doesn't intend for a constant
+    * traversal stack size, so add the stack size to every shader potentially called by the traversal shader.
+    * Applications are expected to max() shader stages together, so this shouldn't result in any unnecessary stack
+    * usage.
+    */
+   if (shader_stage->stage == MESA_SHADER_CLOSEST_HIT || shader_stage->stage == MESA_SHADER_ANY_HIT ||
+       shader_stage->stage == MESA_SHADER_INTERSECTION || shader_stage->stage == MESA_SHADER_MISS)
+      stack_size += rt_pipeline->traversal_stack_size;
+   return stack_size;
 }

 VKAPI_ATTR VkResult VKAPI_CALL
--- a/src/amd/vulkan/radv_rra.c
+++ b/src/amd/vulkan/radv_rra.c
@ -790,7 +790,7 @@ rra_map_accel_struct_data(struct rra_copy_context *ctx, uint32_t i)
   if (radv_GetEventStatus(ctx->device, data->build_event) != VK_EVENT_SET)
      return NULL;

-   if (data->buffer->memory) {
+   if (data->buffer && data->buffer->memory) {
      VkMemoryMapInfo memory_map_info = {
         .sType = VK_STRUCTURE_TYPE_MEMORY_MAP_INFO,
         .memory = data->buffer->memory,
--- a/src/amd/vulkan/radv_sdma.c
+++ b/src/amd/vulkan/radv_sdma.c
@ -216,6 +216,7 @@ radv_sdma_get_surf(const struct radv_device *const device, const struct radv_ima
      .texel_scale = radv_sdma_get_texel_scale(image),
      .is_linear = surf->is_linear,
      .is_3d = surf->u.gfx9.resource_type == RADEON_RESOURCE_3D,
+      .is_stencil = subresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT,
   };

   const uint64_t surf_offset = (subresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) ? surf->u.gfx9.zs.stencil_offset
@ -371,6 +372,7 @@ radv_sdma_emit_copy_tiled_sub_window(const struct radv_device *device, struct ra
      .va = tiled->va,
      .format = radv_format_to_pipe_format(tiled->aspect_format),
      .bpp = tiled->bpp,
+      .is_stencil = tiled->is_stencil,
      .offset =
         {
            .x = tiled_off.x,
@ -414,6 +416,7 @@ radv_sdma_emit_copy_t2t_sub_window(const struct radv_device *device, struct radv
      .va = src->va,
      .format = radv_format_to_pipe_format(src->aspect_format),
      .bpp = src->bpp,
+      .is_stencil = src->is_stencil,
      .offset =
         {
            .x = src_off.x,
@ -439,6 +442,7 @@ radv_sdma_emit_copy_t2t_sub_window(const struct radv_device *device, struct radv
      .va = dst->va,
      .format = radv_format_to_pipe_format(dst->aspect_format),
      .bpp = dst->bpp,
+      .is_stencil = dst->is_stencil,
      .offset =
         {
            .x = dst_off.x,
@ -606,12 +610,6 @@ radv_sdma_use_t2t_scanline_copy(const struct radv_device *device, const struct r
         return true;
   }

-   /* The two images can have a different block size,
-    * but must have the same swizzle mode.
-    */
-   if (src->micro_tile_mode != dst->micro_tile_mode)
-      return true;
-
   /* The T2T subwindow copy packet only has fields for one metadata configuration.
    * It can either compress or decompress, or copy uncompressed images, but it
    * can't copy from a compressed image to another.
@ -619,6 +617,16 @@ radv_sdma_use_t2t_scanline_copy(const struct radv_device *device, const struct r
   if (src->is_compressed && dst->is_compressed)
      return true;

+   if (ver >= SDMA_7_0) {
+      /* No support for tiling format transformation at all. */
+      if (src->surf->u.gfx9.swizzle_mode != dst->surf->u.gfx9.swizzle_mode)
+         return true;
+   } else {
+      /* The two images can have a different block size, but must have the same micro tile mode. */
+      if (src->micro_tile_mode != dst->micro_tile_mode)
+         return true;
+   }
+
   const bool needs_3d_alignment = src->is_3d && (src->micro_tile_mode == RADEON_MICRO_MODE_DISPLAY ||
                                                  src->micro_tile_mode == RADEON_MICRO_MODE_STANDARD);
   const unsigned log2bpp = util_logbase2(src->bpp);
--- a/src/amd/vulkan/radv_sdma.h
+++ b/src/amd/vulkan/radv_sdma.h
@ -31,6 +31,7 @@ struct radv_sdma_surf {
   uint8_t texel_scale;     /* Texel scale for 96-bit formats */
   bool is_linear;          /* Whether the image is linear. */
   bool is_3d;              /* Whether the image is 3-dimensional. */
+   bool is_stencil;         /* Whether the image is stencil only. */

   union {
      /* linear images only */
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@ -655,15 +655,24 @@ radv_shader_spirv_to_nir(struct radv_device *device, const struct radv_shader_st
      NIR_PASS(_, nir, nir_lower_compute_system_values, &csv_options);
   }

+   bool lower_local_invocation_index = false;
+
+   if (nir->info.derivative_group == DERIVATIVE_GROUP_QUADS &&
+       ((nir->info.stage == MESA_SHADER_COMPUTE || nir->info.stage == MESA_SHADER_TASK ||
+         (nir->info.stage == MESA_SHADER_MESH && pdev->info.mesh_fast_launch_2)))) {
+      lower_local_invocation_index = true;
+   } else if (nir->info.stage == MESA_SHADER_COMPUTE &&
+              (((nir->info.workgroup_size[0] == 1) + (nir->info.workgroup_size[1] == 1) +
+                (nir->info.workgroup_size[2] == 1)) == 2)) {
+      lower_local_invocation_index = true;
+   }
+
   nir_lower_compute_system_values_options csv_options = {
      /* Mesh shaders run as NGG which can implement local_invocation_index from
       * the wave ID in merged_wave_info, but they don't have local_invocation_ids on GFX10.3.
       */
      .lower_cs_local_id_to_index = nir->info.stage == MESA_SHADER_MESH && !pdev->info.mesh_fast_launch_2,
-      .lower_local_invocation_index = nir->info.stage == MESA_SHADER_COMPUTE &&
-                                      ((((nir->info.workgroup_size[0] == 1) + (nir->info.workgroup_size[1] == 1) +
-                                         (nir->info.workgroup_size[2] == 1)) == 2) ||
-                                       nir->info.derivative_group == DERIVATIVE_GROUP_QUADS),
+      .lower_local_invocation_index = lower_local_invocation_index,
   };
   NIR_PASS(_, nir, nir_lower_compute_system_values, &csv_options);

--- a/src/amd/vulkan/radv_video.c
+++ b/src/amd/vulkan/radv_video.c
@ -950,8 +950,8 @@ radv_GetPhysicalDeviceVideoCapabilitiesKHR(VkPhysicalDevice physicalDevice, cons
      struct VkVideoDecodeH265CapabilitiesKHR *ext =
         vk_find_struct(pCapabilities->pNext, VIDEO_DECODE_H265_CAPABILITIES_KHR);

-      pCapabilities->maxDpbSlots = RADV_VIDEO_H264_MAX_DPB_SLOTS;
-      pCapabilities->maxActiveReferencePictures = RADV_VIDEO_H264_MAX_NUM_REF_FRAME;
+      pCapabilities->maxDpbSlots = RADV_VIDEO_H265_MAX_DPB_SLOTS;
+      pCapabilities->maxActiveReferencePictures = RADV_VIDEO_H265_MAX_NUM_REF_FRAME;
      /* for h265 on navi21+ separate dpb images should work */
      if (radv_enable_tier2(pdev))
         pCapabilities->flags |= VK_VIDEO_CAPABILITY_SEPARATE_REFERENCE_IMAGES_BIT_KHR;
@ -2320,22 +2320,6 @@ get_av1_msg(struct radv_device *device, struct radv_video_session *vid, struct v
   result.tx_mode = pi->TxMode;
   result.reference_mode = (pi->flags.reference_select == 1) ? 2 : 0;

-   if (pi->pTileInfo) {
-      result.tile_cols = pi->pTileInfo->TileCols;
-      result.tile_rows = pi->pTileInfo->TileRows;
-      result.tile_size_bytes = pi->pTileInfo->tile_size_bytes_minus_1;
-      result.context_update_tile_id = pi->pTileInfo->context_update_tile_id;
-
-      for (i = 0; i < result.tile_cols; i++)
-         result.tile_col_start_sb[i] = pi->pTileInfo->pMiColStarts[i];
-      result.tile_col_start_sb[result.tile_cols] =
-         result.tile_col_start_sb[result.tile_cols - 1] + pi->pTileInfo->pWidthInSbsMinus1[result.tile_cols - 1] + 1;
-      for (i = 0; i < pi->pTileInfo->TileRows; i++)
-         result.tile_row_start_sb[i] = pi->pTileInfo->pMiRowStarts[i];
-      result.tile_row_start_sb[result.tile_rows] =
-         result.tile_row_start_sb[result.tile_rows - 1] + pi->pTileInfo->pHeightInSbsMinus1[result.tile_rows - 1] + 1;
-   }
-
   result.max_width = seq_hdr->max_frame_width_minus_1 + 1;
   result.max_height = seq_hdr->max_frame_height_minus_1 + 1;
   VkExtent2D frameExtent = frame_info->dstPictureResource.codedExtent;
@ -2351,6 +2335,44 @@ get_av1_msg(struct radv_device *device, struct radv_video_session *vid, struct v

   result.superres_upscaled_width = frameExtent.width;

+   if (pi->pTileInfo) {
+      result.tile_cols = pi->pTileInfo->TileCols;
+      result.tile_rows = pi->pTileInfo->TileRows;
+      result.tile_size_bytes = pi->pTileInfo->tile_size_bytes_minus_1;
+      result.context_update_tile_id = pi->pTileInfo->context_update_tile_id;
+
+      /* pMi{Row,Col}Starts is unreliable, some apps send SB, some send MI, so use
+       * p{Width,Height}InSbsMinus1 instead. But for uniform_tile_spacing_flag,
+       * those are not defined by spec. */
+      if (pi->pTileInfo->flags.uniform_tile_spacing_flag) {
+         const unsigned sb_size = seq_hdr->flags.use_128x128_superblock ? 128 : 64;
+         const unsigned sb_width = DIV_ROUND_UP(result.width, sb_size);
+         const unsigned sb_height = DIV_ROUND_UP(result.height, sb_size);
+         const unsigned tile_width_sb = DIV_ROUND_UP(sb_width, result.tile_cols);
+         const unsigned tile_height_sb = DIV_ROUND_UP(sb_height, result.tile_rows);
+
+         result.tile_col_start_sb[0] = 0;
+         for (i = 1; i < result.tile_cols; ++i)
+            result.tile_col_start_sb[i] = result.tile_col_start_sb[i - 1] + tile_width_sb;
+         result.tile_col_start_sb[i] = sb_width;
+
+         result.tile_row_start_sb[0] = 0;
+         for (i = 1; i < result.tile_rows; ++i)
+            result.tile_row_start_sb[i] = result.tile_row_start_sb[i - 1] + tile_height_sb;
+         result.tile_row_start_sb[i] = sb_height;
+      } else {
+         result.tile_col_start_sb[0] = 0;
+         assert(pi->pTileInfo->pMiColStarts[0] == 0);
+         for (i = 0; i < result.tile_cols; ++i)
+            result.tile_col_start_sb[i + 1] = result.tile_col_start_sb[i] + pi->pTileInfo->pWidthInSbsMinus1[i] + 1;
+
+         result.tile_row_start_sb[0] = 0;
+         assert(pi->pTileInfo->pMiRowStarts[0] == 0);
+         for (i = 0; i < result.tile_rows; ++i)
+            result.tile_row_start_sb[i + 1] = result.tile_row_start_sb[i] + pi->pTileInfo->pHeightInSbsMinus1[i] + 1;
+      }
+   }
+
   result.order_hint_bits = seq_hdr->order_hint_bits_minus_1 + 1;

   /* The VCN FW will evict references that aren't specified in
--- a/src/asahi/vulkan/hk_cmd_buffer.c
+++ b/src/asahi/vulkan/hk_cmd_buffer.c
@ -376,13 +376,15 @@ hk_bind_descriptor_sets(UNUSED struct hk_cmd_buffer *cmd,
    *
    * This means that, if some earlier set gets bound in such a way that
    * it changes set_dynamic_buffer_start[s], this binding is implicitly
-    * invalidated.  Therefore, we can always look at the current value
-    * of set_dynamic_buffer_start[s] as the base of our dynamic buffer
-    * range and it's only our responsibility to adjust all
-    * set_dynamic_buffer_start[p] for p > s as needed.
+    * invalidated.
    */
-   uint8_t dyn_buffer_start =
-      desc->root.set_dynamic_buffer_start[info->firstSet];
+   uint8_t dyn_buffer_start = 0u;
+   for (uint32_t i = 0u; i < info->firstSet; ++i) {
+      const struct hk_descriptor_set_layout *set_layout =
+         vk_to_hk_descriptor_set_layout(pipeline_layout->set_layouts[i]);
+      if (set_layout)
+         dyn_buffer_start += set_layout->dynamic_buffer_count;
+   }

   uint32_t next_dyn_offset = 0;
   for (uint32_t i = 0; i < info->descriptorSetCount; ++i) {
@ -427,10 +429,6 @@ hk_bind_descriptor_sets(UNUSED struct hk_cmd_buffer *cmd,
   assert(dyn_buffer_start <= HK_MAX_DYNAMIC_BUFFERS);
   assert(next_dyn_offset <= info->dynamicOffsetCount);

-   for (uint32_t s = info->firstSet + info->descriptorSetCount; s < HK_MAX_SETS;
-        s++)
-      desc->root.set_dynamic_buffer_start[s] = dyn_buffer_start;
-
   desc->root_dirty = true;
 }

--- a/src/asahi/vulkan/hk_cmd_draw.c
+++ b/src/asahi/vulkan/hk_cmd_draw.c
@ -3212,6 +3212,9 @@ hk_handle_passthrough_gs(struct hk_cmd_buffer *cmd, struct agx_draw draw)
   struct hk_graphics_state *gfx = &cmd->state.gfx;
   struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY];

+   if (!IS_SHADER_DIRTY(VERTEX) && !IS_SHADER_DIRTY(GEOMETRY))
+      return;
+
   /* If there's an application geometry shader, there's nothing to un/bind */
   if (gs && !gs->is_passthrough)
      return;
@ -3221,20 +3224,17 @@ hk_handle_passthrough_gs(struct hk_cmd_buffer *cmd, struct agx_draw draw)
   uint32_t xfb_outputs = last_sw->info.xfb_info.output_count;
   bool needs_gs = xfb_outputs;

-   /* If we already have a matching GS configuration, we're done */
-   if ((gs != NULL) == needs_gs)
-      return;
-
   /* If we don't need a GS but we do have a passthrough, unbind it */
-   if (gs) {
-      assert(!needs_gs && gs->is_passthrough);
-      hk_cmd_bind_graphics_shader(cmd, MESA_SHADER_GEOMETRY, NULL);
+   if (!needs_gs) {
+      if (gs != NULL) {
+         assert(gs->is_passthrough);
+         hk_cmd_bind_graphics_shader(cmd, MESA_SHADER_GEOMETRY, NULL);
+      }
      return;
   }

   /* Else, we need to bind a passthrough GS */
-   size_t key_size =
-      sizeof(struct hk_passthrough_gs_key) + nir_xfb_info_size(xfb_outputs);
+   size_t key_size = hk_passthrough_gs_key_size(xfb_outputs);
   struct hk_passthrough_gs_key *key = alloca(key_size);

   *key = (struct hk_passthrough_gs_key){
--- a/src/asahi/vulkan/hk_cmd_meta.c
+++ b/src/asahi/vulkan/hk_cmd_meta.c
@ -1493,7 +1493,12 @@ hk_CmdFillBuffer(VkCommandBuffer commandBuffer, VkBuffer dstBuffer,
   uint64_t addr =
      vk_meta_buffer_address(&dev->vk, dstBuffer, dstOffset, dstRange);

-   libagx_fill(cmd, agx_1d(range / 4), AGX_BARRIER_ALL, addr, data);
+   if (util_is_aligned(addr, 16) && util_is_aligned(range, 16)) {
+      libagx_fill_uint4(cmd, agx_2d(range / 16, 1), AGX_BARRIER_ALL,
+                        addr, 0, data, data, data, data);
+   } else {
+      libagx_fill(cmd, agx_1d(range / 4), AGX_BARRIER_ALL, addr, data);
+   }
 }

 VKAPI_ATTR void VKAPI_CALL
--- a/src/asahi/vulkan/hk_shader.h
+++ b/src/asahi/vulkan/hk_shader.h
@ -387,8 +387,16 @@ struct hk_passthrough_gs_key {
   /* Decomposed primitive */
   enum mesa_prim prim;

-   /* Transform feedback info. Must add nir_xfb_info_size to get the key size */
+   /* Transform feedback info. Must use hk_passthrough_gs_key_size to get the
+    * key size */
   nir_xfb_info xfb_info;
 };

+static inline size_t
+hk_passthrough_gs_key_size(uint16_t output_count)
+{
+   return (sizeof(struct hk_passthrough_gs_key) - sizeof(nir_xfb_info)) +
+      nir_xfb_info_size(output_count);
+}
+
 void hk_nir_passthrough_gs(struct nir_builder *b, const void *key_);
--- a/src/broadcom/ci/broadcom-rpi3-fails.txt
+++ b/src/broadcom/ci/broadcom-rpi3-fails.txt
@ -853,7 +853,6 @@ spec@!opengl 1.1@polygon-mode-offset@config 6: Expected blue pixel in center,Fai
 spec@!opengl 1.1@polygon-mode-offset@config 6: Expected white pixel on right edge,Fail
 spec@!opengl 1.1@polygon-mode-offset@config 6: Expected white pixel on top edge,Fail

-spec@!opengl 1.1@texsubimage-unpack,Fail
 spec@!opengl 1.1@texwrap 2d proj,Fail
 spec@!opengl 1.1@texwrap 2d proj@GL_RGBA8- NPOT- projected,Fail
 spec@!opengl 1.1@texwrap 2d proj@GL_RGBA8- projected,Fail
@ -953,7 +952,6 @@ spec@arb_occlusion_query@occlusion_query_conform,Fail
 spec@arb_occlusion_query@occlusion_query_conform@GetObjivAval_multi2,Fail
 spec@arb_pixel_buffer_object@fbo-pbo-readpixels-small,Fail
 spec@arb_pixel_buffer_object@pbo-getteximage,Fail
-spec@arb_pixel_buffer_object@texsubimage-unpack pbo,Fail
 spec@arb_point_sprite@arb_point_sprite-mipmap,Fail
 spec@arb_provoking_vertex@arb-provoking-vertex-render,Fail
 spec@arb_sampler_objects@sampler-objects,Fail
--- a/src/broadcom/ci/broadcom-rpi4-fails.txt
+++ b/src/broadcom/ci/broadcom-rpi4-fails.txt
@ -861,93 +861,6 @@ ubsan-dEQP-VK.image.mutable.2d_array.r16g16b16a16_sfloat_r16g16b16a16_uint_draw_
 ubsan-dEQP-VK.image.mutable.2d_array.r32_uint_r8g8b8a8_sint_draw_copy_resolve_mutable_color_att,Fail
 ubsan-dEQP-VK.pipeline.monolithic.logic_op_na_formats.r16g16_sfloat.nand_blend,Fail

-# New failures with ES CTS 3.2.13.0
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_128_bits.rgba32i_rgba32i.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_128_bits.rgba32i_rgba32i.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_128_bits.rgba32ui_rgba32ui.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_128_bits.rgba32ui_rgba32ui.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.r16i_r16i.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.r16i_r16i.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.r16ui_r16ui.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.r16ui_r16ui.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.rg8i_rg8i.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.rg8i_rg8i.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.rg8_rg8.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.rg8_rg8.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.rg8ui_rg8ui.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.rg8ui_rg8ui.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_24_bits.rgb8_rgb8.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_24_bits.rgb8_rgb8.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.r32i_r32i.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.r32i_r32i.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.r32ui_r32ui.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.r32ui_r32ui.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rg16i_rg16i.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rg16i_rg16i.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rg16ui_rg16ui.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rg16ui_rg16ui.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2_rgb10_a2.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2_rgb10_a2.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rg16f.renderbuffer_to_texture2d,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rg16i.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rg16i.renderbuffer_to_texture2d,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rg16ui.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rg16ui.renderbuffer_to_texture2d,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rgb10_a2ui.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rgb10_a2ui.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgba8i_rgba8i.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgba8i_rgba8i.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgba8_rgba8.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgba8_rgba8.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgba8ui_rgba8ui.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgba8ui_rgba8ui.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.srgb8_alpha8_srgb8_alpha8.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rg32i_rg32i.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rg32i_rg32i.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rg32ui_rg32ui.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rg32ui_rg32ui.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rgba16i_rgba16i.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rgba16i_rgba16i.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rgba16ui_rgba16ui.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rgba16ui_rgba16ui.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_8_bits.r8i_r8i.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_8_bits.r8i_r8i.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_8_bits.r8_r8.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_8_bits.r8ui_r8ui.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_8_bits.r8ui_r8ui.texture2d_to_renderbuffer,Fail
-arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_128_bits.rgba32ui_rgba32ui.renderbuffer_to_renderbuffer,Fail
-arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.r16i_r16i.renderbuffer_to_renderbuffer,Fail
-arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.rg8_rg8.renderbuffer_to_renderbuffer,Fail
-arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.rg8i_rg8i.renderbuffer_to_renderbuffer,Fail
-arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.rg8ui_rg8ui.texture2d_to_renderbuffer,Fail
-arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_24_bits.rgb8_rgb8.renderbuffer_to_renderbuffer,Fail
-arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_24_bits.rgb8_rgb8.texture2d_to_renderbuffer,Fail
-arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.r32i_r32i.renderbuffer_to_renderbuffer,Fail
-arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.r32i_r32i.texture2d_to_renderbuffer,Fail
-arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.r32ui_r32ui.texture2d_to_renderbuffer,Fail
-arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rg16ui_rg16ui.texture2d_to_renderbuffer,Fail
-arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2_rgb10_a2.renderbuffer_to_renderbuffer,Fail
-arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2_rgb10_a2.texture2d_to_renderbuffer,Fail
-arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rg16f.renderbuffer_to_texture2d,Fail
-arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rg16i.renderbuffer_to_texture2d,Fail
-arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rg16ui.renderbuffer_to_texture2d,Fail
-arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rgb10_a2ui.renderbuffer_to_renderbuffer,Fail
-arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgba8_rgba8.renderbuffer_to_renderbuffer,Fail
-arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgba8i_rgba8i.texture2d_to_renderbuffer,Fail
-arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgba8ui_rgba8ui.texture2d_to_renderbuffer,Fail
-arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.srgb8_alpha8_srgb8_alpha8.texture2d_to_renderbuffer,Fail
-arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rg32i_rg32i.renderbuffer_to_renderbuffer,Fail
-arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rg32i_rg32i.texture2d_to_renderbuffer,Fail
-arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rg32ui_rg32ui.renderbuffer_to_renderbuffer,Fail
-arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rgba16i_rgba16i.texture2d_to_renderbuffer,Fail
-arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_8_bits.r8_r8.texture2d_to_renderbuffer,Fail
-arm32-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_8_bits.r8ui_r8ui.renderbuffer_to_renderbuffer,Fail
-ubsan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_128_bits.rgba32ui_rgba32ui.renderbuffer_to_renderbuffer,Fail
-ubsan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_24_bits.rgb8_rgb8.renderbuffer_to_renderbuffer,Fail
-ubsan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.r32ui_r32ui.texture2d_to_renderbuffer,Fail
-ubsan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2_rgb10_a2.texture2d_to_renderbuffer,Fail
-ubsan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rgb10_a2ui.renderbuffer_to_renderbuffer,Fail
-
 # SKQP failing tests
 ES2BlendWithNoTexture,Fail
 SRGBReadWritePixels,Fail
--- a/src/broadcom/ci/broadcom-rpi5-fails.txt
+++ b/src/broadcom/ci/broadcom-rpi5-fails.txt
@ -701,84 +701,6 @@ dEQP-VK.binding_model.unused_invalid_descriptor.write.unused.storage_buffer,Cras
 dEQP-VK.binding_model.unused_invalid_descriptor.write.unused.uniform_buffer,Crash
 asan-dEQP-VK.binding_model.unused_invalid_descriptor.write.invalid.combined_image_sampler,Crash

-# New failures with ES CTS 3.2.13.0
-asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_128_bits.rgba32i_rgba32i.texture2d_to_renderbuffer,Fail
-asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_128_bits.rgba32ui_rgba32ui.renderbuffer_to_renderbuffer,Fail
-asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_24_bits.rgb8_rgb8.renderbuffer_to_renderbuffer,Fail
-asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_24_bits.rgb8_rgb8.texture2d_to_renderbuffer,Fail
-asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.r32i_r32i.renderbuffer_to_renderbuffer,Fail
-asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.r32ui_r32ui.texture2d_to_renderbuffer,Fail
-asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rg16i_rg16i.renderbuffer_to_renderbuffer,Fail
-asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rg16i_rg16i.texture2d_to_renderbuffer,Fail
-asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rg16ui_rg16ui.renderbuffer_to_renderbuffer,Fail
-asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2_rgb10_a2.texture2d_to_renderbuffer,Fail
-asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rg16f.renderbuffer_to_texture2d,Fail
-asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rg16i.renderbuffer_to_texture2d,Fail
-asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rgb10_a2ui.renderbuffer_to_renderbuffer,Fail
-asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rgb10_a2ui.texture2d_to_renderbuffer,Fail
-asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgba8_rgba8.renderbuffer_to_renderbuffer,Fail
-asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgba8i_rgba8i.texture2d_to_renderbuffer,Fail
-asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgba8ui_rgba8ui.texture2d_to_renderbuffer,Fail
-asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rg32i_rg32i.renderbuffer_to_renderbuffer,Fail
-asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rg32ui_rg32ui.renderbuffer_to_renderbuffer,Fail
-asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rgba16i_rgba16i.texture2d_to_renderbuffer,Fail
-asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rgba16ui_rgba16ui.texture2d_to_renderbuffer,Fail
-asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_8_bits.r8ui_r8ui.renderbuffer_to_renderbuffer,Fail
-asan-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_8_bits.r8ui_r8ui.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_128_bits.rgba32i_rgba32i.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_128_bits.rgba32i_rgba32i.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_128_bits.rgba32ui_rgba32ui.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_128_bits.rgba32ui_rgba32ui.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.r16i_r16i.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.r16i_r16i.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.r16ui_r16ui.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.r16ui_r16ui.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.rg8_rg8.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.rg8_rg8.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.rg8i_rg8i.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.rg8i_rg8i.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.rg8ui_rg8ui.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_16_bits.rg8ui_rg8ui.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_24_bits.rgb8_rgb8.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_24_bits.rgb8_rgb8.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.r32i_r32i.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.r32i_r32i.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.r32ui_r32ui.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.r32ui_r32ui.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rg16i_rg16i.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rg16i_rg16i.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rg16ui_rg16ui.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rg16ui_rg16ui.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2_rgb10_a2.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2_rgb10_a2.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rg16f.renderbuffer_to_texture2d,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rg16i.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rg16i.renderbuffer_to_texture2d,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rg16ui.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rg16ui.renderbuffer_to_texture2d,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rgb10_a2ui.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgb10_a2ui_rgb10_a2ui.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgba8_rgba8.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgba8_rgba8.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgba8i_rgba8i.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgba8i_rgba8i.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgba8ui_rgba8ui.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.rgba8ui_rgba8ui.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_32_bits.srgb8_alpha8_srgb8_alpha8.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rg32i_rg32i.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rg32i_rg32i.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rg32ui_rg32ui.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rg32ui_rg32ui.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rgba16i_rgba16i.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rgba16i_rgba16i.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rgba16ui_rgba16ui.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_64_bits.rgba16ui_rgba16ui.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_8_bits.r8_r8.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_8_bits.r8i_r8i.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_8_bits.r8i_r8i.texture2d_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_8_bits.r8ui_r8ui.renderbuffer_to_renderbuffer,Fail
-dEQP-GLES31.functional.copy_image.non_compressed.viewclass_8_bits.r8ui_r8ui.texture2d_to_renderbuffer,Fail
-
 # SKQP failing tests
 ES2BlendWithNoTexture,Fail
 SRGBReadWritePixels,Fail
--- a/src/broadcom/cle/v3d_packet.xml
+++ b/src/broadcom/cle/v3d_packet.xml
@ -1,4 +1,4 @@
-<vcxml gen="3.3" min_ver="42" max_ver="71">
+<vcxml gen="4.2" min_ver="42" max_ver="71">

  <enum name="Compare Function" prefix="V3D_COMPARE_FUNC">
    <value name="NEVER" value="0"/>
--- a/src/compiler/clc/clc.h
+++ b/src/compiler/clc/clc.h
@ -50,9 +50,12 @@ enum clc_spirv_version {
 };

 struct clc_optional_features {
+   bool atomic_order_seq_cst;
+   bool atomic_scope_device;
   bool extended_bit_ops;
   bool fp16;
   bool fp64;
+   bool generic_address_space;
   bool int64;
   bool images;
   bool images_depth;
--- a/src/compiler/clc/clc_helpers.cpp
+++ b/src/compiler/clc/clc_helpers.cpp
@ -28,8 +28,6 @@
 #include <sstream>
 #include <mutex>

-#include "util/ralloc.h"
-#include "util/set.h"
 #include <llvm/ADT/ArrayRef.h>
 #include <llvm/IR/DiagnosticPrinter.h>
 #include <llvm/IR/DiagnosticInfo.h>
@ -68,7 +66,17 @@
 #include <llvm/Support/VirtualFileSystem.h>
 #endif

+#if LLVM_VERSION_MAJOR >= 22
+#include <clang/Options/OptionUtils.h>
+#endif
+
+/* We have to include our own headers after the LLVM/clang ones, as the
+ * LLVM/clang headers seem to use `UNUSED` within enum definitions:
+ * https://github.com/llvm/llvm-project/blob/ea443eeb2ab8ed49ffb783c2025fed6629a36f10/clang/include/clang/Basic/OffloadArch.h#L19
+ */
 #include "util/macros.h"
+#include "util/ralloc.h"
+#include "util/set.h"
 #include "util/u_dl.h"
 #include "glsl_types.h"

@ -915,7 +923,9 @@ clc_compile_to_llvm_module(LLVMContext &llvm_ctx,
   // GetResourcePath is a way to retrieve the actual libclang resource dir based on a given binary
   // or library.
   auto tmp_res_path =
-#if LLVM_VERSION_MAJOR >= 20
+#if LLVM_VERSION_MAJOR >= 22
+      clang::GetResourcesPath(std::string(clang_path));
+#elif LLVM_VERSION_MAJOR >= 20
      Driver::GetResourcesPath(std::string(clang_path));
 #else
      Driver::GetResourcesPath(std::string(clang_path), CLANG_RESOURCE_DIR);
@ -959,6 +969,12 @@ clc_compile_to_llvm_module(LLVMContext &llvm_ctx,
   c->getPreprocessorOpts().addMacroDef("cl_khr_expect_assume=1");

   bool needs_opencl_c_h = false;
+   if (args->features.atomic_order_seq_cst) {
+      c->getTargetOpts().OpenCLExtensionsAsWritten.push_back("+__opencl_c_atomic_order_seq_cst");
+   }
+   if (args->features.atomic_scope_device) {
+      c->getTargetOpts().OpenCLExtensionsAsWritten.push_back("+__opencl_c_atomic_scope_device");
+   }
   if (args->features.extended_bit_ops) {
      c->getPreprocessorOpts().addMacroDef("cl_khr_extended_bit_ops=1");
   }
@ -969,6 +985,9 @@ clc_compile_to_llvm_module(LLVMContext &llvm_ctx,
      c->getTargetOpts().OpenCLExtensionsAsWritten.push_back("+cl_khr_fp64");
      c->getTargetOpts().OpenCLExtensionsAsWritten.push_back("+__opencl_c_fp64");
   }
+   if (args->features.generic_address_space) {
+      c->getTargetOpts().OpenCLExtensionsAsWritten.push_back("+__opencl_c_generic_address_space");
+   }
   if (args->features.int64) {
      c->getTargetOpts().OpenCLExtensionsAsWritten.push_back("+cles_khr_int64");
      c->getTargetOpts().OpenCLExtensionsAsWritten.push_back("+__opencl_c_int64");
--- a/src/compiler/clc/mesa_clc.c
+++ b/src/compiler/clc/mesa_clc.c
@ -134,6 +134,11 @@ main(int argc, char **argv)
         .args = util_dynarray_begin(&clang_args),
         .num_args = util_dynarray_num_elements(&clang_args, char *),
         .c_compatible = true,
+         .features = {
+            .atomic_order_seq_cst = true,
+            .atomic_scope_device = true,
+            .generic_address_space = true,
+         },
      };

      /* Enable all features, we don't know the target here and it is the
--- a/src/compiler/clc/nir_load_libclc.c
+++ b/src/compiler/clc/nir_load_libclc.c
@ -263,7 +263,7 @@ libclc_add_generic_variants(nir_shader *shader)
      if (strstr(func->name, "async_work_group_strided_copy"))
         continue;

-      char *U3AS1 = strstr(func->name, "U3AS1");
+      const char *U3AS1 = strstr(func->name, "U3AS1");
      if (U3AS1 == NULL)
         continue;

--- a/src/compiler/glsl/ast_to_hir.cpp
+++ b/src/compiler/glsl/ast_to_hir.cpp
@ -7667,6 +7667,7 @@ ast_process_struct_or_iface_block_members(ir_exec_list *instructions,
       * embedded structures in 1.10 only.
       */
      if (state->language_version != 110 &&
+          !state->allow_glsl_embedded_structure_declarations &&
          decl_list->type->specifier->structure != NULL)
         _mesa_glsl_error(&loc, state,
                          "embedded structure declarations are not allowed");
--- a/src/compiler/glsl/gl_nir_linker.c
+++ b/src/compiler/glsl/gl_nir_linker.c
@ -1684,12 +1684,27 @@ cross_validate_globals(void *mem_ctx, const struct gl_constants *consts,
                     existing->data.mode == nir_var_mem_ssbo &&
                     existing->data.from_ssbo_unsized_array &&
                     glsl_get_gl_type(var->type) == glsl_get_gl_type(existing->type))) {
-                  linker_error(prog, "%s `%s' declared as type "
-                                 "`%s' and type `%s'\n",
-                                 gl_nir_mode_string(var),
-                                 var->name, glsl_get_type_name(var->type),
-                                 glsl_get_type_name(existing->type));
-                  return;
+
+                  /* Relax precision matching on unused uniforms for early ES shaders */
+                  if (prog->IsES && !var->interface_type &&
+                      !(existing->data.used && var->data.used) &&
+                      glsl_base_type_is_integer(glsl_get_gl_type(var->type)) == glsl_base_type_is_integer(glsl_get_gl_type(existing->type)) &&
+                      glsl_base_type_is_float(glsl_get_gl_type(var->type)) == glsl_base_type_is_float(glsl_get_gl_type(existing->type)) &&
+                      prog->GLSL_Version < 300) {
+                     linker_warning(prog, "%s `%s' declared as type "
+                                    "`%s' and type `%s'\n",
+                                    gl_nir_mode_string(var),
+                                    var->name, glsl_get_type_name(var->type),
+                                    glsl_get_type_name(existing->type));
+
+                  } else {
+                     linker_error(prog, "%s `%s' declared as type "
+                                    "`%s' and type `%s'\n",
+                                    gl_nir_mode_string(var),
+                                    var->name, glsl_get_type_name(var->type),
+                                    glsl_get_type_name(existing->type));
+                     return;
+                  }
               }
            }
         }
--- a/src/compiler/glsl/glsl_parser_extras.cpp
+++ b/src/compiler/glsl/glsl_parser_extras.cpp
@ -329,6 +329,8 @@ _mesa_glsl_parse_state::_mesa_glsl_parse_state(struct gl_context *_ctx,
      ctx->Const.AllowVertexTextureBias;
   this->allow_glsl_120_subset_in_110 =
      ctx->Const.AllowGLSL120SubsetIn110;
+   this->allow_glsl_embedded_structure_declarations =
+      ctx->Const.AllowGLSLEmbeddedStructureDeclarations;
   this->allow_builtin_variable_redeclaration =
      ctx->Const.AllowGLSLBuiltinVariableRedeclaration;
   this->ignore_write_to_readonly_var =
--- a/src/compiler/glsl/glsl_parser_extras.h
+++ b/src/compiler/glsl/glsl_parser_extras.h
@ -1023,6 +1023,7 @@ struct _mesa_glsl_parse_state {
   char *alias_shader_extension;
   bool allow_vertex_texture_bias;
   bool allow_glsl_120_subset_in_110;
+   bool allow_glsl_embedded_structure_declarations;
   bool allow_builtin_variable_redeclaration;
   bool ignore_write_to_readonly_var;

--- a/src/compiler/glsl_types.h
+++ b/src/compiler/glsl_types.h
@ -676,6 +676,14 @@ glsl_type_is_e5m2(const glsl_type *t)
   return t->base_type == GLSL_TYPE_FLOAT_E5M2;
 }

+static inline bool
+glsl_type_is_nonnative_float(const glsl_type *t)
+{
+   return t->base_type == GLSL_TYPE_BFLOAT16 ||
+          t->base_type == GLSL_TYPE_FLOAT_E4M3FN ||
+          t->base_type == GLSL_TYPE_FLOAT_E5M2;
+}
+
 static inline bool
 glsl_type_is_int_16_32_64(const glsl_type *t)
 {
--- a/src/compiler/nir/meson.build
+++ b/src/compiler/nir/meson.build
@ -416,10 +416,9 @@ if with_tests
    nir_opt_algebraic_pattern_tests += static_library(
      'nir_opt_algebraic_pattern_test_@0@'.format(i),
      nir_opt_algebraic_pattern_test_cpp,
-      cpp_args : [cpp_msvc_compat_args, msvc_bigobj],
      override_options: [msvc_designated_initializer],
      gnu_symbol_visibility : 'hidden',
-      cpp_args : '-DSUBSET=@0@'.format(i),
+      cpp_args : [cpp_msvc_compat_args, msvc_bigobj, '-DSUBSET=@0@'.format(i)],
      include_directories : [inc_include, inc_src],
      dependencies : [dep_thread, idep_gtest, idep_nir, idep_mesautil],
    )
--- a/src/compiler/nir/nir_clone.c
+++ b/src/compiler/nir/nir_clone.c
@ -484,6 +484,8 @@ clone_call(clone_state *state, const nir_call_instr *call)

   for (unsigned i = 0; i < ncall->num_params; i++)
      __clone_src(state, ncall, &ncall->params[i], &call->params[i]);
+   if (call->indirect_callee.ssa)
+      __clone_src(state, ncall, &ncall->indirect_callee, &call->indirect_callee);

   return ncall;
 }
--- a/src/compiler/nir/nir_conversion_builder.h
+++ b/src/compiler/nir/nir_conversion_builder.h
@ -24,6 +24,7 @@
 #ifndef NIR_CONVERSION_BUILDER_H
 #define NIR_CONVERSION_BUILDER_H

+#include "util/half_float.h"
 #include "util/u_math.h"
 #include "nir_builder.h"
 #include "nir_builtin_builder.h"
@ -162,6 +163,29 @@ nir_round_int_to_float(nir_builder *b, nir_def *src,
      }
      UNREACHABLE("unexpected rounding mode");
   } else {
+      /* For conversions to FP16 we need to clamp the input against the fp16
+       * max value when rounding towards zero or down. The reason for that is
+       * that for integer values outside of FP16 finite value range we could
+       * get Infinity, which would be incorrect rounding in those cases.
+       *
+       * Furthermore, we only need to do the clamping for integers of 32 bits
+       * or more, because the lowering below will already clamp 16 bit integers
+       * correctly.
+       *
+       * This isn't a problem for FP32 or FP64 floats as integers can't exceed
+       * the finite value ranges.
+       */
+      if (dest_bit_size == 16 && src->bit_size >= 32) {
+         switch (round) {
+         case nir_rounding_mode_rtz:
+         case nir_rounding_mode_rd:
+            src = nir_umin_imm(b, src, FP16_MAX_F);
+            break;
+         default:
+            break;
+         }
+      }
+
      nir_def *mantissa_bit_size = nir_imm_int(b, mantissa_bits);
      nir_def *msb = nir_imax(b, nir_ufind_msb(b, src), mantissa_bit_size);
      nir_def *bits_to_lose = nir_isub(b, msb, mantissa_bit_size);
@ -207,11 +231,6 @@ nir_alu_type_range_contains_type_range(nir_alu_type a, nir_alu_type b)
       a_bit_size > b_bit_size)
      return true;

-   /* 16-bit floats fit in 32-bit integers */
-   if (a_base_type == nir_type_int && a_bit_size >= 32 &&
-       b == nir_type_float16)
-      return true;
-
   /* All signed or unsigned ints can fit in float or above. A uint8 can fit
    * in a float16.
    */
@ -486,6 +505,15 @@ nir_convert_with_rounding(nir_builder *b,
   if (trivial_convert)
      return nir_type_convert(b, src, src_type, dest_type, round);

+   /* Nontrivial float conversions have special infinity handling when
+    * clamping, so we can't have fast math enabled.
+    */
+   unsigned old_fp_ctrl = b->fp_math_ctrl;
+
+   if (src_base_type == nir_type_float || dest_base_type == nir_type_float) {
+      b->fp_math_ctrl = nir_fp_no_fast_math;
+   }
+
   nir_def *dest = src;

   /* clamp the result into range */
@ -514,6 +542,7 @@ nir_convert_with_rounding(nir_builder *b,
   if (clamp_after_conversion)
      dest = nir_clamp_to_type_range(b, dest, dest_type, src, src_type, dest_type);

+   b->fp_math_ctrl = old_fp_ctrl;
   return dest;
 }

--- a/src/compiler/nir/nir_divergence_analysis.c
+++ b/src/compiler/nir/nir_divergence_analysis.c
@ -1021,6 +1021,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
   case nir_intrinsic_atest_pan:
   case nir_intrinsic_zs_emit_pan:
   case nir_intrinsic_load_return_param_amd:
+   case nir_intrinsic_load_local_invocation_index_intel:
      is_divergent = true;
      break;

--- a/src/compiler/nir/nir_fixup_is_exported.c
+++ b/src/compiler/nir/nir_fixup_is_exported.c
@ -48,6 +48,15 @@ nir_fixup_is_exported(nir_shader *nir)
   nir_foreach_function(func, nir) {
      if (_mesa_set_search(shadowed, func->name)) {
         func->is_exported = func->is_entrypoint;
+      } else {
+         /* Starting with LLVM-22 we don't see the wrappers anymore, so we
+          * can simply export every entrypoint.
+          *
+          * We could do an LLVM version check here, but making nir depend on
+          * LLVM in any way would be a mess, and this seems to work for both
+          * situations.
+          */
+         func->is_exported |= func->is_entrypoint;
      }

      if (func->name[0] == '_') {
--- a/src/compiler/nir/nir_functions.c
+++ b/src/compiler/nir/nir_functions.c
@ -22,10 +22,10 @@
 */

 #include "util/u_printf.h"
+#include "util/stack_array.h"
 #include "nir.h"
 #include "nir_builder.h"
 #include "nir_control_flow.h"
-#include "nir_vla.h"

 /*
 * TODO: write a proper inliner for GPUs.
@ -240,12 +240,13 @@ inline_functions_pass(nir_builder *b,
    * to an SSA value first.
    */
   const unsigned num_params = call->num_params;
-   NIR_VLA(nir_def *, params, num_params);
+   STACK_ARRAY(nir_def *, params, num_params);
   for (unsigned i = 0; i < num_params; i++) {
      params[i] = call->params[i].ssa;
   }

   nir_inline_function_impl(b, call->callee->impl, params, NULL);
+   STACK_ARRAY_FINISH(params);
   return true;
 }

--- a/src/compiler/nir/nir_gather_info.c
+++ b/src/compiler/nir/nir_gather_info.c
@ -850,6 +850,23 @@ gather_intrinsic_info(nir_intrinsic_instr *instr, nir_shader *shader)
      shader->info.outputs_written |= BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
      break;

+   case nir_intrinsic_load_tile_pan:
+   case nir_intrinsic_load_tile_res_pan: {
+      const nir_io_semantics io = nir_intrinsic_io_semantics(instr);
+      shader->info.outputs_read |=
+         BITFIELD64_RANGE(io.location, io.num_slots);
+      break;
+   }
+
+   case nir_intrinsic_blend_pan:
+   case nir_intrinsic_blend2_pan:
+   case nir_intrinsic_store_tile_pan: {
+      const nir_io_semantics io = nir_intrinsic_io_semantics(instr);
+      shader->info.outputs_written |=
+         BITFIELD64_RANGE(io.location, io.num_slots);
+      break;
+   }
+
   case nir_intrinsic_demote_samples:
      shader->info.fs.uses_discard = true;
      break;
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@ -2633,6 +2633,9 @@ system_value("fs_msaa_intel", 1)
 # Per primitive remapping table offset.
 system_value("per_primitive_remap_intel", 1)

+# The (linear) local invocation index provided in the payload of mesh/task shaders.
+system_value("local_invocation_index_intel", 1)
+
 # Intrinsics for Intel bindless thread dispatch
 # BASE=brw_topoloy_id
 system_value("topology_id_intel", 1, indices=[BASE])
--- a/src/compiler/nir/nir_lower_bool_to_bitsize.c
+++ b/src/compiler/nir/nir_lower_bool_to_bitsize.c
@ -57,6 +57,32 @@ get_bool_convert_opcode(uint32_t dst_bit_size)
   }
 }

+static void
+resize_bool_alu_source(nir_builder *b, nir_alu_instr *alu,
+                       uint32_t src_idx, uint32_t bit_size)
+{
+   if (nir_src_bit_size(alu->src[src_idx].src) == bit_size)
+      return;
+
+   b->cursor = nir_before_instr(&alu->instr);
+   nir_op convert_op = get_bool_convert_opcode(bit_size);
+
+   /* Retain the number of components and swizzle of the original
+    * instruction so that we don’t unnecessarily create a vectorized
+    * instruction.
+    */
+   nir_def *new_src =
+      nir_build_alu1(b, convert_op, nir_ssa_for_alu_src(b, alu, src_idx));
+
+   nir_src_rewrite(&alu->src[src_idx].src, new_src);
+
+   /* The swizzle will have been handled by the conversion instruction
+    * so we can reset it back to the default
+    */
+   for (unsigned j = 0; j < NIR_MAX_VEC_COMPONENTS; j++)
+      alu->src[src_idx].swizzle[j] = j;
+}
+
 static void
 make_sources_canonical(nir_builder *b, nir_alu_instr *alu, uint32_t start_idx)
 {
@ -65,29 +91,8 @@ make_sources_canonical(nir_builder *b, nir_alu_instr *alu, uint32_t start_idx)
    */
   const nir_op_info *op_info = &nir_op_infos[alu->op];
   uint32_t bit_size = nir_src_bit_size(alu->src[start_idx].src);
-   for (uint32_t i = start_idx + 1; i < op_info->num_inputs; i++) {
-      if (nir_src_bit_size(alu->src[i].src) != bit_size) {
-         b->cursor = nir_before_instr(&alu->instr);
-         nir_op convert_op = get_bool_convert_opcode(bit_size);
-         nir_alu_instr *conv_instr = nir_alu_instr_create(b->shader, convert_op);
-         conv_instr->src[0].src = nir_src_for_ssa(alu->src[i].src.ssa);
-         /* Retain the write mask and swizzle of the original instruction so
-          * that we don’t unnecessarily create a vectorized instruction.
-          */
-         memcpy(conv_instr->src[0].swizzle,
-                alu->src[i].swizzle,
-                sizeof(conv_instr->src[0].swizzle));
-
-         nir_def *new_src = nir_builder_alu_instr_finish_and_insert(b, conv_instr);
-
-         nir_src_rewrite(&alu->src[i].src, new_src);
-         /* The swizzle will have been handled by the conversion instruction
-          * so we can reset it back to the default
-          */
-         for (unsigned j = 0; j < NIR_MAX_VEC_COMPONENTS; j++)
-            alu->src[i].swizzle[j] = j;
-      }
-   }
+   for (uint32_t i = start_idx + 1; i < op_info->num_inputs; i++)
+      resize_bool_alu_source(b, alu, i, bit_size);
 }

 static bool
@ -134,7 +139,9 @@ lower_alu_instr(nir_builder *b, nir_alu_instr *alu)
   case nir_op_bcsel:
      /* bcsel may be choosing between boolean sources too */
      if (alu->def.bit_size == 1)
-         make_sources_canonical(b, alu, 1);
+         make_sources_canonical(b, alu, 0);
+      else
+         resize_bool_alu_source(b, alu, 0, alu->def.bit_size);
      break;

   default:
--- a/src/compiler/nir/nir_opcodes.py
+++ b/src/compiler/nir/nir_opcodes.py
@ -696,7 +696,7 @@ if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
 binop("iadd", tint, _2src_commutative + associative, "(uint64_t)src0 + (uint64_t)src1")
 binop("iadd_sat", tint, _2src_commutative, """
      util_add_check_overflow({dest_type}, src0, src1) ?
-         (src1 < 0 ? u_intN_max(bit_size) : u_uintN_max(bit_size)) : (src0 + src1)
+         (src1 < 0 ? u_intN_min(bit_size) : u_intN_max(bit_size)) : (src0 + src1)
 """, "", True)
 binop("uadd_sat", tuint, _2src_commutative,
      "util_add_check_overflow({dest_type}, src0, src1) ? u_uintN_max(sizeof(src0) * 8) : (src0 + src1)",
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@ -783,8 +783,8 @@ optimizations.extend([

   (('bcsel(is_only_used_as_float)', ('feq', a, 'b(is_not_zero)'), b, a), a),
   (('bcsel(is_only_used_as_float)', ('fneu', a, 'b(is_not_zero)'), a, b), a),
-   (('bcsel', ignore_exact('feq', a, 0), 0, ('fsat', ('fmul', a, 'b(is_a_number)'))), ('fsat!', ('fmul', a, b))),
-   (('bcsel', ignore_exact('fneu', a, 0), ('fsat', ('fmul', a, 'b(is_a_number)')), 0), ('fsat!', ('fmul', a, b))),
+   (('bcsel', ignore_exact('feq', a, 0), 0, ('fsat', ('fmul', a, 'b(is_a_number)'))), ('!fsat', ('fmul', a, b))),
+   (('bcsel', ignore_exact('fneu', a, 0), ('fsat', ('fmul', a, 'b(is_a_number)')), 0), ('!fsat', ('fmul', a, b))),
   (('bcsel', ignore_exact('feq', a, 0), b, ('fadd', a, 'b(is_not_zero)')), ('fadd', a, b)),
   (('bcsel', ignore_exact('fneu', a, 0), ('fadd', a, 'b(is_not_zero)'), b), ('fadd', a, b)),

@ -2507,7 +2507,7 @@ optimizations.extend([
                             ('ior', ('ior', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)),
                             ('iadd', a, b),
                             0x7fffffffffffffff)),
-    '(options->lower_int64_options & nir_lower_iadd_sat64) != 0', TestStatus.XFAIL),
+    '(options->lower_int64_options & nir_lower_iadd_sat64) != 0'),

   # int64_t sum = a - b;
   #
@ -2936,7 +2936,7 @@ for bit_size in [8, 16, 32, 64]:
   optimizations += [
      (('iadd_sat@' + str(bit_size), a, b),
       ('bcsel', ('ige', b, 1), ('bcsel', ('ilt', ('iadd', a, b), a), intmax, ('iadd', a, b)),
-                                ('bcsel', ('ilt', a, ('iadd', a, b)), intmin, ('iadd', a, b))), 'options->lower_iadd_sat', TestStatus.XFAIL if bit_size in [8, 64] else TestStatus.PASS),
+                                ('bcsel', ('ilt', a, ('iadd', a, b)), intmin, ('iadd', a, b))), 'options->lower_iadd_sat'),
      (('isub_sat@' + str(bit_size), a, b),
       ('bcsel', ('ilt', b, 0), ('bcsel', ('ilt', ('isub', a, b), a), intmax, ('isub', a, b)),
                                ('bcsel', ('ilt', a, ('isub', a, b)), intmin, ('isub', a, b))), 'options->lower_iadd_sat'),
@ -3910,7 +3910,7 @@ late_optimizations.extend([

   # Putting this in 'optimizations' interferes with the bcsel(a, op(b, c),
   # op(b, d)) => op(b, bcsel(a, c, d)) transformations.  I do not know why.
-   (('bcsel', ('feq', ('fsqrt', 'a(is_not_negative)'), 0.0), intBitsToFloat(0x7f7fffff), ('frsq', a)),
+   (('bcsel@32', ('feq', ('fsqrt', 'a(is_a_number_not_negative)'), 0.0), intBitsToFloat(0x7f7fffff), ('frsq', a)),
    ('fmin', ('frsq', a), intBitsToFloat(0x7f7fffff))),

   # Things that look like DPH in the source shader may get expanded to
--- a/src/compiler/nir/nir_opt_load_store_vectorize.c
+++ b/src/compiler/nir/nir_opt_load_store_vectorize.c
@ -821,7 +821,7 @@ new_bitsize_acceptable(struct vectorize_ctx *ctx, unsigned new_bit_size,
   unsigned high_offset = get_offset_diff(low, high);

   /* This can cause issues when combining store data. */
-   if (high_offset % (new_bit_size / 8) != 0)
+   if (low->is_store && (high_offset % (new_bit_size / 8) != 0))
      return false;

   /* check nir_extract_bits limitations */
--- a/src/compiler/nir/nir_range_analysis.c
+++ b/src/compiler/nir/nir_range_analysis.c
@ -2197,6 +2197,7 @@ nir_unsigned_upper_bound(nir_shader *shader, struct hash_table *range_ht,

   push_scalar_query(&state, scalar);

+   _mesa_hash_table_set_deleted_key(range_ht, (void *)(uintptr_t)UINT32_MAX);
   return perform_analysis(&state);
 }

@ -2588,5 +2589,6 @@ nir_def_num_lsb_zero(struct hash_table *numlsb_ht, nir_scalar def)

   push_scalar_query(&state, def);

+   _mesa_hash_table_set_deleted_key(numlsb_ht, (void *)(uintptr_t)UINT32_MAX);
   return perform_analysis(&state);
 }
--- a/src/compiler/spirv/spirv_to_nir.c
+++ b/src/compiler/spirv/spirv_to_nir.c
@ -27,7 +27,6 @@

 #include "glsl_types.h"
 #include "vtn_private.h"
-#include "nir/nir_vla.h"
 #include "nir/nir_control_flow.h"
 #include "nir/nir_constant_expressions.h"
 #include "nir/nir_deref.h"
@ -42,6 +41,7 @@
 #include "util/mesa-blake3.h"
 #include "util/bfloat.h"
 #include "util/float8.h"
+#include "util/stack_array.h"

 #include <stdio.h>

@ -1404,7 +1404,7 @@ vtn_type_get_nir_type(struct vtn_builder *b, struct vtn_type *type,
      case vtn_base_type_struct: {
         bool need_new_struct = false;
         const uint32_t num_fields = type->length;
-         NIR_VLA(struct glsl_struct_field, fields, num_fields);
+         STACK_ARRAY(struct glsl_struct_field, fields, num_fields);
         for (unsigned i = 0; i < num_fields; i++) {
            fields[i] = *glsl_get_struct_field_data(type->type, i);
            const struct glsl_type *field_nir_type =
@ -1414,20 +1414,25 @@ vtn_type_get_nir_type(struct vtn_builder *b, struct vtn_type *type,
               need_new_struct = true;
            }
         }
+
+         const struct glsl_type *result;
         if (need_new_struct) {
            if (glsl_type_is_interface(type->type)) {
-               return glsl_interface_type(fields, num_fields,
-                                          /* packing */ 0, false,
-                                          glsl_get_type_name(type->type));
+               result = glsl_interface_type(fields, num_fields,
+                                            /* packing */ 0, false,
+                                            glsl_get_type_name(type->type));
            } else {
-               return glsl_struct_type(fields, num_fields,
-                                       glsl_get_type_name(type->type),
-                                       glsl_struct_type_is_packed(type->type));
+               result = glsl_struct_type(fields, num_fields,
+                                         glsl_get_type_name(type->type),
+                                         glsl_struct_type_is_packed(type->type));
            }
         } else {
            /* No changes, just pass it on */
-            return type->type;
+            result = type->type;
         }
+
+         STACK_ARRAY_FINISH(fields);
+         return result;
      }

      case vtn_base_type_image:
@ -2073,7 +2078,7 @@ vtn_handle_type(struct vtn_builder *b, SpvOp opcode,
      val->type->offsets = vtn_alloc_array(b, unsigned, num_fields);
      val->type->packed = false;

-      NIR_VLA(struct glsl_struct_field, fields, count);
+      STACK_ARRAY(struct glsl_struct_field, fields, count);
      for (unsigned i = 0; i < num_fields; i++) {
         val->type->members[i] = vtn_get_type(b, w[i + 2]);
         const char *name = NULL;
@ -2129,6 +2134,8 @@ vtn_handle_type(struct vtn_builder *b, SpvOp opcode,
                                            name ? name : "struct",
                                            val->type->packed);
      }
+
+      STACK_ARRAY_FINISH(fields);
      break;
   }

@ -2858,60 +2865,66 @@ vtn_handle_constant(struct vtn_builder *b, SpvOp opcode,
      default: {
         bool swap;

-         const glsl_type *org_dst_type = val->type->type;
-         const glsl_type *org_src_type = org_dst_type;
+         const glsl_type *dst_type = val->type->type;

         const bool saturate = vtn_has_decoration(b, val, SpvDecorationSaturatedToLargestFloat8NormalConversionEXT);
         unsigned num_components = glsl_get_vector_elements(val->type->type);

         vtn_assert(count <= 7);

+         const unsigned src_count = count - 4;
+         struct vtn_value *src_val[3] = {0};
+         const glsl_type *src_type[3] = {0};
+
+         for (unsigned i = 0; i < src_count; i++) {
+            src_val[i] = vtn_value(b, w[4 + i], vtn_value_type_constant);
+            src_type[i] = src_val[i]->type->type;
+         }
+
+         unsigned conv_src_bit_size;
         switch (opcode) {
+         case SpvOpConvertFToU:
+         case SpvOpConvertFToS:
+         case SpvOpConvertSToF:
+         case SpvOpConvertUToF:
         case SpvOpSConvert:
         case SpvOpFConvert:
         case SpvOpUConvert:
            /* We have a different source type in a conversion. */
-            org_src_type = vtn_get_value_type(b, w[4])->type;
+            conv_src_bit_size =
+               glsl_type_is_nonnative_float(src_type[0]) ? 32 : glsl_get_bit_size(src_type[0]);
            break;
         default:
+            /* When picking ALU ops, bit-size is only used for Convert
+             * operations.
+             */
+            conv_src_bit_size = 0;
            break;
         };

-         const glsl_type *dst_type = org_dst_type;
-         if (glsl_type_is_bfloat_16(dst_type) || glsl_type_is_e4m3fn(dst_type) || glsl_type_is_e5m2(dst_type))
-            dst_type = glsl_float_type();
-
-         const glsl_type *src_type = org_src_type;
-         if (glsl_type_is_bfloat_16(src_type) || glsl_type_is_e4m3fn(src_type) || glsl_type_is_e5m2(src_type))
-            src_type = glsl_float_type();
+         const unsigned dst_bit_size =
+            glsl_type_is_nonnative_float(dst_type) ? 32 : glsl_get_bit_size(dst_type);

         bool exact;
         nir_op op = vtn_nir_alu_op_for_spirv_opcode(b, opcode, &swap, &exact,
-                                                     src_type, dst_type);
+                                                     conv_src_bit_size, dst_bit_size);

         /* No SPIR-V opcodes handled through this path should set exact.
          * Since it is ignored, assert on it.
          */
         assert(!exact);

-         unsigned bit_size = glsl_get_bit_size(dst_type);
+         unsigned resolved_bit_size = dst_bit_size;
+
         nir_const_value src[3][NIR_MAX_VEC_COMPONENTS];

-         for (unsigned i = 0; i < count - 4; i++) {
-            struct vtn_value *src_val =
-               vtn_value(b, w[4 + i], vtn_value_type_constant);
-
+         for (unsigned i = 0; i < src_count; i++) {
            /* If this is an unsized source, pull the bit size from the
             * source; otherwise, we'll use the bit size from the destination.
             */
            if (!nir_alu_type_get_type_size(nir_op_infos[op].input_types[i])) {
-               if (org_src_type != src_type) {
-                  /* Small float conversion. */
-                  assert(i == 0);
-                  bit_size = glsl_get_bit_size(src_type);
-               } else {
-                  bit_size = glsl_get_bit_size(src_val->type->type);
-               }
+               resolved_bit_size = glsl_type_is_nonnative_float(src_type[i]) ?
+                  32 : glsl_get_bit_size(src_type[i]);
            }

            unsigned src_comps = nir_op_infos[op].input_sizes[i] ?
@ -2920,53 +2933,55 @@ vtn_handle_constant(struct vtn_builder *b, SpvOp opcode,

            unsigned j = swap ? 1 - i : i;
            for (unsigned c = 0; c < src_comps; c++) {
-               src[j][c] = src_val->constant->values[c];
-               if (glsl_type_is_bfloat_16(org_src_type))
+               src[j][c] = src_val[i]->constant->values[c];
+               if (glsl_type_is_bfloat_16(src_type[i]))
                  src[j][c].f32 = _mesa_bfloat16_bits_to_float(src[j][c].u16);
-               else if (glsl_type_is_e4m3fn(org_src_type))
+               else if (glsl_type_is_e4m3fn(src_type[i]))
                  src[j][c].f32 = _mesa_e4m3fn_to_float(src[j][c].u8);
-               else if (glsl_type_is_e5m2(org_src_type))
+               else if (glsl_type_is_e5m2(src_type[i]))
                  src[j][c].f32 = _mesa_e5m2_to_float(src[j][c].u8);
            }
-         }

-         /* fix up fixed size sources */
-         switch (op) {
-         case nir_op_ishl:
-         case nir_op_ishr:
-         case nir_op_ushr: {
-            if (bit_size == 32)
-               break;
-            for (unsigned i = 0; i < num_components; ++i) {
-               switch (bit_size) {
-               case 64: src[1][i].u32 = src[1][i].u64; break;
-               case 16: src[1][i].u32 = src[1][i].u16; break;
-               case  8: src[1][i].u32 = src[1][i].u8;  break;
+            /* Fix up source to respect NIR expected sizes. */
+            switch (op) {
+            case nir_op_ishl:
+            case nir_op_ishr:
+            case nir_op_ushr: {
+               /* Shift amount in NIR ops must be 32-bit. */
+               vtn_assert(!swap);
+               const unsigned shift_idx = 1;
+               const unsigned shift_bit_size = glsl_get_bit_size(src_type[i]);
+               if (i != shift_idx || shift_bit_size == 32)
+                  break;
+               for (unsigned c = 0; c < src_comps; c++) {
+                  nir_const_value *shift = &src[shift_idx][c];
+                  *shift = nir_const_value_for_uint(
+                        nir_const_value_as_uint(*shift, shift_bit_size), 32);
               }
+               break;
+            }
+            default:
+               break;
            }
-            break;
-         }
-         default:
-            break;
         }

         nir_const_value *srcs[3] = {
            src[0], src[1], src[2],
         };
         nir_eval_const_opcode(op, val->constant->values, NULL,
-                               num_components, bit_size, srcs,
+                               num_components, resolved_bit_size, srcs,
                               b->shader->info.float_controls_execution_mode);

         for (int i = 0; i < num_components; i++) {
            uint16_t conv;
-            if (glsl_type_is_bfloat_16(org_dst_type)) {
+            if (glsl_type_is_bfloat_16(dst_type)) {
               conv = _mesa_float_to_bfloat16_bits_rte(val->constant->values[i].f32);
-            } else if (glsl_type_is_e4m3fn(org_dst_type)) {
+            } else if (glsl_type_is_e4m3fn(dst_type)) {
               if (saturate)
                  conv = _mesa_float_to_e4m3fn_sat(val->constant->values[i].f32);
               else
                  conv = _mesa_float_to_e4m3fn(val->constant->values[i].f32);
-            } else if (glsl_type_is_e5m2(org_dst_type)) {
+            } else if (glsl_type_is_e5m2(dst_type)) {
               if (saturate)
                  conv = _mesa_float_to_e5m2_sat(val->constant->values[i].f32);
               else
@ -2975,7 +2990,7 @@ vtn_handle_constant(struct vtn_builder *b, SpvOp opcode,
               continue;
            }

-            val->constant->values[i] = nir_const_value_for_raw_uint(conv, glsl_get_bit_size(org_dst_type));
+            val->constant->values[i] = nir_const_value_for_raw_uint(conv, glsl_get_bit_size(dst_type));
         }

         break;
@ -5248,6 +5263,12 @@ vtn_handle_entry_point(struct vtn_builder *b, const uint32_t *w,
   b->interface_ids = vtn_alloc_array(b, uint32_t, b->interface_ids_count);
   memcpy(b->interface_ids, &w[start], b->interface_ids_count * 4);
   qsort(b->interface_ids, b->interface_ids_count, 4, cmp_uint32_t);
+
+   if (stage == MESA_SHADER_KERNEL) {
+      b->fp_math_ctrl_fp16 |= nir_fp_preserve_sz_inf_nan;
+      b->fp_math_ctrl_fp32 |= nir_fp_preserve_sz_inf_nan;
+      b->fp_math_ctrl_fp64 |= nir_fp_preserve_sz_inf_nan;
+   }
 }

 static bool
--- a/src/compiler/spirv/vtn_alu.c
+++ b/src/compiler/spirv/vtn_alu.c
@ -280,12 +280,9 @@ vtn_convert_op_dst_type(SpvOp opcode)
 nir_op
 vtn_nir_alu_op_for_spirv_opcode(struct vtn_builder *b,
                                SpvOp opcode, bool *swap, bool *exact,
-                                const glsl_type *src_type,
-                                const glsl_type *dst_type)
+                                unsigned conv_src_bit_size,
+                                unsigned conv_dst_bit_size)
 {
-   const unsigned src_bit_size = glsl_get_bit_size(src_type);
-   const unsigned dst_bit_size = glsl_get_bit_size(dst_type);
-
   /* Indicates that the first two arguments should be swapped.  This is
    * used for implementing greater-than and less-than-or-equal.
    */
@ -382,8 +379,12 @@ vtn_nir_alu_op_for_spirv_opcode(struct vtn_builder *b,
   case SpvOpConvertUToF:
   case SpvOpSConvert:
   case SpvOpFConvert: {
-      nir_alu_type src_type = vtn_convert_op_src_type(opcode) | src_bit_size;
-      nir_alu_type dst_type = vtn_convert_op_dst_type(opcode) | dst_bit_size;
+      vtn_fail_if(conv_src_bit_size == 0,
+                  "Need src bit_size to translate from SPIR-V convert opcodes to NIR.");
+      vtn_fail_if(conv_dst_bit_size == 0,
+                  "Need dst bit_size to translate from SPIR-V convert opcodes to NIR.");
+      nir_alu_type src_type = vtn_convert_op_src_type(opcode) | conv_src_bit_size;
+      nir_alu_type dst_type = vtn_convert_op_dst_type(opcode) | conv_dst_bit_size;
      return nir_type_conversion_op(src_type, dst_type, nir_rounding_mode_undef);
   }

@ -909,8 +910,7 @@ vtn_handle_alu(struct vtn_builder *b, SpvOp opcode,
      bool swap;
      bool unused_exact;
      nir_op op = vtn_nir_alu_op_for_spirv_opcode(b, opcode, &swap,
-                                                  &unused_exact,
-                                                  vtn_src[0]->type, dest_type);
+                                                  &unused_exact, 0, 0);

      if (swap) {
         nir_def *tmp = src[0];
@ -986,8 +986,7 @@ vtn_handle_alu(struct vtn_builder *b, SpvOp opcode,
   case SpvOpShiftRightLogical: {
      bool swap;
      bool exact;
-      nir_op op = vtn_nir_alu_op_for_spirv_opcode(b, opcode, &swap, &exact,
-                                                  vtn_src[0]->type, dest_type);
+      nir_op op = vtn_nir_alu_op_for_spirv_opcode(b, opcode, &swap, &exact, 0, 0);

      assert(!exact);

@ -1046,7 +1045,8 @@ vtn_handle_alu(struct vtn_builder *b, SpvOp opcode,
      bool exact;
      nir_op op = vtn_nir_alu_op_for_spirv_opcode(b, opcode, &swap,
                                                  &exact,
-                                                  vtn_src[0]->type, dest_type);
+                                                  glsl_get_bit_size(vtn_src[0]->type),
+                                                  glsl_get_bit_size(dest_type));

      if (swap) {
         nir_def *tmp = src[0];
--- a/src/compiler/spirv/vtn_cmat.c
+++ b/src/compiler/spirv/vtn_cmat.c
@ -320,9 +320,7 @@ vtn_handle_cooperative_alu(struct vtn_builder *b, struct vtn_value *dest_val,
         nir_deref_instr *src = vtn_get_cmat_deref(b, w[3]);

         bool ignored = false;
-         nir_op op = vtn_nir_alu_op_for_spirv_opcode(b, opcode, &ignored, &ignored,
-                                                     glsl_get_cmat_element(src->type),
-                                                     glsl_get_cmat_element(dst_type->type));
+         nir_op op = vtn_nir_alu_op_for_spirv_opcode(b, opcode, &ignored, &ignored, 0, 0);

         nir_deref_instr *dst = vtn_create_cmat_temporary(b, dst_type->type, "cmat_unary");
         nir_cmat_unary_op(&b->nb, &dst->def, &src->def,
@ -346,9 +344,7 @@ vtn_handle_cooperative_alu(struct vtn_builder *b, struct vtn_value *dest_val,
         nir_deref_instr *mat_a = vtn_get_cmat_deref(b, w[3]);
         nir_deref_instr *mat_b = vtn_get_cmat_deref(b, w[4]);

-         nir_op op = vtn_nir_alu_op_for_spirv_opcode(b, opcode, &ignored, &ignored,
-                                                     glsl_get_cmat_element(mat_a->type),
-                                                     glsl_get_cmat_element(dst_type->type));
+         nir_op op = vtn_nir_alu_op_for_spirv_opcode(b, opcode, &ignored, &ignored, 0, 0);

         nir_deref_instr *dst = vtn_create_cmat_temporary(b, dst_type->type, "cmat_binary");
         nir_cmat_binary_op(&b->nb, &dst->def, &mat_a->def, &mat_b->def,
--- a/src/compiler/spirv/vtn_opencl.c
+++ b/src/compiler/spirv/vtn_opencl.c
@ -725,6 +725,13 @@ handle_special(struct vtn_builder *b, uint32_t opcode,
   if (!ret)
      vtn_fail("No NIR equivalent");

+   /* libclc's cbrt() implementation fails to flush subnormal numbers to zero
+    * even when flush-to-zero is required. Manually flush its output.
+    */
+   if (opcode == OpenCLstd_Cbrt) {
+      ret = nir_fcanonicalize(nb, ret);
+   }
+
   return ret;
 }

--- a/Show more
+++ b/Show more