VERSION: bump for 25.3.1

Signed-off-by: Dylan Baker <dylan.c.baker@intel.com>
docs: add release notes for 25.3.1
2025-12-20 05:10:11 +01:00 · 2025-12-03 22:02:48 -08:00 · 2025-12-03 22:02:09 -08:00 · 2025-12-03 15:02:48 -08:00 · 2025-12-03 22:24:03 +00:00 · 2025-12-03 12:31:36 -08:00
298 changed files with 28402 additions and 2825 deletions
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@ -41,7 +41,7 @@ workflow:
        FDO_RUNNER_JOB_PRIORITY_TAG_X86_64: priority:high
        FDO_RUNNER_JOB_PRIORITY_TAG_X86_64_KVM: priority:high-kvm
        FDO_RUNNER_JOB_PRIORITY_TAG_AARCH64: priority:high-aarch64
-        CI_TRON_JOB_PRIORITY_TAG: ""  # Empty tags are ignored by gitlab
+        CI_TRON_JOB_PRIORITY: high
        JOB_PRIORITY: 75
        # fast-fail in merge pipelines: stop early if we get this many unexpected fails/crashes
        DEQP_RUNNER_MAX_FAILS: 40
@ -61,6 +61,7 @@ workflow:
        FDO_RUNNER_JOB_PRIORITY_TAG_X86_64: priority:low
        FDO_RUNNER_JOB_PRIORITY_TAG_X86_64_KVM: priority:low-kvm
        FDO_RUNNER_JOB_PRIORITY_TAG_AARCH64: priority:low-aarch64
+        CI_TRON_JOB_PRIORITY: low
        JOB_PRIORITY: 45
        # (some) nightly builds perform LTO, so they take much longer than the
        # short timeout allowed in other pipelines.
@ -123,7 +124,7 @@ variables:
  FDO_RUNNER_JOB_PRIORITY_TAG_X86_64: ""  # Empty tags are ignored by gitlab
  FDO_RUNNER_JOB_PRIORITY_TAG_X86_64_KVM: kvm
  FDO_RUNNER_JOB_PRIORITY_TAG_AARCH64: aarch64
-  CI_TRON_JOB_PRIORITY_TAG: ci-tron:priority:low
+  CI_TRON_JOB_PRIORITY: default
  JOB_PRIORITY: 50
  DATA_STORAGE_PATH: data_storage
  KERNEL_IMAGE_BASE: "https://$S3_HOST/$S3_KERNEL_BUCKET/$KERNEL_REPO/$KERNEL_TAG"
@ -422,4 +423,3 @@ sanity:
    when: on_failure
    reports:
      junit: check-*.xml
-
--- a/.gitlab-ci/ci-tron/gitlab-ci.yml
+++ b/.gitlab-ci/ci-tron/gitlab-ci.yml
@ -40,6 +40,7 @@
    - !reference [.required-for-hardware-jobs, needs]
  tags:
    - farm:$RUNNER_FARM_LOCATION
+    - ci-tron:priority:$CI_TRON_JOB_PRIORITY
    - $CI_TRON_DUT_SETUP_TAGS

  # Override the default before_script, as it is not compatible with the CI-tron environment. We just keep the clearing
--- a/.gitlab-ci/farm-rules.yml
+++ b/.gitlab-ci/farm-rules.yml
@ -34,7 +34,6 @@
 #   anholt           | (decommissioned)                 | @anholt
 #   austriancoder    | ci-tron                          | @austriancoder
 #   collabora        | lava                             | @daniels, @sergi
-#   google-freedreno | none (moving to LAVA)            | @daniels, @sergi
 #   igalia           | baremetal/poe-powered, ci-tron   | @jasuarez, @chema
 #   lima             | lava                             | @enunes
 #   microsoft        | custom                           | @jenatali, @alatiera
@ -293,15 +292,6 @@
    - !reference [.pengutronix-farm-rules, rules]


-# Temporary placeholder as the devices move across to LAVA.
-.google-freedreno-farm-rules:
-  rules:
-    - when: never
-
-.google-freedreno-farm-manual-rules:
-  rules:
-    - when: never
-
 # Skip container & build jobs when disabling any farm, and run them if any
 # farm gets re-enabled.
 # Only apply these rules in MR context, because otherwise we get a false
--- a/.gitlab-ci/report-flakes.py
+++ b/.gitlab-ci/report-flakes.py
@ -118,7 +118,6 @@ def main():
    # before we make it to 9-digit jobs (we're at 7 so far).
    nick = args.runner
    nick = nick.replace('mesa-', '')
-    nick = nick.replace('google-freedreno-', '')
    nick += f'-{args.job}'
    irc.send_line(f"NICK {nick}")
    irc.send_line(f"USER {nick} unused unused: Gitlab CI Notifier")
--- a/.gitlab-ci/test-source-dep.yml
+++ b/.gitlab-ci/test-source-dep.yml
@ -60,6 +60,8 @@
        - subprojects/**/*
        - .gitattributes
        - src/*
+        - src/android_stub/**/*
+        - src/c11/**/*
        - src/compiler/**/*
        - src/drm-shim/**/*
        - src/gtest/**/*
--- a/.gitlab-ci/windows/mesa_deps_packages.ps1
+++ b/.gitlab-ci/windows/mesa_deps_packages.ps1
@ -36,22 +36,23 @@ $MACHINE_PATH=[System.Environment]::GetEnvironmentVariable('PATH', [System.Envir
 Write-Output "Before winget install USER_PATH:$USER_PATH MACHINE_PATH:$MACHINE_PATH"

 $Packages = @(
-  'Microsoft.WindowsWDK.10.0.26100',
-  'Python.Python.3.13',
-  'Ninja-build.Ninja',
-  'Kitware.CMake',
-  'Git.Git',
-  'WinFlexBison.win_flex_bison',
-  'bloodrock.pkg-config-lite'
+  'Microsoft.WindowsWDK.10.0.26100,10.1.26100.6584',
+  'Python.Python.3.13,3.13.9',
+  'Ninja-build.Ninja,1.13.1',
+  'Kitware.CMake,4.1.3',
+  'Git.Git,2.52.0',
+  'WinFlexBison.win_flex_bison,2.5.24',
+  'bloodrock.pkg-config-lite,0.28-1'
 )

 $ProgressPreference = "SilentlyContinue"
 New-Item -Force -ItemType 'directory' -Name 'flexbison' -Path 'C:\temp'
 foreach ($package in $Packages)
 {
-  Write-Output "Installing $package with winget"
+  $package_id, $package_version = $package -split ',', 2
+  Write-Output "Installing $package_id with version $package_version by winget"
  For ($i = 0; $i -lt 5; $i++) {
-    winget install --verbose --silent --accept-package-agreements --source winget --exact --id $package --log C:\temp\wdk-install.log
+    winget install --verbose --silent --accept-package-agreements --source winget --exact --id $package_id --version $package_version --log C:\temp\wdk-install.log
    $packages_installed = $?
    if ($packages_installed) {
      Break
--- a/.pick_status.json
+++ b/.pick_status.json
--- a/2
+++ b/2
@ -1 +1 @@
-25.3.0-devel
+25.3.1
--- a/docs/ci/LAVA.rst
+++ b/docs/ci/LAVA.rst
@ -122,9 +122,8 @@ Enable the site and restart nginx:
   # Second download should be cached.
   wget http://localhost/cache/?uri=https://s3.freedesktop.org/mesa-tracie-public/itoral-gl-terrain-demo/demo-v2.trace

-Now, set ``download-url`` in your ``traces-*.yml`` entry to something like
-``http://caching-proxy/cache/?uri=https://s3.freedesktop.org/mesa-tracie-public``
-and you should have cached downloads for traces.  Add it to
-``FDO_HTTP_CACHE_URI=`` in your ``config.toml`` runner environment lines and you
-can use it for cached artifact downloads instead of going all the way to
-freedesktop.org on each job.
+The trace runner script automatically sets the caching proxy, so there's no
+need to modify anything in the Mesa CI YAML files.
+Add ``LAVA_HTTP_CACHE_URI=http://localhost/cache/?uri=`` to your ``config.toml``
+runner environment lines and you can use it for cached artifact downloads
+instead of going all the way to freedesktop.org on each job.
--- a/docs/relnotes.rst
+++ b/docs/relnotes.rst
@ -3,6 +3,8 @@ Release Notes

 The release notes summarize what's new or changed in each Mesa release.

+-  :doc:`25.3.1 release notes <relnotes/25.3.1>`
+-  :doc:`25.3.0 release notes <relnotes/25.3.0>`
 -  :doc:`25.2.5 release notes <relnotes/25.2.5>`
 -  :doc:`25.2.4 release notes <relnotes/25.2.4>`
 -  :doc:`25.2.3 release notes <relnotes/25.2.3>`
@ -466,6 +468,8 @@ The release notes summarize what's new or changed in each Mesa release.
   :maxdepth: 1
   :hidden:

+   25.3.1 <relnotes/25.3.1>
+   25.3.0 <relnotes/25.3.0>
   25.2.5 <relnotes/25.2.5>
   25.2.4 <relnotes/25.2.4>
   25.2.3 <relnotes/25.2.3>
--- a/docs/relnotes/25.3.0.rst
+++ b/docs/relnotes/25.3.0.rst
--- a/docs/relnotes/25.3.1.rst
+++ b/docs/relnotes/25.3.1.rst
@ -0,0 +1,278 @@
+Mesa 25.3.1 Release Notes / 2025-12-03
+======================================
+
+Mesa 25.3.1 is a bug fix release which fixes bugs found since the 25.3.0 release.
+
+Mesa 25.3.1 implements the OpenGL 4.6 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.6. OpenGL
+4.6 is **only** available if requested at context creation.
+Compatibility contexts may report a lower version depending on each driver.
+
+Mesa 25.3.1 implements the Vulkan 1.4 API, but the version reported by
+the apiVersion property of the VkPhysicalDeviceProperties struct
+depends on the particular driver being used.
+
+SHA checksums
+-------------
+
+::
+
+    TBD.
+
+
+New features
+------------
+
+- None
+
+
+Bug fixes
+---------
+
+- venus: synchronization tests sometimes get stuck in semaphore/fence wait
+- [ANV][LNL] - Detroit: Become Human (1222140) - Flickering horizontal artifacts across the screen
+- [ANV][LNL] - Eternal Strands (1491410) - Colorful graphical aberrations are present whenever a 3D asset is visible.
+- [ANV][DG2/LNL] SolarBay extreme RT regression
+- After 25.3 update some app windows became glitchy on uhd 620
+- FurMark gets glitchy graphics when using Vulkan API on UHD 620 (mesa 25.2.6 and 26.0)
+- tu: resource leak
+- Regression: MSVC fails to build 32 bit binaries
+- Zink on Android: failed to create dri2 screen
+- Amnesia: The Bunker (2023) OpenGL graphics glitch on Intel graphics
+- freedreno, tu: resource leak
+- meson: When building radeonsi without llvm, it fails without setting amd_with_llvm to false explicitly
+- loader.c:156:14: error: call to undeclared function 'drmCommandWriteRead'
+- radv: RB+ for depth-only is broken with unused color attachments
+- win_bison random failure extern_stdin:40: ERROR: end of file in string
+- Texture matrix stack pops do not seem to always update the texture matrix
+- Polaris, amdgpu: Application using VCE wedges GPU
+- radv vulkan video encode does not process used_by_curr_pic_lt_flag correctly
+- [BMG] Metro Exodus Enhanced Edition (1449560) - Crash
+- venus: random failures in dEQP.api.info.image_format_properties2.1d
+
+
+Changes
+-------
+
+Aitor Camacho (2):
+
+- vulkan/cmd_queue: Use vk_strdup and free allocated string memory
+- vulkan/wsi: Fix double destroy of present_id_timeline at swapchain create
+
+Alyssa Rosenzweig (1):
+
+- nir/sweep: fix use-after-free with dominance LCA
+
+Benjamin Cheng (4):
+
+- radv/video: Align each layer of encode DPB to 256
+- radv/video: Fix num_ref_idx_l{0,1} related overrides
+- radv/video: Fix H264/H265 reference selection
+- radv/video: Override direct_spatial_mv_pred to 1
+
+Calder Young (3):
+
+- brw: fix SIMD lowering of fp16 sampler message data with multiple components
+- anv: Fix ray query shadow stack buffer size
+- intel: Fix calculation of max_scratch_ids on fused devices
+
+Christoph Pillmayer (1):
+
+- nir: Fix preserved metadata in sort_unstructured_blocks
+
+Connor Abbott (1):
+
+- tu: Handle case where pipeline writes unused color attachments
+
+Daniel Schürmann (1):
+
+- nir/opt_large_constants: Fix dead deref instructions accessing lowered variables
+
+Dave Airlie (1):
+
+- dozen: return INCOMPATIBLE_DRIVER on instance create failure
+
+David Rosca (7):
+
+- vulkan/video: Fix coding AV1 seq_choose_screen_content_tools = 1
+- radv/video: Fix coding allow_screen_content_tools and force_integer_mv
+- radv/video: Fix coding used_by_curr_pic_lt_flag
+- radeonsi/vce: Add workaround for unaligned input surface
+- radeonsi/vcn: Reduce allocated size for pre-encode recon pics
+- radeonsi/vcn: Fix maybe uninitialized warning
+- radv/video: Fix AV1 quantization map maxQIndexDelta value
+
+Dylan Baker (9):
+
+- docs: Add sha sums for 25.3.0
+- .pick_status.json: Update to 018f45f9812cb5b728e2eb32a5b350efdd9ac90f
+- .pick_status.json: Update to 7a3bfd1f7913819db315e6db8b42520a2d862690
+- .pick_status.json: Update to 7c193ffef1a8316b4b666c9c2ec5afdd2035dd89
+- .pick_status.json: Update to bcedc88f2142d9a2b277c7b848b744e13d426f76
+- .pick_status.json: Update to 076a3834374b2ccd43dd1c7ce73f44795d601300
+- .pick_status.json: Update to a71b4a4b954dcfc1667dc1b36f1f25ac02d3469b
+- .pick_status.json: Update to ac37885fc85203c08f09345c637b6a21f642ed66
+- docs/relnotes/25.3.0: Escape some rst language constructs
+
+Emma Anholt (1):
+
+- tu: Fix leak of compute shader pipeline->base.executables_mem_ctx;
+
+Eric Engestrom (3):
+
+- meson: auto-disable \`amd-use-llvm` when \`llvm=disabled`
+- meson: auto-disable \`draw-use-llvm` when \`llvm=disabled`
+- ci: use $CI_TRON_JOB_PRIORITY tag on all ci-tron jobs
+
+Eric R. Smith (1):
+
+- pan: fix a bifrost disassembly assert failure
+
+Erik Faye-Lund (4):
+
+- pan/kmod: fix priority query logic
+- panfrost: remove stale code
+- mesa/main: correct formatquery error-handling
+- mesa/st: do not enable EXT_texture_buffer_object with rgba only
+
+Faith Ekstrand (1):
+
+- vulkan/drm-syncobj: Stop returning early waiting for sync files
+
+Gil Pedersen (1):
+
+- intel: Add PIPE_FORMAT_R10G10B10X2_UNORM support
+
+Gurchetan Singh (1):
+
+- util: fix arithmetic on a pointer to void warning
+
+Hyunjun Ko (1):
+
+- vulkan/video: Fix H.265 long-term reference handling
+
+Ian Forbes (1):
+
+- svga: Check if Stencil buffer is NULL
+
+Ian Romanick (2):
+
+- brw: Force allow_spilling when spill_all is set
+- lavapipe: fp16 flrp must also be lowered
+
+Icenowy Zheng (1):
+
+- pvr: enable samplerMirrorClampToEdge feature
+
+Karol Herbst (2):
+
+- rusticl/kernel: fix clGetKernelSuggestedLocalWorkSizeKHR implementation
+- rusticl/kernel: Do not run kernels with a workgroup size beyond work_dim
+
+Lionel Landwerlin (15):
+
+- brw: fixup 64bit atomics emulation on 2D array images
+- brw: fix SIMD lowering of sampler messages with fp16 data
+- brw: fix workaround fence rlen field
+- anv: fixup load_ubo lowering
+- anv: ensure slab allocated memory matches image requirements
+- anv: consider 64bit atomics on similar formats with mutable images
+- anv: Wa_18040903259 only applies to RCS when in GPGPU mode
+- brw: compute final copy propagation resulting source
+- nir: fix lower_printf with no arguments
+- spirv: fix printf generation
+- nir/lower_printf: fix array alignment
+- nir/lower_printf: fix missing singleton add
+- vulkan/runtime: track dynamic descriptor offsets for RT pipelines
+- anv: fix broken ray tracing dynamic descriptors
+- anv: add 32-wide subgroup requirement heuristic
+
+Matt Turner (2):
+
+- meson: Fix sysprof-capture-4 dependency
+- meson: Let -Ddraw-use-llvm=false work for R300 on non-x86
+
+Mel Henning (5):
+
+- zink: Return zink_device in create_logical_device
+- zink: Make screen->queue_lock a pointer
+- zink: Create one queue lock per device
+- zink: Lock queue_lock in zink_destroy_screen
+- zink: Lock around screen_debug_marker_{begin,end}
+
+Natalie Vock (3):
+
+- aco/insert_nops: Consider s_setpc target susceptible to VALUReadSGPRHazard
+- radv/rt: Keep updated nodes always active
+- radv/rt: Correctly copy culling flags when updating to separate AS
+
+Olivia Lee (2):
+
+- panvk/csf: fix uninitialized read in draw context
+- panvk/csf: explicitly set ls_sb_slot in set_fbds_provoking_vertex
+
+Patrick Lerda (2):
+
+- r600: fix rv770 read scratch compatibility
+- r600: fix error filters compatibility
+
+Pierre-Eric Pelloux-Prayer (2):
+
+- radeonsi/sqtt: clear out sqtt bo on resize
+- mesa: fix function prototype
+
+Qiang Yu (1):
+
+- glsl: support barrier() for task and mesh shader
+
+Rob Clark (2):
+
+- freedreno/a6xx: Fix UB in convert_color()
+- freedreno: Fix internal VBO reference leak
+
+Samuel Pitoiset (2):
+
+- radv: fix RB+ for depth-only with unused attachments
+- radv: fix per-submit RGP captures on video queues
+
+Sushma Venkatesh Reddy (1):
+
+- drirc: Add anv_assume_full_subgroups for Detroit: Become Human
+
+Tapani Pälli (4):
+
+- drirc/iris: add drirc to disable threaded context
+- drirc: set intel_disable_threaded_context for Amnesia The Bunker
+- anv: add furmark workaround layer
+- anv: add vk_wsi_disable_unordered_submits and enable for GTK
+
+Timothy Arceri (2):
+
+- mesa: fix _mesa_update_texture_matrices()
+- util/driconf: Add linux version of Penumbra fixes
+
+Timur Kristóf (2):
+
+- radv: Disable sparse mapping when unsupported by VM
+- ac/gpu_info: Disable sparse VM mappings pre-Polaris, for now
+
+Valentine Burley (1):
+
+- egl: Disable kopper on Android
+
+Yiwei Zhang (4):
+
+- venus: use seq_cst for ring cs and tail update ordering
+- venus: avoid re-imported dma-buf to have a larger map size
+- venus: properly fix the blob mem mapping size
+- venus: fix racy semaphore feedback counter update
+
+Yonggang Luo (1):
+
+- ci/microsoft: Downgrading WinFlexBison.win_flex_bison to version 2.5.24
+
+Yurii Kolesnykov (1):
+
+- loader: Wrap nouveau_zink_predicate with HAVE_LIBDRM
--- a/docs/relnotes/new_features.txt
+++ b/docs/relnotes/new_features.txt
@ -1,89 +0,0 @@
-EGL_EXT_create_context_robustness support on Panfrost V10+
-GL_ARB_robust_buffer_access_behavior, GL_KHR_robust_buffer_access_behavior and GL_KHR_robustness support on Panfrost
-VK_EXT_mutable_descriptor_type on panvk/v9+
-GL_KHR_robustness on v3d
-VK_ARM_shader_core_builtins on panvk
-VK_KHR_shader_untyped_pointers on anv
-cl_ext_immutable_memory_objects
-VK_KHR_video_encode_intra_refresh on radv
-VK_KHR_video_encode_quantization_map on radv
-GL_ATI_meminfo and GL_NVX_gpu_memory_info on r300
-VK_KHR_shader_untyped_pointers on anv and RADV
-VK_KHR_maintenance8 on NVK
-VK_KHR_maintenance9 on NVK
-cl_khr_semaphore on radeonsi and zink
-cl_khr_external_semaphore on radeonsi and zink
-cl_khr_external_semaphore_sync_fd on radeonsi and zink
-GL_NV_shader_atomic_int64 on radeonsi and Panfrost V9+
-VK_KHR_maintenance7 on panvk/v10+
-VK_KHR_maintenance8 on panvk/v10+
-VK_KHR_maintenance9 on panvk
-VK_AMD_buffer_marker on NVK
-VK_EXT_ycbcr_2plane_444_formats on radv
-Removed VDPAU frontend
-GL_NV_representative_fragment_test on zink
-VK_KHR_maintenance9 on HoneyKrisp
-sparseBinding on panvk/v10+
-sparseResidencyBuffer on panvk/v10+
-Vulkan 1.2 on pvr
-VK_KHR_create_renderpass2 on pvr
-VK_KHR_dedicated_allocation on pvr
-VK_KHR_depth_stencil_resolve on pvr
-VK_KHR_descriptor_update_template on pvr
-VK_KHR_imageless_framebuffer on pvr
-VK_KHR_line_rasterization on pvr
-VK_KHR_maintenance1 on pvr
-VK_KHR_maintenance2 on pvr
-VK_KHR_maintenance3 on pvr
-VK_KHR_multiview on pvr
-VK_KHR_robustness2 on pvr
-VK_KHR_separate_depth_stencil_layouts on pvr
-VK_KHR_shader_draw_parameters on pvr
-VK_KHR_shader_float_controls on pvr
-VK_KHR_shader_subgroup_extended_types on pvr
-VK_KHR_spirv_1_4 on pvr
-VK_KHR_shader_terminate_invocation on pvr
-VK_KHR_swapchain_mutable_format on pvr
-VK_KHR_vertex_attribute_divisor on pvr
-VK_EXT_border_color_swizzle on pvr
-VK_EXT_color_write_enable on pvr
-VK_EXT_custom_border_color on pvr
-VK_EXT_depth_clamp_zero_one on pvr
-VK_EXT_depth_clip_enable on pvr
-VK_EXT_extended_dynamic_state on pvr
-VK_EXT_extended_dynamic_state2 on pvr
-VK_EXT_extended_dynamic_state3 on pvr
-VK_EXT_image_2d_view_of_3d on pvr
-VK_EXT_line_rasterization on pvr
-VK_EXT_physical_device_drm on pvr
-VK_EXT_provoking_vertex on pvr
-VK_EXT_robustness2 on pvr
-VK_EXT_queue_family_foreign on pvr
-VK_EXT_separate_stencil_usage on pvr
-VK_EXT_shader_demote_to_helper_invocation on pvr
-VK_EXT_vertex_attribute_divisor on pvr
-imageCubeArray on pvr
-independentBlend on pvr
-sampleRateShading on pvr
-logicOp on pvr
-drawIndirectFirstInstance on pvr
-alphaToOne on pvr
-samplerAnisotropy on pvr
-shaderStorageImageExtendedFormats on pvr
-shaderStorageImageReadWithoutFormat on pvr
-shaderStorageImageWriteWithoutFormat on pvr
-shaderClipDistance on pvr
-shaderCullDistance on pvr
-VK_EXT_zero_initialize_device_memory on pvr
-VK_KHR_sampler_mirror_clamp_to_edge on pvr
-VK_KHR_shader_non_semantic_info on pvr
-VK_KHR_shader_relaxed_extended_instruction on pvr
-VK_EXT_shader_replicated_composites on pvr
-VK_KHR_device_group_creation on pvr
-VK_KHR_map_memory2 on pvr
-VK_EXT_map_memory_placed on pvr
-VK_KHR_device_group on pvr
-VK_KHR_buffer_device_address on pvr
-GL_EXT_mesh_shader on zink
-VK_KHR_wayland_surface on pvr
-VK_NVX_image_view_handle on NVK
--- a/include/drm-uapi/amdgpu_drm.h
+++ b/include/drm-uapi/amdgpu_drm.h
@ -1489,8 +1489,6 @@ struct drm_amdgpu_info_hw_ip {
 	__u32  available_rings;
 	/** version info: bits 23:16 major, 15:8 minor, 7:0 revision */
 	__u32  ip_discovery_version;
-	/* Userq available slots */
-	__u32  userq_num_slots;
 };

 /* GFX metadata BO sizes and alignment info (in bytes) */
--- a/include/drm-uapi/drm_fourcc.h
+++ b/include/drm-uapi/drm_fourcc.h
@ -979,14 +979,20 @@ extern "C" {
 *               2 = Gob Height 8, Turing+ Page Kind mapping
 *               3 = Reserved for future use.
 *
- * 22:22 s     Sector layout.  On Tegra GPUs prior to Xavier, there is a further
- *             bit remapping step that occurs at an even lower level than the
- *             page kind and block linear swizzles.  This causes the layout of
- *             surfaces mapped in those SOC's GPUs to be incompatible with the
- *             equivalent mapping on other GPUs in the same system.
+ * 22:22 s     Sector layout.  There is a further bit remapping step that occurs
+ * 26:27       at an even lower level than the page kind and block linear
+ *             swizzles.  This causes the bit arrangement of surfaces in memory
+ *             to differ subtly, and prevents direct sharing of surfaces between
+ *             GPUs with different layouts.
 *
- *               0 = Tegra K1 - Tegra Parker/TX2 Layout.
- *               1 = Desktop GPU and Tegra Xavier+ Layout
+ *               0 = Tegra K1 - Tegra Parker/TX2 Layout
+ *               1 = Pre-GB20x, GB20x 32+ bpp, GB10, Tegra Xavier-Orin Layout
+ *               2 = GB20x(Blackwell 2)+ 8 bpp surface layout
+ *               3 = GB20x(Blackwell 2)+ 16 bpp surface layout
+ *               4 = Reserved for future use.
+ *               5 = Reserved for future use.
+ *               6 = Reserved for future use.
+ *               7 = Reserved for future use.
 *
 * 25:23 c     Lossless Framebuffer Compression type.
 *
@ -1001,7 +1007,7 @@ extern "C" {
 *               6 = Reserved for future use
 *               7 = Reserved for future use
 *
- * 55:25 -     Reserved for future use.  Must be zero.
+ * 55:28 -     Reserved for future use.  Must be zero.
 */
 #define DRM_FORMAT_MOD_NVIDIA_BLOCK_LINEAR_2D(c, s, g, k, h) \
 	fourcc_mod_code(NVIDIA, (0x10 | \
@ -1009,6 +1015,7 @@ extern "C" {
 				 (((k) & 0xff) << 12) | \
 				 (((g) & 0x3) << 20) | \
 				 (((s) & 0x1) << 22) | \
+				 (((s) & 0x6) << 25) | \
 				 (((c) & 0x7) << 23)))

 /* To grandfather in prior block linear format modifiers to the above layout,
@ -1017,7 +1024,7 @@ extern "C" {
 * which corresponds to the "generic" kind used for simple single-sample
 * uncompressed color formats on Fermi - Volta GPUs.
 */
-static __inline__ __u64
+static inline __u64
 drm_fourcc_canonicalize_nvidia_format_mod(__u64 modifier)
 {
 	if (!(modifier & 0x10) || (modifier & (0xff << 12)))
--- a/meson.build
+++ b/meson.build
@ -42,7 +42,7 @@ if get_option('layout') != 'mirror'
 endif

 with_llvm = get_option('llvm')
-amd_with_llvm = get_option('amd-use-llvm')
+amd_with_llvm = with_llvm.allowed() and get_option('amd-use-llvm')

 with_mesa_debug = get_option('buildtype') == 'debug'

@ -218,10 +218,11 @@ with_gallium_ethosu = gallium_drivers.contains('ethosu')
 foreach gallium_driver : gallium_drivers
  pre_args += '-DHAVE_@0@'.format(gallium_driver.to_upper())
 endforeach
+draw_with_llvm = with_llvm.allowed() and get_option('draw-use-llvm')
 with_llvm = with_llvm \
  .enable_if(with_gallium_i915, error_message : 'i915 Gallium driver requires LLVM for vertex shaders') \
  .enable_if(with_gallium_llvmpipe, error_message : 'LLVMPipe Gallium driver requires LLVM') \
-  .enable_if(with_gallium_r300, error_message : 'R300 Gallium driver requires LLVM for vertex shaders') \
+  .enable_if(with_gallium_r300 and draw_with_llvm, error_message : 'R300 Gallium driver requires LLVM for vertex shaders on IGP parts') \
  .enable_if(with_gallium_r600 and amd_with_llvm, error_message : 'R600 Gallium driver configured to require LLVM') \
  .enable_if(with_gallium_radeonsi and amd_with_llvm, error_message : 'RadeonSI Gallium driver configured to require LLVM')

@ -1744,7 +1745,6 @@ if with_clc
  # but we don't know what LLVM version we are using yet
  llvm_optional_modules += ['all-targets', 'windowsdriver', 'frontendhlsl', 'frontenddriver']
 endif
-draw_with_llvm = get_option('draw-use-llvm')
 if draw_with_llvm
  llvm_modules += 'native'
  # lto is needed with LLVM>=15, but we don't know what LLVM version we are using yet
@ -1788,8 +1788,12 @@ if dep_llvm.found()
    error('Lavapipe and llvmpipe require LLVM draw support.')
  endif

-  if (with_gallium_i915 or with_gallium_r300) and not draw_with_llvm
-    error('i915 and R300 require LLVM draw support for vertex shaders.')
+  if with_gallium_i915 and not draw_with_llvm
+    error('i915 requires LLVM draw support for vertex shaders.')
+  endif
+
+  if with_gallium_r300 and not draw_with_llvm and host_machine.cpu_family() == 'x86'
+    error('r300 requires LLVM draw support for vertex shaders.')
  endif

  if host_machine.system() != 'windows'
@ -2191,7 +2195,7 @@ endif

 with_sysprof = get_option('sysprof')
 if with_sysprof
-  dep_sysprof = dependency('sysprof-capture-4', version: '>= 3.38.0')
+  dep_sysprof = dependency('sysprof-capture-4', version: '>= 49.0')
  pre_args += '-DHAVE_SYSPROF'
 endif

--- a/src/amd/ci/gitlab-ci-inc.yml
+++ b/src/amd/ci/gitlab-ci-inc.yml
@ -355,6 +355,7 @@
    - farm:$RUNNER_FARM_LOCATION
    - $CI_TRON_DUT_SETUP_TAGS
    - amdgpu:$INTEGRATED_OR_DISCRETE
+    - ci-tron:priority:$CI_TRON_JOB_PRIORITY

 .ci-tron-test-radv:
  extends: .ci-tron-test-amdgpu
@ -502,11 +503,6 @@
    INTEGRATED_OR_DISCRETE: integrated
    GPU_VERSION: radv-vangogh
    FDO_CI_CONCURRENT: 8
-  tags:
-    - farm:$RUNNER_FARM_LOCATION
-    - amdgpu:$INTEGRATED_OR_DISCRETE
-    - $CI_TRON_DUT_SETUP_TAGS
-    - $CI_TRON_JOB_PRIORITY_TAG

 .raphael-test-valve:
  variables:
--- a/src/amd/common/ac_gpu_info.c
+++ b/src/amd/common/ac_gpu_info.c
@ -315,8 +315,6 @@ ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
         info->ip[ip_type].num_queues = 1;
      } else if (ip_info.available_rings) {
         info->ip[ip_type].num_queues = util_bitcount(ip_info.available_rings);
-      } else if (ip_info.userq_num_slots) {
-         info->ip[ip_type].num_queue_slots = ip_info.userq_num_slots;
      } else {
         continue;
      }
@ -741,8 +739,11 @@ ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
   info->has_eqaa_surface_allocator = info->gfx_level < GFX11;
   /* Disable sparse mappings on GFX6 due to VM faults in CP DMA. Enable them once
    * these faults are mitigated in software.
+    * Disable sparse mappings on GFX7-8 due to GPU hangs in the VK CTS,
+    * except Polaris where it happens to work "well enough".
+    * Enable them when these are investigated and fixed in the driver.
    */
-   info->has_sparse_vm_mappings = info->gfx_level >= GFX7;
+   info->has_sparse_vm_mappings = info->family >= CHIP_POLARIS10;
   info->has_gang_submit = info->drm_minor >= 49;
   info->has_gpuvm_fault_query = info->drm_minor >= 55;
   info->has_tmz_support = device_info.ids_flags & AMDGPU_IDS_FLAGS_TMZ;
@ -1696,11 +1697,11 @@ void ac_print_gpu_info(const struct radeon_info *info, FILE *f)
   fprintf(f, "    clock_crystal_freq = %i KHz\n", info->clock_crystal_freq);

   for (unsigned i = 0; i < AMD_NUM_IP_TYPES; i++) {
-      if (info->ip[i].num_queues || info->ip[i].num_queue_slots) {
-         fprintf(f, "    IP %-7s %2u.%u \tqueues:%u \tqueue_slots:%u \talign:%u \tpad_dw:0x%x\n",
+      if (info->ip[i].num_queues) {
+         fprintf(f, "    IP %-7s %2u.%u \tqueues:%u \talign:%u \tpad_dw:0x%x\n",
                 ac_get_ip_type_string(info, i),
                 info->ip[i].ver_major, info->ip[i].ver_minor, info->ip[i].num_queues,
-                 info->ip[i].num_queue_slots,info->ip[i].ib_alignment, info->ip[i].ib_pad_dw_mask);
+                 info->ip[i].ib_alignment, info->ip[i].ib_pad_dw_mask);
      }
   }

--- a/src/amd/common/ac_gpu_info.h
+++ b/src/amd/common/ac_gpu_info.h
@ -26,7 +26,6 @@ struct amd_ip_info {
   uint8_t ver_minor;
   uint8_t ver_rev;
   uint8_t num_queues;
-   uint8_t num_queue_slots;
   uint8_t num_instances;
   uint32_t ib_alignment;
   uint32_t ib_pad_dw_mask;
--- a/src/amd/common/ac_linux_drm.h
+++ b/src/amd/common/ac_linux_drm.h
@ -194,7 +194,6 @@ struct drm_amdgpu_info_hw_ip {
   uint32_t ib_size_alignment;
   uint32_t available_rings;
   uint32_t ip_discovery_version;
-   uint32_t userq_num_slots;
 };

 struct drm_amdgpu_info_uq_fw_areas_gfx {
--- a/src/amd/common/ac_vcn_enc.h
+++ b/src/amd/common/ac_vcn_enc.h
@ -498,6 +498,7 @@ typedef struct rvcn_enc_hevc_encode_params_s {
 typedef struct rvcn_enc_av1_encode_params_s {
   uint32_t ref_frames[RENCODE_AV1_REFS_PER_FRAME];
   uint32_t lsm_reference_frame_index[2];
+   uint32_t cur_order_hint;
 } rvcn_enc_av1_encode_params_t;

 typedef struct rvcn_enc_h264_deblocking_filter_s {
--- a/src/amd/common/nir/ac_nir_lower_mem_access_bit_sizes.c
+++ b/src/amd/common/nir/ac_nir_lower_mem_access_bit_sizes.c
@ -109,23 +109,37 @@ lower_mem_access_cb(nir_intrinsic_op intrin, uint8_t bytes, uint8_t bit_size, ui
   nir_mem_access_size_align res;

   if (intrin == nir_intrinsic_load_shared || intrin == nir_intrinsic_store_shared) {
-      /* Split unsupported shared access. */
-      res.bit_size = MIN2(bit_size, combined_align * 8ull);
-      res.align = res.bit_size / 8;
      /* Don't use >64-bit LDS loads for performance reasons. */
      unsigned max_bytes = intrin == nir_intrinsic_store_shared && cb_data->gfx_level >= GFX7 ? 16 : 8;
      bytes = MIN3(bytes, combined_align, max_bytes);
      bytes = bytes == 12 ? bytes : round_down_to_power_of_2(bytes);
+
+      /* Split unsupported shared access. */
+      res.bit_size = MIN2(bit_size, bytes * 8ull);
+      res.align = res.bit_size / 8;
      res.num_components = bytes / res.align;
      res.shift = nir_mem_access_shift_method_bytealign_amd;
      return res;
   }

+   const bool is_buffer_load = intrin == nir_intrinsic_load_ubo ||
+                               intrin == nir_intrinsic_load_ssbo ||
+                               intrin == nir_intrinsic_load_constant;
+
   if (is_smem) {
+      const bool supported_subdword = cb_data->gfx_level >= GFX12 &&
+                                      intrin != nir_intrinsic_load_push_constant &&
+                                      (!cb_data->use_llvm || intrin != nir_intrinsic_load_ubo);
+
      /* Round up subdword loads if unsupported. */
-      const bool supported_subdword = cb_data->gfx_level >= GFX12 && intrin != nir_intrinsic_load_push_constant;
-      if (bit_size < 32 && (bytes >= 3 || !supported_subdword))
+      if (bytes <= 2 && combined_align % bytes == 0 && supported_subdword) {
+         bit_size = bytes * 8;
+      } else if (bytes % 4 || combined_align % 4) {
+         if (is_buffer_load)
+            bytes += 4 - MIN2(combined_align, 4);
         bytes = align(bytes, 4);
+         bit_size = 32;
+      }

      /* Generally, require an alignment of 4. */
      res.align = MIN2(4, bytes);
@ -138,9 +152,6 @@ lower_mem_access_cb(nir_intrinsic_op intrin, uint8_t bytes, uint8_t bit_size, ui
      if (!util_is_power_of_two_nonzero(bytes) && (cb_data->gfx_level < GFX12 || bytes != 12)) {
         const uint8_t larger = util_next_power_of_two(bytes);
         const uint8_t smaller = larger / 2;
-         const bool is_buffer_load = intrin == nir_intrinsic_load_ubo ||
-                                     intrin == nir_intrinsic_load_ssbo ||
-                                     intrin == nir_intrinsic_load_constant;
         const bool is_aligned = align_mul % smaller == 0;

         /* Overfetch up to 1 dword if this is a bounds-checked buffer load or the access is aligned. */
@ -185,8 +196,8 @@ lower_mem_access_cb(nir_intrinsic_op intrin, uint8_t bytes, uint8_t bit_size, ui

   const uint32_t max_pad = 4 - MIN2(combined_align, 4);

-   /* Global loads don't have bounds checking, so increasing the size might not be safe. */
-   if (intrin == nir_intrinsic_load_global || intrin == nir_intrinsic_load_global_constant) {
+   /* Global/scratch loads don't have bounds checking, so increasing the size might not be safe. */
+   if (!is_buffer_load) {
      if (align_mul < 4) {
         /* If we split the load, only lower it to 32-bit if this is a SMEM load. */
         const unsigned chunk_bytes = align(bytes, 4) - max_pad;
--- a/src/amd/common/nir/ac_nir_lower_ngg.c
+++ b/src/amd/common/nir/ac_nir_lower_ngg.c
@ -1817,10 +1817,25 @@ ac_ngg_get_scratch_lds_size(mesa_shader_stage stage,
   } else {
      assert(stage == MESA_SHADER_GEOMETRY);

+      /* Repacking output vertices at the end in ngg_gs_finale() uses 1 dword per 4 waves */
      scratch_lds_size = ALIGN(max_num_waves, 4u);
-      /* streamout take 8 dwords for buffer offset and emit vertex per stream */
-      if (streamout_enabled)
-         scratch_lds_size = MAX2(scratch_lds_size, 32);
+
+      /* For streamout:
+       * - Repacking streamout vertices takes 1 dword per 4 waves per stream
+       *   (max 16 bytes for Wave64, 32 bytes for Wave32)
+       * - 1 dword per stream for buffer info
+       *   (16 bytes)
+       * - 1 dword per buffer for buffer info
+       *   (16 bytes)
+       */
+      if (streamout_enabled) {
+         const unsigned num_streams = 4;
+         const unsigned num_so_buffers = 4;
+         const unsigned streamout_scratch_size =
+            num_streams * ALIGN(max_num_waves, 4u) + num_streams * 4 + num_so_buffers * 4;
+
+         scratch_lds_size += streamout_scratch_size;
+      }
   }

   return scratch_lds_size;
--- a/src/amd/common/nir/ac_nir_lower_ngg_gs.c
+++ b/src/amd/common/nir/ac_nir_lower_ngg_gs.c
@ -660,6 +660,10 @@ ngg_gs_build_streamout(nir_builder *b, lower_ngg_gs_state *s)
   nir_def *export_seq[4] = {0};
   nir_def *out_vtx_primflag[4] = {0};

+   const unsigned scratch_stride = ALIGN(s->max_num_waves, 4);
+   const unsigned scratch_base_off = scratch_stride;
+   const unsigned num_streams = util_bitcount(info->streams_written);
+
   u_foreach_bit(stream, info->streams_written) {
      out_vtx_primflag[stream] =
         ngg_gs_load_out_vtx_primflag(b, stream, tid_in_tg, out_vtx_lds_addr, max_vtxcnt, s);
@ -669,9 +673,8 @@ ngg_gs_build_streamout(nir_builder *b, lower_ngg_gs_state *s)
       */
      prim_live[stream] = nir_i2b(b, nir_iand_imm(b, out_vtx_primflag[stream], 1));

-      unsigned scratch_stride = ALIGN(s->max_num_waves, 4);
      nir_def *scratch_base =
-         nir_iadd_imm(b, s->lds_addr_gs_out_vtx, stream * scratch_stride);
+         nir_iadd_imm(b, s->lds_addr_gs_out_vtx, stream * scratch_stride + scratch_base_off);

      /* We want to export primitives to streamout buffer in sequence,
       * but not all vertices are alive or mark end of a primitive, so
@ -697,18 +700,14 @@ ngg_gs_build_streamout(nir_builder *b, lower_ngg_gs_state *s)
      export_seq[stream] = rep.repacked_invocation_index;
   }

-   /* Workgroup barrier: wait for LDS scratch reads finish. */
-   nir_barrier(b, .execution_scope = SCOPE_WORKGROUP,
-                      .memory_scope = SCOPE_WORKGROUP,
-                      .memory_semantics = NIR_MEMORY_ACQ_REL,
-                      .memory_modes = nir_var_mem_shared);
-
   /* Get global buffer offset where this workgroup will stream out data to. */
   nir_def *emit_prim[4] = {0};
   nir_def *buffer_offsets[4] = {0};
   nir_def *so_buffer[4] = {0};
+   nir_def *buffer_info_scratch_base =
+      nir_iadd_imm_nuw(b, s->lds_addr_gs_out_vtx, num_streams * scratch_stride + scratch_base_off);
   ac_nir_ngg_build_streamout_buffer_info(b, info, s->options->hw_info->gfx_level, s->options->has_xfb_prim_query,
-                                   s->options->use_gfx12_xfb_intrinsic, s->lds_addr_gs_out_vtx, tid_in_tg,
+                                   s->options->use_gfx12_xfb_intrinsic, buffer_info_scratch_base, tid_in_tg,
                                   gen_prim, so_buffer, buffer_offsets, emit_prim);

   u_foreach_bit(stream, info->streams_written) {
--- a/src/amd/common/nir/ac_nir_lower_ngg_mesh.c
+++ b/src/amd/common/nir/ac_nir_lower_ngg_mesh.c
@ -508,6 +508,8 @@ lower_ms_intrinsic(nir_builder *b, nir_instr *instr, void *state)
      return update_ms_barrier(b, intrin, s);
   case nir_intrinsic_load_workgroup_index:
      return lower_ms_load_workgroup_index(b, intrin, s);
+   case nir_intrinsic_load_num_subgroups:
+      return nir_imm_int(b, DIV_ROUND_UP(s->api_workgroup_size, s->wave_size));
   case nir_intrinsic_set_vertex_and_primitive_count:
      return lower_ms_set_vertex_and_primitive_count(b, intrin, s);
   default:
@ -529,6 +531,7 @@ filter_ms_intrinsic(const nir_instr *instr,
          intrin->intrinsic == nir_intrinsic_store_per_primitive_output ||
          intrin->intrinsic == nir_intrinsic_barrier ||
          intrin->intrinsic == nir_intrinsic_load_workgroup_index ||
+          intrin->intrinsic == nir_intrinsic_load_num_subgroups ||
          intrin->intrinsic == nir_intrinsic_set_vertex_and_primitive_count;
 }

--- a/src/amd/compiler/README-ISA.md
+++ b/src/amd/compiler/README-ISA.md
@ -338,6 +338,17 @@ Only `s_waitcnt_vscnt null, 0`. Needed even if the first instruction is a load.
 NSA MIMG instructions should be limited to 3 dwords before GFX10.3 to avoid
 stability issues: https://reviews.llvm.org/D103348

+## RDNA2 / GFX10.3 hazards
+
+### SALU EXEC write followed by NSA MIMG instruction
+
+Triggered-by:
+Potential stability issues can occur if an SALU instruction changes exec from 0
+to non-zero immediately before an NSA MIMG instruction with 4+ dwords.
+
+Mitigated-by: Any instruction, including `s_nop`.
+
+
 ## RDNA3 / GFX11 hazards

 ### VcmpxPermlaneHazard
--- a/src/amd/compiler/aco_insert_NOPs.cpp
+++ b/src/amd/compiler/aco_insert_NOPs.cpp
@ -129,6 +129,7 @@ struct NOP_ctx_gfx10 {
   bool has_branch_after_DS = false;
   bool has_NSA_MIMG = false;
   bool has_writelane = false;
+   bool has_salu_exec_write = false;
   std::bitset<128> sgprs_read_by_VMEM;
   std::bitset<128> sgprs_read_by_VMEM_store;
   std::bitset<128> sgprs_read_by_DS;
@ -145,6 +146,7 @@ struct NOP_ctx_gfx10 {
      has_branch_after_DS |= other.has_branch_after_DS;
      has_NSA_MIMG |= other.has_NSA_MIMG;
      has_writelane |= other.has_writelane;
+      has_salu_exec_write |= other.has_salu_exec_write;
      sgprs_read_by_VMEM |= other.sgprs_read_by_VMEM;
      sgprs_read_by_DS |= other.sgprs_read_by_DS;
      sgprs_read_by_VMEM_store |= other.sgprs_read_by_VMEM_store;
@ -159,6 +161,7 @@ struct NOP_ctx_gfx10 {
             has_branch_after_VMEM == other.has_branch_after_VMEM && has_DS == other.has_DS &&
             has_branch_after_DS == other.has_branch_after_DS &&
             has_NSA_MIMG == other.has_NSA_MIMG && has_writelane == other.has_writelane &&
+             has_salu_exec_write == other.has_salu_exec_write &&
             sgprs_read_by_VMEM == other.sgprs_read_by_VMEM &&
             sgprs_read_by_DS == other.sgprs_read_by_DS &&
             sgprs_read_by_VMEM_store == other.sgprs_read_by_VMEM_store &&
@ -270,6 +273,8 @@ struct NOP_ctx_gfx11 {
   std::bitset<m0.reg() / 2> sgpr_read_by_valu; /* SGPR pairs, excluding null, exec, m0 and scc */
   std::bitset<m0.reg()> sgpr_read_by_valu_then_wr_by_valu;
   RegCounterMap<11> sgpr_read_by_valu_then_wr_by_salu;
+   /* Force emitting a wait mitigating VALUReadSGPRHazard before the next ALU instruction. */
+   bool force_valu_read_sgpr_wait = false;

   void join(const NOP_ctx_gfx11& other)
   {
@ -290,6 +295,7 @@ struct NOP_ctx_gfx11 {
      sgpr_read_by_valu |= other.sgpr_read_by_valu;
      sgpr_read_by_valu_then_wr_by_valu |= other.sgpr_read_by_valu_then_wr_by_valu;
      sgpr_read_by_valu_then_wr_by_salu.join_min(other.sgpr_read_by_valu_then_wr_by_salu);
+      force_valu_read_sgpr_wait |= other.force_valu_read_sgpr_wait;
   }

   bool operator==(const NOP_ctx_gfx11& other) const
@ -309,7 +315,8 @@ struct NOP_ctx_gfx11 {
                other.sgpr_read_by_valu_as_lanemask_then_wr_by_valu &&
             vgpr_written_by_wmma == other.vgpr_written_by_wmma &&
             sgpr_read_by_valu == other.sgpr_read_by_valu &&
-             sgpr_read_by_valu_then_wr_by_salu == other.sgpr_read_by_valu_then_wr_by_salu;
+             sgpr_read_by_valu_then_wr_by_salu == other.sgpr_read_by_valu_then_wr_by_salu &&
+             force_valu_read_sgpr_wait == other.force_valu_read_sgpr_wait;
   }
 };

@ -907,6 +914,15 @@ handle_instruction_gfx10(State& state, NOP_ctx_gfx10& ctx, aco_ptr<Instruction>&
      ctx.waits_since_fp_atomic = std::min(ctx.waits_since_fp_atomic, 3);
   }

+   /* 4+ dword NSA can hang if exec becomes non-zero again directly before the instruction. */
+   if (instr->isSALU() && instr->writes_exec()) {
+      ctx.has_salu_exec_write = true;
+   } else if (ctx.has_salu_exec_write) {
+      ctx.has_salu_exec_write = false;
+      if (instr->isMIMG() && get_mimg_nsa_dwords(instr.get()) > 1)
+         bld.sopp(aco_opcode::s_nop, 0);
+   }
+
   if (state.program->gfx_level != GFX10)
      return; /* no other hazards/bugs to mitigate */

@ -1590,6 +1606,12 @@ handle_instruction_gfx11(State& state, NOP_ctx_gfx11& ctx, aco_ptr<Instruction>&
         unsigned expiry_count = instr->isSALU() ? 10 : 11;
         uint16_t imm = 0xffff;

+         if (ctx.force_valu_read_sgpr_wait) {
+            imm &= 0xfffe;
+            wait.sa_sdst = 0;
+            ctx.force_valu_read_sgpr_wait = false;
+         }
+
         for (Operand& op : instr->operands) {
            if (op.physReg() >= m0)
               continue;
@ -2019,24 +2041,38 @@ required_export_priority(Program* program)
 void
 insert_NOPs(Program* program)
 {
+   bool has_previous_part =
+      program->is_epilog || program->info.vs.has_prolog || program->info.ps.has_prolog ||
+      (program->info.merged_shader_compiled_separately && program->stage.sw != SWStage::VS &&
+       program->stage.sw != SWStage::TES) ||
+      program->stage == raytracing_cs;
+
   if (program->gfx_level >= GFX11) {
      NOP_ctx_gfx11 initial_ctx;

-      bool has_previous_part =
-         program->is_epilog || program->info.vs.has_prolog || program->info.ps.has_prolog ||
-         (program->info.merged_shader_compiled_separately && program->stage.sw != SWStage::VS &&
-          program->stage.sw != SWStage::TES) || program->stage == raytracing_cs;
      if (program->gfx_level >= GFX12 && has_previous_part) {
         /* resolve_all_gfx11 can't resolve VALUReadSGPRHazard entirely. We have to assume that any
          * SGPR might have been read by VALU if there was a previous shader part.
          */
         initial_ctx.sgpr_read_by_valu.flip();
+         /* We cannot assume the s_setpc source has not been read by VALU in the preceding shader/
+          * shader part, and there are GPU hangs in the wild suggesting that the s_setpc source may
+          * be susceptible to VALUReadSGPRHazard. It is impossible for the previous part to mitigate
+          * this, and it is not always known which register the s_setpc source was in, so force a
+          * wait to be emitted at the start of this part.
+          *
+          * TODO: This hypothesis is not yet conclusively proven. More testing is needed.
+          */
+         initial_ctx.force_valu_read_sgpr_wait = true;
      }

      mitigate_hazards<NOP_ctx_gfx11, handle_instruction_gfx11, resolve_all_gfx11>(program,
                                                                                   initial_ctx);
   } else if (program->gfx_level >= GFX10) {
-      mitigate_hazards<NOP_ctx_gfx10, handle_instruction_gfx10, resolve_all_gfx10>(program);
+      NOP_ctx_gfx10 initial_ctx;
+      initial_ctx.has_salu_exec_write = has_previous_part;
+      mitigate_hazards<NOP_ctx_gfx10, handle_instruction_gfx10, resolve_all_gfx10>(program,
+                                                                                   initial_ctx);
   } else {
      mitigate_hazards<NOP_ctx_gfx6, handle_instruction_gfx6, resolve_all_gfx6>(program);
   }
--- a/src/amd/compiler/instruction_selection/aco_select_rt_prolog.cpp
+++ b/src/amd/compiler/instruction_selection/aco_select_rt_prolog.cpp
@ -214,6 +214,8 @@ select_rt_prolog(Program* program, ac_shader_config* config,
   bld.sop2(Builder::s_cselect, Definition(vcc, bld.lm),
            Operand::c32_or_c64(-1u, program->wave_size == 64),
            Operand::c32_or_c64(0, program->wave_size == 64), Operand(scc, s1));
+   bld.sop2(aco_opcode::s_cselect_b32, Definition(out_launch_size_y, s1),
+            Operand(out_launch_size_y, s1), Operand::c32(1), Operand(scc, s1));
   bld.vop2(aco_opcode::v_cndmask_b32, Definition(out_launch_ids[0], v1),
            Operand(tmp_invocation_idx, v1), Operand(out_launch_ids[0], v1), Operand(vcc, bld.lm));
   bld.vop2(aco_opcode::v_cndmask_b32, Definition(out_launch_ids[1], v1), Operand::zero(),
--- a/src/amd/compiler/instruction_selection/aco_select_vs_prolog.cpp
+++ b/src/amd/compiler/instruction_selection/aco_select_vs_prolog.cpp
@ -338,6 +338,22 @@ load_unaligned_vs_attrib(Builder& bld, PhysReg dst, Operand desc, Operand index,
   state->current_loads.push_back(load);
 }

+bool
+is_last_attribute_large(const struct aco_vs_prolog_info* pinfo)
+{
+   const struct ac_vtx_format_info* vtx_info_table =
+      ac_get_vtx_format_info_table(GFX8, CHIP_POLARIS10);
+   unsigned last_attribute = pinfo->num_attributes - 1;
+
+   if ((pinfo->misaligned_mask & (1u << last_attribute))) {
+      const struct ac_vtx_format_info* vtx_info = &vtx_info_table[pinfo->formats[last_attribute]];
+      if (vtx_info->chan_byte_size == 8 && vtx_info->num_channels > 2)
+         return true;
+   }
+
+   return false;
+}
+
 } // namespace

 void
@ -393,9 +409,11 @@ select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo, ac_sh
      has_nontrivial_divisors && (program->gfx_level <= GFX8 || program->gfx_level >= GFX11);

   int vgpr_offset = pinfo->misaligned_mask & (1u << (pinfo->num_attributes - 1)) ? 0 : -4;
+   const bool is_last_attr_large = is_last_attribute_large(pinfo);

   unsigned num_vgprs = args->num_vgprs_used;
-   PhysReg attributes_start = get_next_vgpr(pinfo->num_attributes * 4, &num_vgprs);
+   PhysReg attributes_start =
+      get_next_vgpr(pinfo->num_attributes * 4 + (is_last_attr_large ? 4 : 0), &num_vgprs);
   PhysReg vertex_index, instance_index, start_instance_vgpr, nontrivial_tmp_vgpr0,
      nontrivial_tmp_vgpr1;
   if (needs_vertex_index)
@ -625,6 +643,14 @@ select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo, ac_sh
      continue_pc = Operand(prolog_input, s2);
   }

+   /* Wait for all pending VMEM loads when the prolog loads large 64-bit
+    * attributes because the vertex shader isn't required to consume all of
+    * them and they might be overwritten. This isn't the optimal solution,
+    * but 64-bit vertex attributes are rarely used.
+    */
+   if (is_last_attr_large)
+      wait_for_vmem_loads(bld);
+
   bld.sop1(aco_opcode::s_setpc_b64, continue_pc);

   program->config->float_mode = program->blocks[0].fp_mode.val;
--- a/src/amd/compiler/tests/test_insert_nops.cpp
+++ b/src/amd/compiler/tests/test_insert_nops.cpp
@ -1764,8 +1764,12 @@ BEGIN_TEST(insert_nops.valu_read_sgpr.previous_part)
   /* Raytracing shaders have a prolog and may also be split into several parts. */
   program->stage = raytracing_cs;

-   /* Despite the SGPR never being read by a VALU in this shader, a sa_sdst(0) is needed. */
+   /* Despite the SGPR never being read by a VALU in this shader, a sa_sdst(0) is needed.
+    * The first instruction is also a sa_sdst(0) in RT shaders to protect against reads of the
+    * setpc target.
+    */
   //>> p_unit_test 0
+   //! s_waitcnt_depctr sa_sdst(0)
   //! s1: %0:s[4] = s_mov_b32 0
   //! s_waitcnt_depctr sa_sdst(0)
   //! s1: %0:s[64] = s_mov_b32 %0:s[4]
--- a/src/amd/drm-shim/amdgpu_devices.c
+++ b/src/amd/drm-shim/amdgpu_devices.c
@ -1329,7 +1329,6 @@ const struct amdgpu_device amdgpu_devices[] = {
         .ib_size_alignment = 32,
         .available_rings = 0x1,
         .ip_discovery_version = 0xb0000,
-         .userq_num_slots = 2,
      },
      .hw_ip_compute = {
         .hw_ip_version_major = 11,
@ -1339,7 +1338,6 @@ const struct amdgpu_device amdgpu_devices[] = {
         .ib_size_alignment = 32,
         .available_rings = 0xf,
         .ip_discovery_version = 0xb0000,
-         .userq_num_slots = 16,
      },
      .fw_gfx_me = {
         .ver = 1486,
@ -1460,7 +1458,6 @@ const struct amdgpu_device amdgpu_devices[] = {
         .ib_size_alignment = 32,
         .available_rings = 0x1,
         .ip_discovery_version = 0xb0002,
-         .userq_num_slots = 0x0,
      },
      .hw_ip_compute = {
         .hw_ip_version_major = 11,
@ -1470,7 +1467,6 @@ const struct amdgpu_device amdgpu_devices[] = {
         .ib_size_alignment = 32,
         .available_rings = 0xf,
         .ip_discovery_version = 0xb0002,
-         .userq_num_slots = 0x0,
      },
      .fw_gfx_me = {
         .ver = 2390,
@ -2070,7 +2066,6 @@ const struct amdgpu_device amdgpu_devices[] = {
         .ib_size_alignment = 32,
         .available_rings = 0x1,
         .ip_discovery_version = 0xb0500,
-         .userq_num_slots = 2,
      },
      .hw_ip_compute = {
         .hw_ip_version_major = 11,
@ -2080,7 +2075,6 @@ const struct amdgpu_device amdgpu_devices[] = {
         .ib_size_alignment = 32,
         .available_rings = 0xf,
         .ip_discovery_version = 0xb0500,
-         .userq_num_slots = 16,
      },
      .fw_gfx_me = {
         .ver = 29,
@ -2201,7 +2195,6 @@ const struct amdgpu_device amdgpu_devices[] = {
         .ib_size_alignment = 32,
         .available_rings = 0x1,
         .ip_discovery_version = 0xc0001,
-         .userq_num_slots = 8,
      },
      .hw_ip_compute = {
         .hw_ip_version_major = 12,
@ -2211,7 +2204,6 @@ const struct amdgpu_device amdgpu_devices[] = {
         .ib_size_alignment = 32,
         .available_rings = 0xf,
         .ip_discovery_version = 0xc0001,
-         .userq_num_slots = 8,
      },
      .fw_gfx_me = {
         .ver = 2590,
--- a/src/amd/drm-shim/amdgpu_dump_states.c
+++ b/src/amd/drm-shim/amdgpu_dump_states.c
@ -379,7 +379,6 @@ amdgpu_dump_hw_ips(int fd)
      printf("   .ib_size_alignment = %u,\n", info.ib_size_alignment);
      printf("   .available_rings = 0x%x,\n", info.available_rings);
      printf("   .ip_discovery_version = 0x%04x,\n", info.ip_discovery_version);
-      printf("   .userq_num_slots = 0x%x,\n", info.userq_num_slots);
      printf("},\n");
   }
 }
--- a/src/amd/vulkan/bvh/update.comp
+++ b/src/amd/vulkan/bvh/update.comp
@ -42,18 +42,14 @@ void main() {
    VOID_REF dst_ptr = OFFSET(dst_bvh, dst_offset);
    uint32_t src_offset = gl_GlobalInvocationID.x * args.geom_data.stride;

-    vk_aabb bounds;
-    bool is_active;
+    vk_aabb bounds = vk_aabb(vec3(0.0f), vec3(0.0f));
    if (args.geom_data.geometry_type == VK_GEOMETRY_TYPE_TRIANGLES_KHR) {
-        is_active = radv_build_triangle(bounds, dst_ptr, args.geom_data, gl_GlobalInvocationID.x, false);
+        radv_build_triangle(bounds, dst_ptr, args.geom_data, gl_GlobalInvocationID.x, false);
    } else {
        VOID_REF src_ptr = OFFSET(args.geom_data.data, src_offset);
-        is_active = radv_build_aabb(bounds, src_ptr, dst_ptr, args.geom_data.geometry_id, gl_GlobalInvocationID.x, false);
+        radv_build_aabb(bounds, src_ptr, dst_ptr, args.geom_data.geometry_id, gl_GlobalInvocationID.x, false);
    }

-    if (!is_active)
-        return;
-
    DEREF(INDEX(vk_aabb, args.leaf_bounds, leaf_node_id)) = bounds;
    memoryBarrier(gl_ScopeDevice,
        gl_StorageSemanticsBuffer,
@ -99,6 +95,8 @@ void main() {
        if (!VK_BUILD_FLAG(RADV_BUILD_FLAG_UPDATE_IN_PLACE)) {
            for (uint32_t i = 0; i < 4; ++i)
                DEREF(dst_node).children[i] = children[i];
+            if (VK_BUILD_FLAG(VK_BUILD_FLAG_PROPAGATE_CULL_FLAGS))
+                DEREF(dst_node).flags = DEREF(src_node).flags;
        }

        for (uint32_t i = 0; i < valid_child_count; ++i) {
--- a/src/amd/vulkan/bvh/update.h
+++ b/src/amd/vulkan/bvh/update.h
@ -10,26 +10,14 @@

 #include "encode.h"

-bool
+void
 radv_build_triangle(inout vk_aabb bounds, VOID_REF dst_ptr, vk_bvh_geometry_data geom_data, uint32_t global_id,
                    bool gfx12)
 {
-   bool is_valid = true;
   triangle_indices indices = load_indices(geom_data.indices, geom_data.index_format, global_id);

   triangle_vertices vertices = load_vertices(geom_data.data, indices, geom_data.vertex_format, geom_data.stride);

-   /* An inactive triangle is one for which the first (X) component of any vertex is NaN. If any
-    * other vertex component is NaN, and the first is not, the behavior is undefined. If the vertex
-    * format does not have a NaN representation, then all triangles are considered active.
-    */
-   if (isnan(vertices.vertex[0].x) || isnan(vertices.vertex[1].x) || isnan(vertices.vertex[2].x))
-#if ALWAYS_ACTIVE
-      is_valid = false;
-#else
-      return false;
-#endif
-
   if (geom_data.transform != NULL) {
      mat4 transform = mat4(1.0);

@ -61,16 +49,12 @@ radv_build_triangle(inout vk_aabb bounds, VOID_REF dst_ptr, vk_bvh_geometry_data
      radv_encode_triangle_gfx12(dst_ptr, node);
   else
      radv_encode_triangle_gfx10_3(dst_ptr, node);
-
-   return is_valid;
 }

-bool
+void
 radv_build_aabb(inout vk_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint32_t geometry_id, uint32_t global_id,
                bool gfx12)
 {
-   bool is_valid = true;
-
   for (uint32_t vec = 0; vec < 2; vec++)
      for (uint32_t comp = 0; comp < 3; comp++) {
         float coord = DEREF(INDEX(float, src_ptr, comp + vec * 3));
@ -81,16 +65,6 @@ radv_build_aabb(inout vk_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint32
            bounds.max[comp] = coord;
      }

-   /* An inactive AABB is one for which the minimum X coordinate is NaN. If any other component is
-    * NaN, and the first is not, the behavior is undefined.
-    */
-   if (isnan(bounds.min.x))
-#if ALWAYS_ACTIVE
-      is_valid = false;
-#else
-      return false;
-#endif
-
   vk_ir_aabb_node node;
   node.base.aabb = bounds;
   node.primitive_id = global_id;
@ -100,8 +74,6 @@ radv_build_aabb(inout vk_aabb bounds, VOID_REF src_ptr, VOID_REF dst_ptr, uint32
      radv_encode_aabb_gfx12(dst_ptr, node);
   else
      radv_encode_aabb_gfx10_3(dst_ptr, node);
-
-   return is_valid;
 }

 #endif
--- a/src/amd/vulkan/layers/radv_no_mans_sky.c
+++ b/src/amd/vulkan/layers/radv_no_mans_sky.c
@ -0,0 +1,35 @@
+/*
+ * Copyright © 2025 Valve Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "radv_device.h"
+#include "radv_entrypoints.h"
+#include "radv_image_view.h"
+
+VKAPI_ATTR VkResult VKAPI_CALL
+no_mans_sky_CreateImageView(VkDevice _device, const VkImageViewCreateInfo *pCreateInfo,
+                            const VkAllocationCallbacks *pAllocator, VkImageView *pView)
+{
+   VK_FROM_HANDLE(radv_device, device, _device);
+   VkResult result;
+
+   result = device->layer_dispatch.app.CreateImageView(_device, pCreateInfo, pAllocator, pView);
+   if (result != VK_SUCCESS)
+      return result;
+
+   VK_FROM_HANDLE(radv_image_view, iview, *pView);
+
+   if ((iview->vk.aspects == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) &&
+       (iview->vk.usage &
+        (VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT))) {
+      /* No Man's Sky creates descriptors with both depth and stencil aspects (apparently only
+       * when Intel XeSS is enabled), which is illegal in Vulkan. Ignore them by using NULL
+       * descriptors to work around GPU hangs.
+       */
+      memset(&iview->descriptor, 0, sizeof(iview->descriptor));
+   }
+
+   return result;
+}
--- a/src/amd/vulkan/meson.build
+++ b/src/amd/vulkan/meson.build
@ -21,6 +21,7 @@ radv_entrypoints_gen_command += [
  '--device-prefix', 'metro_exodus',
  '--device-prefix', 'rage2',
  '--device-prefix', 'quantic_dream',
+  '--device-prefix', 'no_mans_sky',

  # Command buffer annotation layer entrypoints
  '--device-prefix', 'annotate',
@ -40,6 +41,7 @@ libradv_files = files(
  'layers/radv_metro_exodus.c',
  'layers/radv_rage2.c',
  'layers/radv_quantic_dream.c',
+  'layers/radv_no_mans_sky.c',
  'layers/radv_rmv_layer.c',
  'layers/radv_rra_layer.c',
  'layers/radv_sqtt_layer.c',
--- a/src/amd/vulkan/radv_acceleration_structure.c
+++ b/src/amd/vulkan/radv_acceleration_structure.c
@ -283,6 +283,9 @@ radv_get_build_config(VkDevice _device, struct vk_acceleration_structure_build_s
   if (state->build_info->geometryCount == 1)
      update_key |= RADV_BUILD_FLAG_UPDATE_SINGLE_GEOMETRY;

+   if (device->meta_state.accel_struct_build.build_args.propagate_cull_flags)
+      update_key |= VK_BUILD_FLAG_PROPAGATE_CULL_FLAGS;
+
   state->config.update_key[0] = update_key;
 }

--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@ -6111,6 +6111,13 @@ radv_emit_tess_domain_origin_state(struct radv_cmd_buffer *cmd_buffer)
   radeon_end();
 }

+static bool
+radv_is_dual_src_enabled(const struct radv_dynamic_state *dynamic_state)
+{
+   /* Dual-source blending must be ignored if blending isn't enabled for MRT0. */
+   return dynamic_state->blend_eq.mrt0_is_dual_src && !!(dynamic_state->color_blend_enable & 1u);
+}
+
 static struct radv_shader_part *
 lookup_ps_epilog(struct radv_cmd_buffer *cmd_buffer)
 {
@ -6144,7 +6151,7 @@ lookup_ps_epilog(struct radv_cmd_buffer *cmd_buffer)

   state.color_write_mask = d->color_write_mask;
   state.color_blend_enable = d->color_blend_enable;
-   state.mrt0_is_dual_src = d->blend_eq.mrt0_is_dual_src;
+   state.mrt0_is_dual_src = radv_is_dual_src_enabled(&cmd_buffer->state.dynamic);

   if (d->vk.ms.alpha_to_coverage_enable) {
      /* Select a color export format with alpha when alpha to coverage is enabled. */
@ -8114,6 +8121,8 @@ radv_mark_descriptors_dirty(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPo
   struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);

   descriptors_state->dirty |= descriptors_state->valid;
+   if (descriptors_state->dynamic_offset_count)
+      descriptors_state->dirty_dynamic = true;
 }

 static void
@ -8642,7 +8651,6 @@ radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipeline

      if (cmd_buffer->state.compute_pipeline == compute_pipeline)
         return;
-      radv_mark_descriptors_dirty(cmd_buffer, pipelineBindPoint);

      radv_bind_shader(cmd_buffer, compute_pipeline->base.shaders[MESA_SHADER_COMPUTE], MESA_SHADER_COMPUTE);

@ -8656,7 +8664,6 @@ radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipeline

      if (cmd_buffer->state.rt_pipeline == rt_pipeline)
         return;
-      radv_mark_descriptors_dirty(cmd_buffer, pipelineBindPoint);

      radv_bind_shader(cmd_buffer, rt_pipeline->base.base.shaders[MESA_SHADER_INTERSECTION], MESA_SHADER_INTERSECTION);
      radv_bind_rt_prolog(cmd_buffer, rt_pipeline->prolog);
@ -8690,7 +8697,6 @@ radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipeline

      if (cmd_buffer->state.graphics_pipeline == graphics_pipeline)
         return;
-      radv_mark_descriptors_dirty(cmd_buffer, pipelineBindPoint);

      radv_foreach_stage (
         stage, (cmd_buffer->state.active_stages | graphics_pipeline->active_stages) & RADV_GRAPHICS_STAGE_BITS) {
@ -8744,6 +8750,8 @@ radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipeline
   cmd_buffer->descriptors[vk_to_bind_point(pipelineBindPoint)].dynamic_offset_count = pipeline->dynamic_offset_count;
   cmd_buffer->descriptors[vk_to_bind_point(pipelineBindPoint)].need_indirect_descriptors =
      pipeline->need_indirect_descriptors;
+
+   radv_mark_descriptors_dirty(cmd_buffer, pipelineBindPoint);
 }

 VKAPI_ATTR void VKAPI_CALL
@ -11688,7 +11696,7 @@ radv_emit_cb_render_state(struct radv_cmd_buffer *cmd_buffer)
   const struct radv_rendering_state *render = &cmd_buffer->state.render;
   const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
   unsigned cb_blend_control[MAX_RTS], sx_mrt_blend_opt[MAX_RTS];
-   const bool mrt0_is_dual_src = d->blend_eq.mrt0_is_dual_src;
+   const bool mrt0_is_dual_src = radv_is_dual_src_enabled(&cmd_buffer->state.dynamic);
   uint32_t cb_color_control = 0;

   const uint32_t cb_target_mask = d->color_write_enable & d->color_write_mask;
@ -11712,7 +11720,7 @@ radv_emit_cb_render_state(struct radv_cmd_buffer *cmd_buffer)
   if (cmd_buffer->state.custom_blend_mode) {
      cb_color_control |= S_028808_MODE(cmd_buffer->state.custom_blend_mode);
   } else {
-      if (d->color_write_mask) {
+      if (render->color_att_count > 0 && d->color_write_mask) {
         cb_color_control |= S_028808_MODE(V_028808_CB_NORMAL);
      } else {
         cb_color_control |= S_028808_MODE(V_028808_CB_DISABLE);
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@ -792,6 +792,8 @@ init_dispatch_tables(struct radv_device *device, struct radv_physical_device *pd
      add_entrypoints(&b, &rage2_device_entrypoints, RADV_APP_DISPATCH_TABLE);
   } else if (!strcmp(instance->drirc.debug.app_layer, "quanticdream")) {
      add_entrypoints(&b, &quantic_dream_device_entrypoints, RADV_APP_DISPATCH_TABLE);
+   } else if (!strcmp(instance->drirc.debug.app_layer, "no_mans_sky")) {
+      add_entrypoints(&b, &no_mans_sky_device_entrypoints, RADV_APP_DISPATCH_TABLE);
   }

   if (instance->vk.trace_mode & RADV_TRACE_MODE_RGP)
--- a/src/amd/vulkan/radv_formats.c
+++ b/src/amd/vulkan/radv_formats.c
@ -1056,6 +1056,9 @@ radv_get_image_format_properties(struct radv_physical_device *pdev, const VkPhys
   }

   if (info->flags & VK_IMAGE_CREATE_SPARSE_BINDING_BIT) {
+      if (!pdev->info.has_sparse_vm_mappings)
+         goto unsupported;
+
      /* Sparse resources with multi-planar formats are unsupported. */
      if (vk_format_get_plane_count(format) > 1)
         goto unsupported;
--- a/src/amd/vulkan/radv_instance.c
+++ b/src/amd/vulkan/radv_instance.c
@ -173,6 +173,7 @@ static const driOptionDescription radv_dri_options[] = {
      DRI_CONF_VK_LOWER_TERMINATE_TO_DISCARD(false)
      DRI_CONF_VK_WSI_FORCE_BGRA8_UNORM_FIRST(false)
      DRI_CONF_VK_WSI_FORCE_SWAPCHAIN_TO_CURRENT_EXTENT(false)
+      DRI_CONF_VK_WSI_DISABLE_UNORDERED_SUBMITS(false)
      DRI_CONF_VK_X11_IGNORE_SUBOPTIMAL(false)
      DRI_CONF_VK_REQUIRE_ETC2(false)
      DRI_CONF_VK_REQUIRE_ASTC(false)
@ -200,6 +201,7 @@ static const driOptionDescription radv_dri_options[] = {
      DRI_CONF_RADV_EMULATE_RT(false)
      DRI_CONF_RADV_ENABLE_FLOAT16_GFX8(false)
      DRI_CONF_RADV_COOPERATIVE_MATRIX2_NV(false)
+      DRI_CONF_RADV_NO_IMPLICIT_VARYING_SUBGROUP_SIZE(false)
   DRI_CONF_SECTION_END
 };
 // clang-format on
@ -236,6 +238,8 @@ radv_init_dri_debug_options(struct radv_instance *instance)
   drirc->debug.ssbo_non_uniform = driQueryOptionb(&drirc->options, "radv_ssbo_non_uniform");
   drirc->debug.tex_non_uniform = driQueryOptionb(&drirc->options, "radv_tex_non_uniform");
   drirc->debug.zero_vram = driQueryOptionb(&drirc->options, "radv_zero_vram");
+   drirc->debug.no_implicit_varying_subgroup_size =
+      driQueryOptionb(&drirc->options, "radv_no_implicit_varying_subgroup_size");
   drirc->debug.app_layer = driQueryOptionstr(&drirc->options, "radv_app_layer");

   drirc->debug.override_uniform_offset_alignment =
--- a/src/amd/vulkan/radv_instance.h
+++ b/src/amd/vulkan/radv_instance.h
@ -57,6 +57,7 @@ struct radv_drirc {
      bool ssbo_non_uniform;
      bool tex_non_uniform;
      bool zero_vram;
+      bool no_implicit_varying_subgroup_size;
      char *app_layer;
      int override_uniform_offset_alignment;
   } debug;
--- a/src/amd/vulkan/radv_physical_device.c
+++ b/src/amd/vulkan/radv_physical_device.c
@ -252,6 +252,7 @@ radv_physical_device_init_cache_key(struct radv_physical_device *pdev)
   key->use_llvm = pdev->use_llvm;
   key->use_ngg = pdev->use_ngg;
   key->use_ngg_culling = pdev->use_ngg_culling;
+   key->no_implicit_varying_subgroup_size = instance->drirc.debug.no_implicit_varying_subgroup_size;
 }

 static int
@ -866,7 +867,7 @@ radv_physical_device_get_features(const struct radv_physical_device *pdev, struc
      .shaderFloat64 = true,
      .shaderInt64 = true,
      .shaderInt16 = true,
-      .sparseBinding = true,
+      .sparseBinding = pdev->info.has_sparse_vm_mappings,
      .sparseResidencyBuffer = pdev->info.family >= CHIP_POLARIS10,
      .sparseResidencyImage2D = pdev->info.family >= CHIP_POLARIS10,
      .sparseResidencyImage3D = pdev->info.family >= CHIP_POLARIS10,
@ -1526,7 +1527,7 @@ radv_get_physical_device_properties(struct radv_physical_device *pdev)
      .maxMemoryAllocationCount = UINT32_MAX,
      .maxSamplerAllocationCount = 64 * 1024,
      .bufferImageGranularity = 1,
-      .sparseAddressSpaceSize = pdev->info.virtual_address_max,
+      .sparseAddressSpaceSize = pdev->info.has_sparse_vm_mappings ? pdev->info.virtual_address_max : 0,
      .maxBoundDescriptorSets = MAX_SETS,
      .maxPerStageDescriptorSamplers = max_descriptor_set_size,
      .maxPerStageDescriptorUniformBuffers = max_descriptor_set_size,
--- a/src/amd/vulkan/radv_physical_device.h
+++ b/src/amd/vulkan/radv_physical_device.h
@ -64,8 +64,9 @@ struct radv_physical_device_cache_key {
   uint32_t use_llvm : 1;
   uint32_t use_ngg : 1;
   uint32_t use_ngg_culling : 1;
+   uint32_t no_implicit_varying_subgroup_size : 1;

-   uint32_t reserved : 10;
+   uint32_t reserved : 9;
 };

 enum radv_video_enc_hw_ver {
@ -229,7 +230,8 @@ radv_dedicated_sparse_queue_enabled(const struct radv_physical_device *pdev)
 {
   /* Dedicated sparse queue requires VK_QUEUE_SUBMIT_MODE_THREADED, which is incompatible with
    * VK_DEVICE_TIMELINE_MODE_EMULATED. */
-   return pdev->info.has_timeline_syncobj;
+   return pdev->info.has_timeline_syncobj &&
+          pdev->info.has_sparse_vm_mappings;
 }

 static inline bool
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@ -1247,9 +1247,13 @@ radv_pipeline_report_pso_history(const struct radv_device *device, struct radv_p
   case RADV_PIPELINE_RAY_TRACING: {
      struct radv_ray_tracing_pipeline *rt_pipeline = radv_pipeline_to_ray_tracing(pipeline);

-      radv_print_pso_history(pipeline, rt_pipeline->prolog, output);
+      if (rt_pipeline->prolog)
+         radv_print_pso_history(pipeline, rt_pipeline->prolog, output);

-      for (uint32_t i = 0; i < rt_pipeline->stage_count; i++) {
+      if (pipeline->shaders[MESA_SHADER_INTERSECTION])
+         radv_print_pso_history(pipeline, pipeline->shaders[MESA_SHADER_INTERSECTION], output);
+
+      for (uint32_t i = 0; i < rt_pipeline->non_imported_stage_count; i++) {
         const struct radv_shader *shader = rt_pipeline->stages[i].shader;

         if (shader)
--- a/src/amd/vulkan/radv_query.c
+++ b/src/amd/vulkan/radv_query.c
@ -2383,8 +2383,9 @@ radv_GetQueryPoolResults(VkDevice _device, VkQueryPool queryPool, uint32_t first
         break;
      }
      case VK_QUERY_TYPE_VIDEO_ENCODE_FEEDBACK_KHR: {
+         const bool write_memory = radv_video_write_memory_supported(pdev) == RADV_VIDEO_WRITE_MEMORY_SUPPORT_FULL;
         uint32_t *src32 = (uint32_t *)src;
-         uint32_t ready_idx = radv_video_write_memory_supported(pdev) ? RADV_ENC_FEEDBACK_STATUS_IDX : 1;
+         uint32_t ready_idx = write_memory ? RADV_ENC_FEEDBACK_STATUS_IDX : 1;
         uint32_t value;
         do {
            value = p_atomic_read(&src32[ready_idx]);
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@ -367,6 +367,10 @@ radv_shader_choose_subgroup_size(struct radv_device *device, nir_shader *nir,
      .requiredSubgroupSize = stage_key->subgroup_required_size * 32,
   };

+   /* Do not allow for the SPIR-V 1.6 varying subgroup size rules. */
+   if (pdev->cache_key.no_implicit_varying_subgroup_size)
+      spirv_version = 0x10000;
+
   vk_set_subgroup_size(&device->vk, nir, spirv_version, rss_info.requiredSubgroupSize ? &rss_info : NULL,
                        stage_key->subgroup_allow_varying, stage_key->subgroup_require_full);

--- a/src/amd/vulkan/radv_shader_args.c
+++ b/src/amd/vulkan/radv_shader_args.c
@ -191,6 +191,11 @@ declare_vs_input_vgprs(enum amd_gfx_level gfx_level, const struct radv_shader_in
      unsigned num_attributes = util_last_bit(info->vs.input_slot_usage_mask);
      for (unsigned i = 0; i < num_attributes; i++) {
         ac_add_arg(&args->ac, AC_ARG_VGPR, 4, AC_ARG_VALUE, &args->vs_inputs[i]);
+
+         /* The vertex shader isn't required to consume all components that are loaded by the prolog
+          * and it's possible that more VGPRs are written. This specific case is handled at the end
+          * of the prolog which waits for all pending VMEM loads if needed.
+          */
         args->ac.args[args->vs_inputs[i].arg_index].pending_vmem = true;
      }
   }
--- a/src/amd/vulkan/radv_sqtt.c
+++ b/src/amd/vulkan/radv_sqtt.c
@ -103,12 +103,13 @@ radv_emit_sqtt_userdata(const struct radv_cmd_buffer *cmd_buffer, const void *da
 {
   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
   const struct radv_physical_device *pdev = radv_device_physical(device);
+   const bool is_gfx_or_ace = cmd_buffer->qf == RADV_QUEUE_GENERAL || cmd_buffer->qf == RADV_QUEUE_COMPUTE;
   const enum amd_gfx_level gfx_level = pdev->info.gfx_level;
   struct radv_cmd_stream *cs = cmd_buffer->cs;
   const uint32_t *dwords = (uint32_t *)data;

-   /* SQTT user data packets aren't supported on SDMA queues. */
-   if (cmd_buffer->qf == RADV_QUEUE_TRANSFER)
+   /* SQTT user data packets are only supported on GFX or ACE queues. */
+   if (!is_gfx_or_ace)
      return;

   while (num_dwords > 0) {
@ -508,7 +509,9 @@ radv_begin_sqtt(struct radv_queue *queue)
      device->sqtt.start_cs[family] = NULL;
   }

-   cs.b = ws->cs_create(ws, radv_queue_ring(queue), false);
+   radv_init_cmd_stream(&cs, radv_queue_ring(queue));
+
+   cs.b = ws->cs_create(ws, cs.hw_ip, false);
   if (!cs.b)
      return false;

@ -585,7 +588,9 @@ radv_end_sqtt(struct radv_queue *queue)
      device->sqtt.stop_cs[family] = NULL;
   }

-   cs.b = ws->cs_create(ws, radv_queue_ring(queue), false);
+   radv_init_cmd_stream(&cs, radv_queue_ring(queue));
+
+   cs.b = ws->cs_create(ws, cs.hw_ip, false);
   if (!cs.b)
      return false;

--- a/src/amd/vulkan/radv_video.c
+++ b/src/amd/vulkan/radv_video.c
@ -24,7 +24,6 @@
 #include "radv_image_view.h"
 #include "radv_video.h"

-#define RADV_VIDEO_H264_MAX_DPB_SLOTS     17
 #define RADV_VIDEO_H264_MAX_NUM_REF_FRAME 16
 #define RADV_VIDEO_H265_MAX_DPB_SLOTS     17
 #define RADV_VIDEO_H265_MAX_NUM_REF_FRAME 15
@ -149,10 +148,16 @@ radv_vcn_write_memory(struct radv_cmd_buffer *cmd_buffer, uint64_t va, unsigned
   struct radv_physical_device *pdev = radv_device_physical(device);
   struct rvcn_sq_var sq;
   struct radv_cmd_stream *cs = cmd_buffer->cs;
+   enum radv_video_write_memory_support support = radv_video_write_memory_supported(pdev);

-   if (!radv_video_write_memory_supported(pdev))
+   if (support == RADV_VIDEO_WRITE_MEMORY_SUPPORT_NONE)
      return;

+   if (support == RADV_VIDEO_WRITE_MEMORY_SUPPORT_PCIE_ATOMICS) {
+      fprintf(stderr, "radv: VCN WRITE_MEMORY requires PCIe atomics support. Expect issues "
+                      "if PCIe atomics are not enabled on current device.\n");
+   }
+
   bool separate_queue = pdev->vid_decode_ip != AMD_IP_VCN_UNIFIED;
   if (cmd_buffer->qf == RADV_QUEUE_VIDEO_DEC && separate_queue && pdev->vid_dec_reg.data2) {
      radeon_check_space(device->ws, cs->b, 8);
@ -819,6 +824,32 @@ radv_GetPhysicalDeviceVideoCapabilitiesKHR(VkPhysicalDevice physicalDevice, cons
   if (cap && !cap->valid)
      cap = NULL;

+   if (cap) {
+      pCapabilities->maxCodedExtent.width = cap->max_width;
+      pCapabilities->maxCodedExtent.height = cap->max_height;
+   } else {
+      switch (pVideoProfile->videoCodecOperation) {
+      case VK_VIDEO_CODEC_OPERATION_DECODE_H264_BIT_KHR:
+         pCapabilities->maxCodedExtent.width = (pdev->info.family < CHIP_TONGA) ? 2048 : 4096;
+         pCapabilities->maxCodedExtent.height = (pdev->info.family < CHIP_TONGA) ? 1152 : 4096;
+         break;
+      case VK_VIDEO_CODEC_OPERATION_DECODE_H265_BIT_KHR:
+         pCapabilities->maxCodedExtent.width =
+            (pdev->info.family < CHIP_RENOIR) ? ((pdev->info.family < CHIP_TONGA) ? 2048 : 4096) : 8192;
+         pCapabilities->maxCodedExtent.height =
+            (pdev->info.family < CHIP_RENOIR) ? ((pdev->info.family < CHIP_TONGA) ? 1152 : 4096) : 4352;
+         break;
+      case VK_VIDEO_CODEC_OPERATION_DECODE_VP9_BIT_KHR:
+         pCapabilities->maxCodedExtent.width =
+            (pdev->info.family < CHIP_RENOIR) ? ((pdev->info.family < CHIP_TONGA) ? 2048 : 4096) : 8192;
+         pCapabilities->maxCodedExtent.height =
+            (pdev->info.family < CHIP_RENOIR) ? ((pdev->info.family < CHIP_TONGA) ? 1152 : 4096) : 4352;
+         break;
+      default:
+         break;
+      }
+   }
+
   pCapabilities->flags = 0;
   pCapabilities->pictureAccessGranularity.width = VK_VIDEO_H264_MACROBLOCK_WIDTH;
   pCapabilities->pictureAccessGranularity.height = VK_VIDEO_H264_MACROBLOCK_HEIGHT;
@ -1118,7 +1149,7 @@ radv_GetPhysicalDeviceVideoCapabilitiesKHR(VkPhysicalDevice physicalDevice, cons

      if (qp_map_caps) {
         qp_map_caps->minQIndexDelta = -255;
-         qp_map_caps->minQIndexDelta = 255;
+         qp_map_caps->maxQIndexDelta = 255;
      }
      break;
   }
@ -1126,32 +1157,6 @@ radv_GetPhysicalDeviceVideoCapabilitiesKHR(VkPhysicalDevice physicalDevice, cons
      break;
   }

-   if (cap) {
-      pCapabilities->maxCodedExtent.width = cap->max_width;
-      pCapabilities->maxCodedExtent.height = cap->max_height;
-   } else {
-      switch (pVideoProfile->videoCodecOperation) {
-      case VK_VIDEO_CODEC_OPERATION_DECODE_H264_BIT_KHR:
-         pCapabilities->maxCodedExtent.width = (pdev->info.family < CHIP_TONGA) ? 2048 : 4096;
-         pCapabilities->maxCodedExtent.height = (pdev->info.family < CHIP_TONGA) ? 1152 : 4096;
-         break;
-      case VK_VIDEO_CODEC_OPERATION_DECODE_H265_BIT_KHR:
-         pCapabilities->maxCodedExtent.width =
-            (pdev->info.family < CHIP_RENOIR) ? ((pdev->info.family < CHIP_TONGA) ? 2048 : 4096) : 8192;
-         pCapabilities->maxCodedExtent.height =
-            (pdev->info.family < CHIP_RENOIR) ? ((pdev->info.family < CHIP_TONGA) ? 1152 : 4096) : 4352;
-         break;
-      case VK_VIDEO_CODEC_OPERATION_DECODE_VP9_BIT_KHR:
-         pCapabilities->maxCodedExtent.width =
-            (pdev->info.family < CHIP_RENOIR) ? ((pdev->info.family < CHIP_TONGA) ? 2048 : 4096) : 8192;
-         pCapabilities->maxCodedExtent.height =
-            (pdev->info.family < CHIP_RENOIR) ? ((pdev->info.family < CHIP_TONGA) ? 1152 : 4096) : 4352;
-         break;
-      default:
-         break;
-      }
-   }
-
   return VK_SUCCESS;
 }

@ -1746,8 +1751,10 @@ get_h265_msg(struct radv_device *device, struct radv_video_session *vid, struct
   result.bit_depth_luma_minus8 = sps->bit_depth_luma_minus8;
   result.bit_depth_chroma_minus8 = sps->bit_depth_chroma_minus8;
   result.log2_max_pic_order_cnt_lsb_minus4 = sps->log2_max_pic_order_cnt_lsb_minus4;
-   result.sps_max_dec_pic_buffering_minus1 =
-      sps->pDecPicBufMgr->max_dec_pic_buffering_minus1[sps->sps_max_sub_layers_minus1];
+   if (sps->pDecPicBufMgr) {
+      result.sps_max_dec_pic_buffering_minus1 =
+         sps->pDecPicBufMgr->max_dec_pic_buffering_minus1[sps->sps_max_sub_layers_minus1];
+   }
   result.log2_min_luma_coding_block_size_minus3 = sps->log2_min_luma_coding_block_size_minus3;
   result.log2_diff_max_min_luma_coding_block_size = sps->log2_diff_max_min_luma_coding_block_size;
   result.log2_min_transform_block_size_minus2 = sps->log2_min_luma_transform_block_size_minus2;
@ -1870,8 +1877,7 @@ get_vp9_msg(struct radv_device *device, struct radv_video_session *vid, struct v
   memset(&result, 0, sizeof(result));

   rvcn_dec_vp9_probs_segment_t *prbs = (rvcn_dec_vp9_probs_segment_t *)(probs_ptr);
-   if (std_pic_info->flags.segmentation_enabled) {
-
+   if (std_pic_info->flags.segmentation_enabled && std_pic_info->pSegmentation) {
      for (unsigned i = 0; i < 8; ++i) {
         prbs->seg.feature_data[i] = (uint16_t)std_pic_info->pSegmentation->FeatureData[i][0] |
                                     ((uint32_t)(std_pic_info->pSegmentation->FeatureData[i][1] & 0xff) << 16) |
@ -1912,12 +1918,12 @@ get_vp9_msg(struct radv_device *device, struct radv_video_session *vid, struct v
   result.frame_header_flags |=
      (std_pic_info->flags.refresh_frame_context << RDECODE_FRAME_HDR_INFO_VP9_REFRESH_FRAME_CONTEXT_SHIFT) &
      RDECODE_FRAME_HDR_INFO_VP9_REFRESH_FRAME_CONTEXT_MASK;
-   if (std_pic_info->flags.segmentation_enabled) {
-      assert(std_pic_info->pSegmentation);
-      result.frame_header_flags |=
-         (std_pic_info->flags.segmentation_enabled << RDECODE_FRAME_HDR_INFO_VP9_SEGMENTATION_ENABLED_SHIFT) &
-         RDECODE_FRAME_HDR_INFO_VP9_SEGMENTATION_ENABLED_MASK;

+   result.frame_header_flags |=
+      (std_pic_info->flags.segmentation_enabled << RDECODE_FRAME_HDR_INFO_VP9_SEGMENTATION_ENABLED_SHIFT) &
+      RDECODE_FRAME_HDR_INFO_VP9_SEGMENTATION_ENABLED_MASK;
+
+   if (std_pic_info->flags.segmentation_enabled && std_pic_info->pSegmentation) {
      result.frame_header_flags |= (std_pic_info->pSegmentation->flags.segmentation_update_map
                                    << RDECODE_FRAME_HDR_INFO_VP9_SEGMENTATION_UPDATE_MAP_SHIFT) &
                                   RDECODE_FRAME_HDR_INFO_VP9_SEGMENTATION_UPDATE_MAP_MASK;
@ -1930,13 +1936,16 @@ get_vp9_msg(struct radv_device *device, struct radv_video_session *vid, struct v
                                    << RDECODE_FRAME_HDR_INFO_VP9_SEGMENTATION_UPDATE_DATA_SHIFT) &
                                   RDECODE_FRAME_HDR_INFO_VP9_SEGMENTATION_UPDATE_DATA_MASK;
   }
-   result.frame_header_flags |= (std_pic_info->pLoopFilter->flags.loop_filter_delta_enabled
-                                 << RDECODE_FRAME_HDR_INFO_VP9_MODE_REF_DELTA_ENABLED_SHIFT) &
-                                RDECODE_FRAME_HDR_INFO_VP9_MODE_REF_DELTA_ENABLED_MASK;

-   result.frame_header_flags |= (std_pic_info->pLoopFilter->flags.loop_filter_delta_update
-                                 << RDECODE_FRAME_HDR_INFO_VP9_MODE_REF_DELTA_UPDATE_SHIFT) &
-                                RDECODE_FRAME_HDR_INFO_VP9_MODE_REF_DELTA_UPDATE_MASK;
+   if (std_pic_info->pLoopFilter) {
+      result.frame_header_flags |= (std_pic_info->pLoopFilter->flags.loop_filter_delta_enabled
+                                    << RDECODE_FRAME_HDR_INFO_VP9_MODE_REF_DELTA_ENABLED_SHIFT) &
+                                   RDECODE_FRAME_HDR_INFO_VP9_MODE_REF_DELTA_ENABLED_MASK;
+
+      result.frame_header_flags |= (std_pic_info->pLoopFilter->flags.loop_filter_delta_update
+                                    << RDECODE_FRAME_HDR_INFO_VP9_MODE_REF_DELTA_UPDATE_SHIFT) &
+                                   RDECODE_FRAME_HDR_INFO_VP9_MODE_REF_DELTA_UPDATE_MASK;
+   }

   result.frame_header_flags |=
      (std_pic_info->flags.UsePrevFrameMvs << RDECODE_FRAME_HDR_INFO_VP9_USE_PREV_IN_FIND_MV_REFS_SHIFT) &
@ -1949,26 +1958,31 @@ get_vp9_msg(struct radv_device *device, struct radv_video_session *vid, struct v
   result.frame_context_idx = std_pic_info->frame_context_idx;
   result.reset_frame_context = std_pic_info->reset_frame_context;

-   result.filter_level = std_pic_info->pLoopFilter->loop_filter_level;
-   result.sharpness_level = std_pic_info->pLoopFilter->loop_filter_sharpness;
+   uint8_t loop_filter_level = 0;

-   int shifted = std_pic_info->pLoopFilter->loop_filter_level >= 32;
+   if (std_pic_info->pLoopFilter) {
+      loop_filter_level = std_pic_info->pLoopFilter->loop_filter_level;
+      result.filter_level = std_pic_info->pLoopFilter->loop_filter_level;
+      result.sharpness_level = std_pic_info->pLoopFilter->loop_filter_sharpness;
+   }
+
+   int shifted = loop_filter_level >= 32;

   for (int i = 0; i < (std_pic_info->flags.segmentation_enabled ? 8 : 1); i++) {
      const uint8_t seg_lvl_alt_l = 1;
      uint8_t lvl;

-      if (std_pic_info->flags.segmentation_enabled &&
+      if (std_pic_info->flags.segmentation_enabled && std_pic_info->pSegmentation &&
          std_pic_info->pSegmentation->FeatureEnabled[i] & (1 << seg_lvl_alt_l)) {
         lvl = std_pic_info->pSegmentation->FeatureData[i][seg_lvl_alt_l];
         if (!std_pic_info->pSegmentation->flags.segmentation_abs_or_delta_update)
-            lvl += std_pic_info->pLoopFilter->loop_filter_level;
+            lvl += loop_filter_level;
         lvl = CLAMP(lvl, 0, 63);
      } else {
-         lvl = std_pic_info->pLoopFilter->loop_filter_level;
+         lvl = loop_filter_level;
      }

-      if (std_pic_info->pLoopFilter->flags.loop_filter_delta_enabled) {
+      if (std_pic_info->pLoopFilter && std_pic_info->pLoopFilter->flags.loop_filter_delta_enabled) {
         result.lf_adj_level[i][0][0] = result.lf_adj_level[i][0][1] =
            CLAMP(lvl + (std_pic_info->pLoopFilter->loop_filter_ref_deltas[0] * (1 << shifted)), 0, 63);
         for (int j = 1; j < 4; j++) {
@ -1995,7 +2009,8 @@ get_vp9_msg(struct radv_device *device, struct radv_video_session *vid, struct v
   result.log2_tile_rows = std_pic_info->tile_rows_log2;
   result.chroma_format = 1;

-   result.bit_depth_luma_minus8 = result.bit_depth_chroma_minus8 = (std_pic_info->pColorConfig->BitDepth - 8);
+   if (std_pic_info->pColorConfig)
+      result.bit_depth_luma_minus8 = result.bit_depth_chroma_minus8 = (std_pic_info->pColorConfig->BitDepth - 8);
   result.vp9_frame_size = vp9_pic_info->uncompressedHeaderOffset;

   result.compressed_header_size = vp9_pic_info->tilesOffset - vp9_pic_info->compressedHeaderOffset;
@ -2082,16 +2097,20 @@ get_av1_msg(struct radv_device *device, struct radv_video_session *vid, struct v
      (pi->flags.allow_high_precision_mv << RDECODE_FRAME_HDR_INFO_AV1_ALLOW_HIGH_PRECISION_MV_SHIFT) &
      RDECODE_FRAME_HDR_INFO_AV1_ALLOW_HIGH_PRECISION_MV_MASK;

-   result.frame_header_flags |=
-      (seq_hdr->pColorConfig->flags.mono_chrome << RDECODE_FRAME_HDR_INFO_AV1_MONOCHROME_SHIFT) &
-      RDECODE_FRAME_HDR_INFO_AV1_MONOCHROME_MASK;
+   if (seq_hdr->pColorConfig) {
+      result.frame_header_flags |=
+         (seq_hdr->pColorConfig->flags.mono_chrome << RDECODE_FRAME_HDR_INFO_AV1_MONOCHROME_SHIFT) &
+         RDECODE_FRAME_HDR_INFO_AV1_MONOCHROME_MASK;
+   }

   result.frame_header_flags |= (pi->flags.skip_mode_present << RDECODE_FRAME_HDR_INFO_AV1_SKIP_MODE_FLAG_SHIFT) &
                                RDECODE_FRAME_HDR_INFO_AV1_SKIP_MODE_FLAG_MASK;

-   result.frame_header_flags |=
-      (pi->pQuantization->flags.using_qmatrix << RDECODE_FRAME_HDR_INFO_AV1_USING_QMATRIX_SHIFT) &
-      RDECODE_FRAME_HDR_INFO_AV1_USING_QMATRIX_MASK;
+   if (pi->pQuantization) {
+      result.frame_header_flags |=
+         (pi->pQuantization->flags.using_qmatrix << RDECODE_FRAME_HDR_INFO_AV1_USING_QMATRIX_SHIFT) &
+         RDECODE_FRAME_HDR_INFO_AV1_USING_QMATRIX_MASK;
+   }

   result.frame_header_flags |=
      (seq_hdr->flags.enable_filter_intra << RDECODE_FRAME_HDR_INFO_AV1_ENABLE_FILTER_INTRA_SHIFT) &
@ -2135,13 +2154,15 @@ get_av1_msg(struct radv_device *device, struct radv_video_session *vid, struct v
      (pi->flags.force_integer_mv << RDECODE_FRAME_HDR_INFO_AV1_CUR_FRAME_FORCE_INTEGER_MV_SHIFT) &
      RDECODE_FRAME_HDR_INFO_AV1_CUR_FRAME_FORCE_INTEGER_MV_MASK;

-   result.frame_header_flags |=
-      (pi->pLoopFilter->flags.loop_filter_delta_enabled << RDECODE_FRAME_HDR_INFO_AV1_MODE_REF_DELTA_ENABLED_SHIFT) &
-      RDECODE_FRAME_HDR_INFO_AV1_MODE_REF_DELTA_ENABLED_MASK;
+   if (pi->pLoopFilter) {
+      result.frame_header_flags |=
+         (pi->pLoopFilter->flags.loop_filter_delta_enabled << RDECODE_FRAME_HDR_INFO_AV1_MODE_REF_DELTA_ENABLED_SHIFT) &
+         RDECODE_FRAME_HDR_INFO_AV1_MODE_REF_DELTA_ENABLED_MASK;

-   result.frame_header_flags |=
-      (pi->pLoopFilter->flags.loop_filter_delta_update << RDECODE_FRAME_HDR_INFO_AV1_MODE_REF_DELTA_UPDATE_SHIFT) &
-      RDECODE_FRAME_HDR_INFO_AV1_MODE_REF_DELTA_UPDATE_MASK;
+      result.frame_header_flags |=
+         (pi->pLoopFilter->flags.loop_filter_delta_update << RDECODE_FRAME_HDR_INFO_AV1_MODE_REF_DELTA_UPDATE_SHIFT) &
+         RDECODE_FRAME_HDR_INFO_AV1_MODE_REF_DELTA_UPDATE_MASK;
+   }

   result.frame_header_flags |= (pi->flags.delta_q_present << RDECODE_FRAME_HDR_INFO_AV1_DELTA_Q_PRESENT_FLAG_SHIFT) &
                                RDECODE_FRAME_HDR_INFO_AV1_DELTA_Q_PRESENT_FLAG_MASK;
@ -2201,50 +2222,59 @@ get_av1_msg(struct radv_device *device, struct radv_video_session *vid, struct v

   result.sb_size = seq_hdr->flags.use_128x128_superblock;
   result.interp_filter = pi->interpolation_filter;
-   for (i = 0; i < 2; ++i)
-      result.filter_level[i] = pi->pLoopFilter->loop_filter_level[i];
-   result.filter_level_u = pi->pLoopFilter->loop_filter_level[2];
-   result.filter_level_v = pi->pLoopFilter->loop_filter_level[3];
-   result.sharpness_level = pi->pLoopFilter->loop_filter_sharpness;
-   for (i = 0; i < 8; ++i)
-      result.ref_deltas[i] = pi->pLoopFilter->loop_filter_ref_deltas[i];
-   for (i = 0; i < 2; ++i)
-      result.mode_deltas[i] = pi->pLoopFilter->loop_filter_mode_deltas[i];
-   result.base_qindex = pi->pQuantization->base_q_idx;
-   result.y_dc_delta_q = pi->pQuantization->DeltaQYDc;
-   result.u_dc_delta_q = pi->pQuantization->DeltaQUDc;
-   result.v_dc_delta_q = pi->pQuantization->DeltaQVDc;
-   result.u_ac_delta_q = pi->pQuantization->DeltaQUAc;
-   result.v_ac_delta_q = pi->pQuantization->DeltaQVAc;

-   if (pi->pQuantization->flags.using_qmatrix) {
-      result.qm_y = pi->pQuantization->qm_y | 0xf0;
-      result.qm_u = pi->pQuantization->qm_u | 0xf0;
-      result.qm_v = pi->pQuantization->qm_v | 0xf0;
-   } else {
-      result.qm_y = 0xff;
-      result.qm_u = 0xff;
-      result.qm_v = 0xff;
+   if (pi->pLoopFilter) {
+      for (i = 0; i < 2; ++i)
+         result.filter_level[i] = pi->pLoopFilter->loop_filter_level[i];
+      result.filter_level_u = pi->pLoopFilter->loop_filter_level[2];
+      result.filter_level_v = pi->pLoopFilter->loop_filter_level[3];
+      result.sharpness_level = pi->pLoopFilter->loop_filter_sharpness;
+      for (i = 0; i < 8; ++i)
+         result.ref_deltas[i] = pi->pLoopFilter->loop_filter_ref_deltas[i];
+      for (i = 0; i < 2; ++i)
+         result.mode_deltas[i] = pi->pLoopFilter->loop_filter_mode_deltas[i];
   }
+
+   result.qm_y = 0xff;
+   result.qm_u = 0xff;
+   result.qm_v = 0xff;
+
+   if (pi->pQuantization) {
+      result.base_qindex = pi->pQuantization->base_q_idx;
+      result.y_dc_delta_q = pi->pQuantization->DeltaQYDc;
+      result.u_dc_delta_q = pi->pQuantization->DeltaQUDc;
+      result.v_dc_delta_q = pi->pQuantization->DeltaQVDc;
+      result.u_ac_delta_q = pi->pQuantization->DeltaQUAc;
+      result.v_ac_delta_q = pi->pQuantization->DeltaQVAc;
+
+      if (pi->pQuantization->flags.using_qmatrix) {
+         result.qm_y = pi->pQuantization->qm_y | 0xf0;
+         result.qm_u = pi->pQuantization->qm_u | 0xf0;
+         result.qm_v = pi->pQuantization->qm_v | 0xf0;
+      }
+   }
+
   result.delta_q_res = (1 << pi->delta_q_res);
   result.delta_lf_res = (1 << pi->delta_lf_res);
-   result.tile_cols = pi->pTileInfo->TileCols;
-   result.tile_rows = pi->pTileInfo->TileRows;

   result.tx_mode = pi->TxMode;
   result.reference_mode = (pi->flags.reference_select == 1) ? 2 : 0;
-   result.chroma_format = seq_hdr->pColorConfig->flags.mono_chrome ? 0 : 1;
-   result.tile_size_bytes = pi->pTileInfo->tile_size_bytes_minus_1;
-   result.context_update_tile_id = pi->pTileInfo->context_update_tile_id;

-   for (i = 0; i < result.tile_cols; i++)
-      result.tile_col_start_sb[i] = pi->pTileInfo->pMiColStarts[i];
-   result.tile_col_start_sb[result.tile_cols] =
-      result.tile_col_start_sb[result.tile_cols - 1] + pi->pTileInfo->pWidthInSbsMinus1[result.tile_cols - 1] + 1;
-   for (i = 0; i < pi->pTileInfo->TileRows; i++)
-      result.tile_row_start_sb[i] = pi->pTileInfo->pMiRowStarts[i];
-   result.tile_row_start_sb[result.tile_rows] =
-      result.tile_row_start_sb[result.tile_rows - 1] + pi->pTileInfo->pHeightInSbsMinus1[result.tile_rows - 1] + 1;
+   if (pi->pTileInfo) {
+      result.tile_cols = pi->pTileInfo->TileCols;
+      result.tile_rows = pi->pTileInfo->TileRows;
+      result.tile_size_bytes = pi->pTileInfo->tile_size_bytes_minus_1;
+      result.context_update_tile_id = pi->pTileInfo->context_update_tile_id;
+
+      for (i = 0; i < result.tile_cols; i++)
+         result.tile_col_start_sb[i] = pi->pTileInfo->pMiColStarts[i];
+      result.tile_col_start_sb[result.tile_cols] =
+         result.tile_col_start_sb[result.tile_cols - 1] + pi->pTileInfo->pWidthInSbsMinus1[result.tile_cols - 1] + 1;
+      for (i = 0; i < pi->pTileInfo->TileRows; i++)
+         result.tile_row_start_sb[i] = pi->pTileInfo->pMiRowStarts[i];
+      result.tile_row_start_sb[result.tile_rows] =
+         result.tile_row_start_sb[result.tile_rows - 1] + pi->pTileInfo->pHeightInSbsMinus1[result.tile_rows - 1] + 1;
+   }

   result.max_width = seq_hdr->max_frame_width_minus_1 + 1;
   result.max_height = seq_hdr->max_frame_height_minus_1 + 1;
@ -2294,24 +2324,26 @@ get_av1_msg(struct radv_device *device, struct radv_video_session *vid, struct v
         av1_pic_info->referenceNameSlotIndices[i] == -1 ? 0x7f : av1_pic_info->referenceNameSlotIndices[i];
   }

-   result.bit_depth_luma_minus8 = result.bit_depth_chroma_minus8 = seq_hdr->pColorConfig->BitDepth - 8;
-
-   int16_t *feature_data = (int16_t *)probs_ptr;
-   int fd_idx = 0;
-   for (i = 0; i < 8; ++i) {
-      result.feature_mask[i] = pi->pSegmentation->FeatureEnabled[i];
-      for (j = 0; j < 8; ++j) {
-         result.feature_data[i][j] = pi->pSegmentation->FeatureData[i][j];
-         feature_data[fd_idx++] = result.feature_data[i][j];
+   if (pi->pSegmentation) {
+      int16_t *feature_data = (int16_t *)probs_ptr;
+      int fd_idx = 0;
+      for (i = 0; i < 8; ++i) {
+         result.feature_mask[i] = pi->pSegmentation->FeatureEnabled[i];
+         for (j = 0; j < 8; ++j) {
+            result.feature_data[i][j] = pi->pSegmentation->FeatureData[i][j];
+            feature_data[fd_idx++] = result.feature_data[i][j];
+         }
      }
+      memcpy(((char *)probs_ptr + 128), result.feature_mask, 8);
   }

-   memcpy(((char *)probs_ptr + 128), result.feature_mask, 8);
-   result.cdef_damping = pi->pCDEF->cdef_damping_minus_3 + 3;
-   result.cdef_bits = pi->pCDEF->cdef_bits;
-   for (i = 0; i < 8; ++i) {
-      result.cdef_strengths[i] = (pi->pCDEF->cdef_y_pri_strength[i] << 2) + pi->pCDEF->cdef_y_sec_strength[i];
-      result.cdef_uv_strengths[i] = (pi->pCDEF->cdef_uv_pri_strength[i] << 2) + pi->pCDEF->cdef_uv_sec_strength[i];
+   if (pi->pCDEF) {
+      result.cdef_damping = pi->pCDEF->cdef_damping_minus_3 + 3;
+      result.cdef_bits = pi->pCDEF->cdef_bits;
+      for (i = 0; i < 8; ++i) {
+         result.cdef_strengths[i] = (pi->pCDEF->cdef_y_pri_strength[i] << 2) + pi->pCDEF->cdef_y_sec_strength[i];
+         result.cdef_uv_strengths[i] = (pi->pCDEF->cdef_uv_pri_strength[i] << 2) + pi->pCDEF->cdef_uv_sec_strength[i];
+      }
   }

   if (pi->flags.UsesLr) {
@ -2321,9 +2353,13 @@ get_av1_msg(struct radv_device *device, struct radv_video_session *vid, struct v
      }
   }

-   if (seq_hdr->pColorConfig->BitDepth > 8) {
-      result.p010_mode = 1;
-      result.msb_mode = 1;
+   if (seq_hdr->pColorConfig) {
+      result.chroma_format = seq_hdr->pColorConfig->flags.mono_chrome ? 0 : 1;
+      result.bit_depth_luma_minus8 = result.bit_depth_chroma_minus8 = seq_hdr->pColorConfig->BitDepth - 8;
+      if (seq_hdr->pColorConfig->BitDepth > 8) {
+         result.p010_mode = 1;
+         result.msb_mode = 1;
+      }
   }

   result.preskip_segid = 0;
@ -2355,7 +2391,7 @@ get_av1_msg(struct radv_device *device, struct radv_video_session *vid, struct v

   rvcn_dec_film_grain_params_t *fg_params = &result.film_grain;
   fg_params->apply_grain = pi->flags.apply_grain;
-   if (fg_params->apply_grain) {
+   if (fg_params->apply_grain && pi->pFilmGrain) {
      rvcn_dec_av1_fg_init_buf_t *fg_buf = (rvcn_dec_av1_fg_init_buf_t *)((char *)probs_ptr + 256);
      fg_params->random_seed = pi->pFilmGrain->grain_seed;
      fg_params->grain_scale_shift = pi->pFilmGrain->grain_scale_shift;
@ -2401,10 +2437,12 @@ get_av1_msg(struct radv_device *device, struct radv_video_session *vid, struct v
   }

   result.uncompressed_header_size = 0;
-   for (i = 0; i < STD_VIDEO_AV1_NUM_REF_FRAMES; ++i) {
-      result.global_motion[i].wmtype = pi->pGlobalMotion->GmType[i];
-      for (j = 0; j < STD_VIDEO_AV1_GLOBAL_MOTION_PARAMS; ++j)
-         result.global_motion[i].wmmat[j] = pi->pGlobalMotion->gm_params[i][j];
+   if (pi->pGlobalMotion) {
+      for (i = 0; i < STD_VIDEO_AV1_NUM_REF_FRAMES; ++i) {
+         result.global_motion[i].wmtype = pi->pGlobalMotion->GmType[i];
+         for (j = 0; j < STD_VIDEO_AV1_GLOBAL_MOTION_PARAMS; ++j)
+            result.global_motion[i].wmmat[j] = pi->pGlobalMotion->gm_params[i][j];
+      }
   }
   for (i = 0; i < av1_pic_info->tileCount && i < 256; ++i) {
      result.tile_info[i].offset = av1_pic_info->pTileOffsets[i];
@ -2671,8 +2709,8 @@ rvcn_dec_message_decode(struct radv_cmd_buffer *cmd_buffer, struct radv_video_se
          * It will not perform any actual writes to these dummy slots.
          */
         for (int i = 0; i < STD_VIDEO_AV1_NUM_REF_FRAMES; i++) {
-            dynamic_dpb_t2->dpbAddrHi[i] = addr;
-            dynamic_dpb_t2->dpbAddrLo[i] = addr >> 32;
+            dynamic_dpb_t2->dpbAddrLo[i] = addr;
+            dynamic_dpb_t2->dpbAddrHi[i] = addr >> 32;
         }
      }

@ -2918,8 +2956,10 @@ get_uvd_h265_msg(struct radv_device *device, struct radv_video_session *vid, str
   result.bit_depth_luma_minus8 = sps->bit_depth_luma_minus8;
   result.bit_depth_chroma_minus8 = sps->bit_depth_chroma_minus8;
   result.log2_max_pic_order_cnt_lsb_minus4 = sps->log2_max_pic_order_cnt_lsb_minus4;
-   result.sps_max_dec_pic_buffering_minus1 =
-      sps->pDecPicBufMgr->max_dec_pic_buffering_minus1[sps->sps_max_sub_layers_minus1];
+   if (sps->pDecPicBufMgr) {
+      result.sps_max_dec_pic_buffering_minus1 =
+         sps->pDecPicBufMgr->max_dec_pic_buffering_minus1[sps->sps_max_sub_layers_minus1];
+   }
   result.log2_min_luma_coding_block_size_minus3 = sps->log2_min_luma_coding_block_size_minus3;
   result.log2_diff_max_min_luma_coding_block_size = sps->log2_diff_max_min_luma_coding_block_size;
   result.log2_min_transform_block_size_minus2 = sps->log2_min_luma_transform_block_size_minus2;
--- a/src/amd/vulkan/radv_video.h
+++ b/src/amd/vulkan/radv_video.h
@ -35,6 +35,8 @@ struct radv_cmd_stream;

 #define RADV_ENC_FEEDBACK_STATUS_IDX 10

+#define RADV_VIDEO_H264_MAX_DPB_SLOTS 17
+
 struct radv_vid_mem {
   struct radv_device_memory *mem;
   VkDeviceSize offset;
@ -73,6 +75,19 @@ struct radv_video_session {
   bool session_initialized;
 };

+/**
+ *  WRITE_MEMORY support in FW.
+ *
+ *  none: Not supported at all. Old VCN FW and all UVD.
+ *  pcie_atomics: Supported, relies on PCIe atomics.
+ *  full: Supported, works also without PCIe atomics.
+ */
+enum radv_video_write_memory_support {
+   RADV_VIDEO_WRITE_MEMORY_SUPPORT_NONE = 0,
+   RADV_VIDEO_WRITE_MEMORY_SUPPORT_PCIE_ATOMICS,
+   RADV_VIDEO_WRITE_MEMORY_SUPPORT_FULL,
+};
+
 VK_DEFINE_NONDISP_HANDLE_CASTS(radv_video_session, vk.base, VkVideoSessionKHR, VK_OBJECT_TYPE_VIDEO_SESSION_KHR)

 void radv_init_physical_device_decoder(struct radv_physical_device *pdev);
@ -98,7 +113,7 @@ void radv_video_get_enc_dpb_image(struct radv_device *device, const struct VkVid
 bool radv_video_decode_vp9_supported(const struct radv_physical_device *pdev);
 bool radv_video_encode_av1_supported(const struct radv_physical_device *pdev);
 bool radv_video_encode_qp_map_supported(const struct radv_physical_device *pdev);
-bool radv_video_write_memory_supported(const struct radv_physical_device *pdev);
+enum radv_video_write_memory_support radv_video_write_memory_supported(const struct radv_physical_device *pdev);
 uint32_t radv_video_get_qp_map_texel_size(VkVideoCodecOperationFlagBitsKHR codec);
 bool radv_check_vcn_fw_version(const struct radv_physical_device *pdev, uint32_t dec, uint32_t enc, uint32_t rev);

--- a/src/amd/vulkan/radv_video_enc.c
+++ b/src/amd/vulkan/radv_video_enc.c
@ -41,7 +41,7 @@
 #define ENC_ALIGNMENT 256

 #define RENCODE_V5_FW_INTERFACE_MAJOR_VERSION 1
-#define RENCODE_V5_FW_INTERFACE_MINOR_VERSION 3
+#define RENCODE_V5_FW_INTERFACE_MINOR_VERSION 10

 #define RENCODE_V4_FW_INTERFACE_MAJOR_VERSION 1
 #define RENCODE_V4_FW_INTERFACE_MINOR_VERSION 11
@ -67,31 +67,6 @@ radv_probe_video_encode(struct radv_physical_device *pdev)
   if (instance->debug_flags & RADV_DEBUG_NO_VIDEO)
      return;

-   if (pdev->info.vcn_ip_version >= VCN_5_0_0) {
-      pdev->video_encode_enabled = true;
-      return;
-   } else if (pdev->info.vcn_ip_version >= VCN_4_0_0) {
-      if (pdev->info.vcn_enc_major_version != RENCODE_V4_FW_INTERFACE_MAJOR_VERSION)
-         return;
-      if (pdev->info.vcn_enc_minor_version < RENCODE_V4_FW_INTERFACE_MINOR_VERSION)
-         return;
-   } else if (pdev->info.vcn_ip_version >= VCN_3_0_0) {
-      if (pdev->info.vcn_enc_major_version != RENCODE_V3_FW_INTERFACE_MAJOR_VERSION)
-         return;
-      if (pdev->info.vcn_enc_minor_version < RENCODE_V3_FW_INTERFACE_MINOR_VERSION)
-         return;
-   } else if (pdev->info.vcn_ip_version >= VCN_2_0_0) {
-      if (pdev->info.vcn_enc_major_version != RENCODE_V2_FW_INTERFACE_MAJOR_VERSION)
-         return;
-      if (pdev->info.vcn_enc_minor_version < RENCODE_V2_FW_INTERFACE_MINOR_VERSION)
-         return;
-   } else {
-      if (pdev->info.vcn_enc_major_version != RENCODE_FW_INTERFACE_MAJOR_VERSION)
-         return;
-      if (pdev->info.vcn_enc_minor_version < RENCODE_FW_INTERFACE_MINOR_VERSION)
-         return;
-   }
-
   /* WRITE_MEMORY is needed for SetEvent and is required to pass CTS */
   if (radv_video_write_memory_supported(pdev)) {
      pdev->video_encode_enabled = true;
@ -495,10 +470,10 @@ radv_enc_session_init(struct radv_cmd_buffer *cmd_buffer, const struct VkVideoEn
   if (pdev->enc_hw_ver >= RADV_VIDEO_ENC_HW_3)
      RADEON_ENC_CS(vid->enc_session.slice_output_enabled);
   RADEON_ENC_CS(vid->enc_session.display_remote);
-   if (pdev->enc_hw_ver == RADV_VIDEO_ENC_HW_4) {
+   if (pdev->enc_hw_ver == RADV_VIDEO_ENC_HW_4)
      RADEON_ENC_CS(vid->enc_session.WA_flags);
+   if (pdev->enc_hw_ver >= RADV_VIDEO_ENC_HW_4)
      RADEON_ENC_CS(0);
-   }
   RADEON_ENC_END();
 }

@ -890,7 +865,6 @@ radv_enc_slice_header(struct radv_cmd_buffer *cmd_buffer, const VkVideoEncodeInf
   uint32_t num_bits[RENCODE_SLICE_HEADER_TEMPLATE_MAX_NUM_INSTRUCTIONS] = {0};
   const struct VkVideoEncodeH264PictureInfoKHR *h264_picture_info =
      vk_find_struct_const(enc_info->pNext, VIDEO_ENCODE_H264_PICTURE_INFO_KHR);
-   int slice_count = h264_picture_info->naluSliceEntryCount;
   const StdVideoEncodeH264PictureInfo *pic = h264_picture_info->pStdPictureInfo;
   const StdVideoH264SequenceParameterSet *sps =
      vk_video_find_h264_enc_std_sps(cmd_buffer->video.params, pic->seq_parameter_set_id);
@ -903,8 +877,6 @@ radv_enc_slice_header(struct radv_cmd_buffer *cmd_buffer, const VkVideoEncodeInf
   unsigned int cdw_filled = 0;
   unsigned int bits_copied = 0;

-   assert(slice_count <= 1);
-
   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
   const struct radv_physical_device *pdev = radv_device_physical(device);
   struct radv_cmd_stream *cs = cmd_buffer->cs;
@ -961,24 +933,20 @@ radv_enc_slice_header(struct radv_cmd_buffer *cmd_buffer, const VkVideoEncodeInf
      radv_enc_code_ue(cmd_buffer, 0);

   if (pic->primary_pic_type == STD_VIDEO_H264_PICTURE_TYPE_B) {
-      radv_enc_code_fixed_bits(cmd_buffer, slice_info->pStdSliceHeader->flags.direct_spatial_mv_pred_flag, 1);
+      radv_enc_code_fixed_bits(cmd_buffer, 1, 1); /* direct_spatial_mv_pred */
   }
   const StdVideoEncodeH264ReferenceListsInfo *ref_lists = pic->pRefLists;
   /* ref_pic_list_modification() */
   if (pic->primary_pic_type != STD_VIDEO_H264_PICTURE_TYPE_IDR &&
       pic->primary_pic_type != STD_VIDEO_H264_PICTURE_TYPE_I) {

-      /* num ref idx active override flag */
-      radv_enc_code_fixed_bits(cmd_buffer, slice_info->pStdSliceHeader->flags.num_ref_idx_active_override_flag, 1);
-      if (slice_info->pStdSliceHeader->flags.num_ref_idx_active_override_flag) {
-         radv_enc_code_ue(cmd_buffer, ref_lists->num_ref_idx_l0_active_minus1);
-         if (pic->primary_pic_type == STD_VIDEO_H264_PICTURE_TYPE_B)
-            radv_enc_code_ue(cmd_buffer, ref_lists->num_ref_idx_l1_active_minus1);
-      }
+      /* it never has to be 1 since we only support one L0/L1 pic */
+      radv_enc_code_fixed_bits(cmd_buffer, 0 /* slice_info->pStdSliceHeader->flags.num_ref_idx_active_override_flag */,
+                               1);

      radv_enc_code_fixed_bits(cmd_buffer, ref_lists->flags.ref_pic_list_modification_flag_l0, 1);
      if (ref_lists->flags.ref_pic_list_modification_flag_l0) {
-         for (unsigned op = 0; op < ref_lists->refList0ModOpCount; op++) {
+         for (unsigned op = 0; op < MIN2(ref_lists->refList0ModOpCount, 1); op++) {
            const StdVideoEncodeH264RefListModEntry *entry = &ref_lists->pRefList0ModOperations[op];

            radv_enc_code_ue(cmd_buffer, entry->modification_of_pic_nums_idc);
@ -994,7 +962,7 @@ radv_enc_slice_header(struct radv_cmd_buffer *cmd_buffer, const VkVideoEncodeInf
      if (pic->primary_pic_type == STD_VIDEO_H264_PICTURE_TYPE_B) {
         radv_enc_code_fixed_bits(cmd_buffer, ref_lists->flags.ref_pic_list_modification_flag_l1, 1);
         if (ref_lists->flags.ref_pic_list_modification_flag_l1) {
-            for (unsigned op = 0; op < ref_lists->refList1ModOpCount; op++) {
+            for (unsigned op = 0; op < MIN2(ref_lists->refList1ModOpCount, 1); op++) {
               const StdVideoEncodeH264RefListModEntry *entry = &ref_lists->pRefList1ModOperations[op];

               radv_enc_code_ue(cmd_buffer, entry->modification_of_pic_nums_idc);
@ -1080,6 +1048,9 @@ radv_enc_hevc_st_ref_pic_set(struct radv_cmd_buffer *cmd_buffer, const StdVideoH
   unsigned int num_short_term_ref_pic_sets = sps->num_short_term_ref_pic_sets;
   unsigned int index = num_short_term_ref_pic_sets;

+   if (!rps)
+      return 0;
+
   if (index != 0)
      radv_enc_code_fixed_bits(cmd_buffer, rps->flags.inter_ref_pic_set_prediction_flag, 0x1);

@ -1228,7 +1199,7 @@ radv_enc_slice_header_hevc(struct radv_cmd_buffer *cmd_buffer, const VkVideoEnco
                                           util_logbase2_ceil(sps->num_long_term_ref_pics_sps));
            } else {
               radv_enc_code_fixed_bits(cmd_buffer, lt->poc_lsb_lt[i], sps->log2_max_pic_order_cnt_lsb_minus4 + 4);
-               radv_enc_code_fixed_bits(cmd_buffer, lt->used_by_curr_pic_lt_flag & (1 << i), 1);
+               radv_enc_code_fixed_bits(cmd_buffer, !!(lt->used_by_curr_pic_lt_flag & (1 << i)), 1);
               if (lt->used_by_curr_pic_lt_flag & (1 << i))
                  num_pic_total_curr++;
            }
@ -1254,25 +1225,18 @@ radv_enc_slice_header_hevc(struct radv_cmd_buffer *cmd_buffer, const VkVideoEnco
   }

   if ((pic->pic_type == STD_VIDEO_H265_PICTURE_TYPE_P) || (pic->pic_type == STD_VIDEO_H265_PICTURE_TYPE_B)) {
-      radv_enc_code_fixed_bits(cmd_buffer, slice->flags.num_ref_idx_active_override_flag, 1);
-      if (slice->flags.num_ref_idx_active_override_flag) {
-         radv_enc_code_ue(cmd_buffer, pic->pRefLists->num_ref_idx_l0_active_minus1);
-         if (pic->pic_type == STD_VIDEO_H265_PICTURE_TYPE_B)
-            radv_enc_code_ue(cmd_buffer, pic->pRefLists->num_ref_idx_l1_active_minus1);
-      }
+      /* it never has to be 1 since we only support one L0 pic */
+      radv_enc_code_fixed_bits(cmd_buffer, 0 /* slice->flags.num_ref_idx_active_override_flag */, 1);
+
      if (pps->flags.lists_modification_present_flag && num_pic_total_curr > 1) {
         const StdVideoEncodeH265ReferenceListsInfo *rl = pic->pRefLists;
         unsigned num_pic_bits = util_logbase2_ceil(num_pic_total_curr);
-         unsigned num_ref_l0_minus1 = slice->flags.num_ref_idx_active_override_flag
-                                         ? rl->num_ref_idx_l0_active_minus1
-                                         : pps->num_ref_idx_l0_default_active_minus1;
+         unsigned num_ref_l0_minus1 = 0;
         radv_enc_code_fixed_bits(cmd_buffer, rl->flags.ref_pic_list_modification_flag_l0, 1);
         for (unsigned i = 0; i <= num_ref_l0_minus1; i++)
            radv_enc_code_fixed_bits(cmd_buffer, rl->list_entry_l0[i], num_pic_bits);
         if (pic->pic_type == STD_VIDEO_H265_PICTURE_TYPE_B) {
-            unsigned num_ref_l1_minus1 = slice->flags.num_ref_idx_active_override_flag
-                                            ? rl->num_ref_idx_l1_active_minus1
-                                            : pps->num_ref_idx_l1_default_active_minus1;
+            unsigned num_ref_l1_minus1 = 0;
            radv_enc_code_fixed_bits(cmd_buffer, rl->flags.ref_pic_list_modification_flag_l1, 1);
            for (unsigned i = 0; i <= num_ref_l1_minus1; i++)
               radv_enc_code_fixed_bits(cmd_buffer, rl->list_entry_l1[i], num_pic_bits);
@ -1568,6 +1532,7 @@ radv_enc_ctx2(struct radv_cmd_buffer *cmd_buffer, const VkVideoEncodeInfoKHR *in
         metadata_size += RENCODE_AV1_FRAME_CONTEXT_CDF_TABLE_SIZE;
         metadata_size += RENCODE_AV1_CDEF_ALGORITHM_FRAME_CONTEXT_SIZE;
      }
+      metadata_size = align(metadata_size, ENC_ALIGNMENT);

      uint32_t dpb_array_idx = res->baseArrayLayer + dpb_iv->vk.base_array_layer;
      uint64_t luma_va = dpb_img->bindings[0].addr + dpb_array_idx * (luma_size + chroma_size + metadata_size);
@ -1874,7 +1839,7 @@ radv_enc_params(struct radv_cmd_buffer *cmd_buffer, const VkVideoEncodeInfoKHR *
      switch (h264_pic->primary_pic_type) {
      case STD_VIDEO_H264_PICTURE_TYPE_P:
      case STD_VIDEO_H264_PICTURE_TYPE_B:
-         slot_idx = enc_info->pReferenceSlots[0].slotIndex;
+         slot_idx = h264_pic->pRefLists->RefPicList0[0];
         break;
      default:
         break;
@ -1885,7 +1850,7 @@ radv_enc_params(struct radv_cmd_buffer *cmd_buffer, const VkVideoEncodeInfoKHR *
      switch (h265_pic->pic_type) {
      case STD_VIDEO_H265_PICTURE_TYPE_P:
      case STD_VIDEO_H265_PICTURE_TYPE_B:
-         slot_idx = enc_info->pReferenceSlots[0].slotIndex;
+         slot_idx = h265_pic->pRefLists->RefPicList0[0];
         break;
      default:
         break;
@ -1943,6 +1908,12 @@ radv_enc_params_h264(struct radv_cmd_buffer *cmd_buffer, const VkVideoEncodeInfo

   assert(h264_picture_info);

+   unsigned slot_to_ref_idx[RADV_VIDEO_H264_MAX_DPB_SLOTS];
+   memset(slot_to_ref_idx, 0xFF, sizeof(slot_to_ref_idx));
+   for (unsigned idx = 0; idx < enc_info->referenceSlotCount; idx++) {
+      slot_to_ref_idx[enc_info->pReferenceSlots[idx].slotIndex] = idx;
+   }
+
   const StdVideoEncodeH264PictureInfo *h264_pic = h264_picture_info->pStdPictureInfo;
   unsigned slot_idx_0 = 0xffffffff;
   unsigned slot_idx_1 = 0xffffffff;
@ -1951,14 +1922,17 @@ radv_enc_params_h264(struct radv_cmd_buffer *cmd_buffer, const VkVideoEncodeInfo

   switch (h264_pic->primary_pic_type) {
   case STD_VIDEO_H264_PICTURE_TYPE_P:
-      slot_idx_0 = enc_info->pReferenceSlots[0].slotIndex;
-      slot_info_0 = vk_find_struct_const(enc_info->pReferenceSlots[0].pNext, VIDEO_ENCODE_H264_DPB_SLOT_INFO_KHR);
+      slot_idx_0 = h264_pic->pRefLists->RefPicList0[0];
+      slot_info_0 = vk_find_struct_const(enc_info->pReferenceSlots[slot_to_ref_idx[slot_idx_0]].pNext,
+                                         VIDEO_ENCODE_H264_DPB_SLOT_INFO_KHR);
      break;
   case STD_VIDEO_H264_PICTURE_TYPE_B:
-      slot_idx_0 = enc_info->pReferenceSlots[0].slotIndex;
-      slot_idx_1 = enc_info->pReferenceSlots[1].slotIndex;
-      slot_info_0 = vk_find_struct_const(enc_info->pReferenceSlots[0].pNext, VIDEO_ENCODE_H264_DPB_SLOT_INFO_KHR);
-      slot_info_1 = vk_find_struct_const(enc_info->pReferenceSlots[1].pNext, VIDEO_ENCODE_H264_DPB_SLOT_INFO_KHR);
+      slot_idx_0 = h264_pic->pRefLists->RefPicList0[0];
+      slot_idx_1 = h264_pic->pRefLists->RefPicList1[0];
+      slot_info_0 = vk_find_struct_const(enc_info->pReferenceSlots[slot_to_ref_idx[slot_idx_0]].pNext,
+                                         VIDEO_ENCODE_H264_DPB_SLOT_INFO_KHR);
+      slot_info_1 = vk_find_struct_const(enc_info->pReferenceSlots[slot_to_ref_idx[slot_idx_1]].pNext,
+                                         VIDEO_ENCODE_H264_DPB_SLOT_INFO_KHR);
      break;
   default:
      break;
@ -2046,7 +2020,7 @@ radv_enc_params_hevc(struct radv_cmd_buffer *cmd_buffer, const VkVideoEncodeInfo

   switch (h265_pic->pic_type) {
   case STD_VIDEO_H265_PICTURE_TYPE_P:
-      slot_idx_0 = enc_info->pReferenceSlots[0].slotIndex;
+      slot_idx_0 = h265_pic->pRefLists->RefPicList0[0];
      break;
   default:
      break;
@ -2275,6 +2249,7 @@ radv_enc_params_av1(struct radv_cmd_buffer *cmd_buffer, const VkVideoEncodeInfoK
      RADEON_ENC_CS(av1_picture_info->referenceNameSlotIndices[i]);
   RADEON_ENC_CS(slot_idx_0);
   RADEON_ENC_CS(slot_idx_1);
+   RADEON_ENC_CS(av1_picture_info->pStdPictureInfo->order_hint);
   RADEON_ENC_END();
 }

@ -2475,15 +2450,10 @@ radv_enc_av1_obu_instruction(struct radv_cmd_buffer *cmd_buffer, const VkVideoEn
   /*  disable_cdf_update  */
   radv_enc_code_fixed_bits(cmd_buffer, av1_pic->flags.disable_cdf_update, 1);

-   bool allow_screen_content_tools = false;
-   if (seq->flags.reduced_still_picture_header || av1_pic->flags.allow_screen_content_tools) {
-      /*  allow_screen_content_tools  */
-      allow_screen_content_tools = /*av1_pic->av1_spec_misc.palette_mode_enable ||*/
-         av1_pic->flags.force_integer_mv;
-      radv_enc_code_fixed_bits(cmd_buffer, allow_screen_content_tools ? 1 : 0, 1);
-   }
+   if (seq->seq_force_screen_content_tools == STD_VIDEO_AV1_SELECT_SCREEN_CONTENT_TOOLS)
+      radv_enc_code_fixed_bits(cmd_buffer, av1_pic->flags.allow_screen_content_tools, 1);

-   if (allow_screen_content_tools)
+   if (av1_pic->flags.allow_screen_content_tools && seq->seq_force_integer_mv == STD_VIDEO_AV1_SELECT_INTEGER_MV)
      /*  force_integer_mv  */
      radv_enc_code_fixed_bits(cmd_buffer, av1_pic->flags.force_integer_mv, 1);

@ -2530,7 +2500,7 @@ radv_enc_av1_obu_instruction(struct radv_cmd_buffer *cmd_buffer, const VkVideoEn
         /*  render_height_minus_1  */
         radv_enc_code_fixed_bits(cmd_buffer, av1_pic->render_height_minus_1, 16);
      }
-      if (av1_pic->flags.allow_screen_content_tools && av1_pic->flags.force_integer_mv)
+      if (av1_pic->flags.allow_screen_content_tools)
         /*  allow_intrabc  */
         radv_enc_code_fixed_bits(cmd_buffer, 0, 1);
   } else {
@ -2792,7 +2762,7 @@ radv_vcn_encode_video(struct radv_cmd_buffer *cmd_buffer, const VkVideoEncodeInf
   cmd_buffer->video.enc.total_task_size = 0;

   // task info
-   radv_enc_task_info(cmd_buffer, true);
+   radv_enc_task_info(cmd_buffer, feedback_query_va);

   if (vid->enc_need_begin) {
      begin(cmd_buffer, enc_info);
@ -2861,7 +2831,8 @@ radv_vcn_encode_video(struct radv_cmd_buffer *cmd_buffer, const VkVideoEncodeInf

   if (pdev->enc_hw_ver >= RADV_VIDEO_ENC_HW_2) {
      radv_vcn_sq_tail(cs, &cmd_buffer->video.sq);
-      radv_vcn_write_memory(cmd_buffer, feedback_query_va + RADV_ENC_FEEDBACK_STATUS_IDX * sizeof(uint32_t), 1);
+      if (feedback_query_va && radv_video_write_memory_supported(pdev) == RADV_VIDEO_WRITE_MEMORY_SUPPORT_FULL)
+         radv_vcn_write_memory(cmd_buffer, feedback_query_va + RADV_ENC_FEEDBACK_STATUS_IDX * sizeof(uint32_t), 1);
   }
 }

@ -3163,9 +3134,42 @@ radv_video_patch_encode_session_parameters(struct radv_device *device, struct vk
         if (pdev->enc_hw_ver < RADV_VIDEO_ENC_HW_5 ||
             !params->h264_enc.h264_pps[i].base.flags.entropy_coding_mode_flag)
            params->h264_enc.h264_pps[i].base.flags.transform_8x8_mode_flag = 0;
+
+         params->h264_enc.h264_pps[i].base.num_ref_idx_l0_default_active_minus1 = 0;
+         params->h264_enc.h264_pps[i].base.num_ref_idx_l1_default_active_minus1 = 0;
      }
      break;
   case VK_VIDEO_CODEC_OPERATION_ENCODE_H265_BIT_KHR: {
+      for (unsigned i = 0; i < params->h265_enc.h265_sps_count; i++) {
+         uint32_t pic_width_in_luma_samples =
+             params->h265_enc.h265_sps[i].base.pic_width_in_luma_samples;
+         uint32_t pic_height_in_luma_samples =
+             params->h265_enc.h265_sps[i].base.pic_height_in_luma_samples;
+         uint32_t aligned_pic_width = align(pic_width_in_luma_samples, 64);
+         uint32_t aligned_pic_height = align(pic_height_in_luma_samples, 16);
+
+         /* Override the unaligned pic_{width,height} and make up for it with conformance window
+          * cropping */
+         params->h265_enc.h265_sps[i].base.pic_width_in_luma_samples = aligned_pic_width;
+         params->h265_enc.h265_sps[i].base.pic_height_in_luma_samples = aligned_pic_height;
+
+         if (aligned_pic_width != pic_width_in_luma_samples ||
+             aligned_pic_height != pic_height_in_luma_samples) {
+            params->h265_enc.h265_sps[i].base.flags.conformance_window_flag = 1;
+            params->h265_enc.h265_sps[i].base.conf_win_right_offset +=
+               (aligned_pic_width - pic_width_in_luma_samples) / 2;
+            params->h265_enc.h265_sps[i].base.conf_win_bottom_offset +=
+               (aligned_pic_height - pic_height_in_luma_samples) / 2;
+         }
+
+         /* VCN supports only the following block sizes (resulting in 64x64 CTBs with any coding
+          * block size) */
+         params->h265_enc.h265_sps[i].base.log2_min_luma_coding_block_size_minus3 = 0;
+         params->h265_enc.h265_sps[i].base.log2_diff_max_min_luma_coding_block_size = 3;
+         params->h265_enc.h265_sps[i].base.log2_min_luma_transform_block_size_minus2 = 0;
+         params->h265_enc.h265_sps[i].base.log2_diff_max_min_luma_transform_block_size = 3;
+      }
+
      for (unsigned i = 0; i < params->h265_enc.h265_pps_count; i++) {
         /* cu_qp_delta needs to be enabled if rate control is enabled. VCN2 and newer can also enable
          * it with rate control disabled. Since we don't know what rate control will be used, we
@ -3178,6 +3182,9 @@ radv_video_patch_encode_session_parameters(struct radv_device *device, struct vk
         params->h265_enc.h265_pps[i].base.flags.dependent_slice_segments_enabled_flag = 1;
         if (pdev->enc_hw_ver < RADV_VIDEO_ENC_HW_3)
            params->h265_enc.h265_pps[i].base.flags.transform_skip_enabled_flag = 0;
+
+         params->h265_enc.h265_pps[i].base.num_ref_idx_l0_default_active_minus1 = 0;
+         params->h265_enc.h265_pps[i].base.num_ref_idx_l1_default_active_minus1 = 0;
      }
      break;
   }
@ -3268,6 +3275,14 @@ radv_GetEncodedVideoSessionParametersKHR(VkDevice device,
         assert(sps);
         char *data_ptr = pData ? (char *)pData + vps_size : NULL;
         vk_video_encode_h265_sps(sps, size_limit, &sps_size, data_ptr);
+
+         if (pFeedbackInfo) {
+            struct VkVideoEncodeH265SessionParametersFeedbackInfoKHR *h265_feedback_info =
+               vk_find_struct(pFeedbackInfo->pNext, VIDEO_ENCODE_H265_SESSION_PARAMETERS_FEEDBACK_INFO_KHR);
+            pFeedbackInfo->hasOverrides = VK_TRUE;
+            if (h265_feedback_info)
+               h265_feedback_info->hasStdSPSOverrides = VK_TRUE;
+         }
      }
      if (h265_get_info->writeStdPPS) {
         const StdVideoH265PictureParameterSet *pps = vk_video_find_h265_enc_std_pps(templ, h265_get_info->stdPPSId);
@ -3386,17 +3401,20 @@ radv_video_get_enc_dpb_image(struct radv_device *device, const struct VkVideoPro
   }

   for (unsigned i = 0; i < num_reconstructed_pictures; i++) {
-      image->size += luma_size;
-      image->size += chroma_size;
+      unsigned metadata_size = 0;
      if (is_av1) {
-         image->size += RENCODE_AV1_FRAME_CONTEXT_CDF_TABLE_SIZE;
-         image->size += RENCODE_AV1_CDEF_ALGORITHM_FRAME_CONTEXT_SIZE;
+         metadata_size += RENCODE_AV1_FRAME_CONTEXT_CDF_TABLE_SIZE;
+         metadata_size += RENCODE_AV1_CDEF_ALGORITHM_FRAME_CONTEXT_SIZE;
      }
      if (pdev->enc_hw_ver >= RADV_VIDEO_ENC_HW_5) {
-         image->size += RENCODE_MAX_METADATA_BUFFER_SIZE_PER_FRAME;
+         metadata_size += RENCODE_MAX_METADATA_BUFFER_SIZE_PER_FRAME;
         if (has_h264_b_support)
-            image->size += colloc_bytes;
+            metadata_size += colloc_bytes;
      }
+
+      image->size += luma_size;
+      image->size += chroma_size;
+      image->size += align(metadata_size, ENC_ALIGNMENT);
   }
   image->alignment = ENC_ALIGNMENT;
 }
@ -3421,17 +3439,20 @@ radv_video_encode_qp_map_supported(const struct radv_physical_device *pdev)
   return true;
 }

-bool
+enum radv_video_write_memory_support
 radv_video_write_memory_supported(const struct radv_physical_device *pdev)
 {
-   if (pdev->info.vcn_ip_version >= VCN_5_0_0)
-      return true;
-   else if (pdev->info.vcn_ip_version >= VCN_4_0_0)
-      return pdev->info.vcn_enc_minor_version >= 22;
-   else if (pdev->info.vcn_ip_version >= VCN_3_0_0)
-      return pdev->info.vcn_enc_minor_version >= 33;
-   else if (pdev->info.vcn_ip_version >= VCN_2_0_0)
-      return pdev->info.vcn_enc_minor_version >= 24;
-   else /* VCN 1 and UVD */
-      return false;
+   if (pdev->info.vcn_ip_version >= VCN_5_0_0) { /* VCN 5+: no encoder FW version check */
+      return RADV_VIDEO_WRITE_MEMORY_SUPPORT_PCIE_ATOMICS;
+   } else if (pdev->info.vcn_ip_version >= VCN_4_0_0) {
+      if (pdev->info.vcn_enc_minor_version >= 22) /* minimum encoder FW minor version on VCN 4 */
+         return RADV_VIDEO_WRITE_MEMORY_SUPPORT_PCIE_ATOMICS;
+   } else if (pdev->info.vcn_ip_version >= VCN_3_0_0) {
+      if (pdev->info.vcn_enc_minor_version >= 33) /* minimum encoder FW minor version on VCN 3 */
+         return RADV_VIDEO_WRITE_MEMORY_SUPPORT_PCIE_ATOMICS;
+   } else if (pdev->info.vcn_ip_version >= VCN_2_0_0) {
+      if (pdev->info.vcn_enc_minor_version >= 24) /* minimum encoder FW minor version on VCN 2 */
+         return RADV_VIDEO_WRITE_MEMORY_SUPPORT_PCIE_ATOMICS;
+   }
+   return RADV_VIDEO_WRITE_MEMORY_SUPPORT_NONE; /* VCN 1, UVD, or FW below the minimum; FULL is never returned */
 }
--- a/src/amd/vulkan/winsys/null/radv_null_winsys.c
+++ b/src/amd/vulkan/winsys/null/radv_null_winsys.c
@ -158,6 +158,11 @@ radv_null_winsys_query_info(struct radeon_winsys *rws, struct radeon_info *gpu_i
       gpu_info->family == CHIP_RAVEN2 || gpu_info->family == CHIP_RENOIR || gpu_info->gfx_level >= GFX10_3);

   gpu_info->has_gang_submit = true;
+   gpu_info->mesh_fast_launch_2 = gpu_info->gfx_level >= GFX11;
+   gpu_info->hs_offchip_workgroup_dw_size = gpu_info->family == CHIP_HAWAII ? 4096 : 8192;
+   gpu_info->has_ls_vgpr_init_bug = gpu_info->family == CHIP_VEGA10 || gpu_info->family == CHIP_RAVEN;
+   gpu_info->has_graphics = true;
+   gpu_info->ip[AMD_IP_GFX].num_queues = 1;

   gpu_info->gart_page_size = 4096;
 }
--- a/src/asahi/ci/asahi-g13g-fails.txt
+++ b/src/asahi/ci/asahi-g13g-fails.txt
--- a/src/asahi/layout/layout.c
+++ b/src/asahi/layout/layout.c
@ -24,7 +24,8 @@ ail_initialize_linear(struct ail_layout *layout)
   layout->layer_stride_B = align64(
      (uint64_t)layout->linear_stride_B * layout->height_px, AIL_CACHELINE);

-   layout->size_B = layout->layer_stride_B * layout->depth_px;
+   layout->size_B =
+      layout->level_offsets_B[0] + (layout->layer_stride_B * layout->depth_px);
 }

 /*
@ -341,6 +342,7 @@ ail_make_miptree(struct ail_layout *layout)
      assert(layout->linear_stride_B == 0 && "Invalid nonlinear layout");
      assert(layout->levels >= 1 && "Invalid dimensions");
      assert(layout->sample_count_sa >= 1 && "Invalid sample count");
+      assert(layout->level_offsets_B[0] == 0 && "Invalid offset");
   }

   assert(!(layout->writeable_image && layout->compressed) &&
--- a/src/asahi/lib/agx_device_virtio.c
+++ b/src/asahi/lib/agx_device_virtio.c
@ -133,6 +133,7 @@ agx_virtio_bo_bind(struct agx_device *dev, struct drm_asahi_gem_bind_op *ops,
   memcpy(req->payload, ops, payload_size);

   int ret = vdrm_send_req(dev->vdrm, &req->hdr, false);
+   free(req);
   if (ret) {
      fprintf(stderr, "ASAHI_CCMD_GEM_BIND failed: %d\n", ret);
   }
--- a/src/asahi/vulkan/hk_cmd_draw.c
+++ b/src/asahi/vulkan/hk_cmd_draw.c
@ -992,28 +992,34 @@ hk_CmdEndRendering(VkCommandBuffer commandBuffer)
   }
 }

+static void
+hk_init_heap(const void *data) { /* run through util_call_once_data() on dev->heap_init_once */
+   struct hk_cmd_buffer *cmd = (struct hk_cmd_buffer *) data; /* the caller passes its hk_cmd_buffer */
+   struct hk_device *dev = hk_cmd_buffer_device(cmd);
+
+   perf_debug(cmd, "Allocating heap");
+
+   size_t size = 128 * 1024 * 1024; /* 128 MiB */
+   dev->heap = agx_bo_create(&dev->dev, size, 0, 0, "Geometry heap"); /* NOTE(review): used below unchecked */
+
+   /* The geometry state buffer is initialized here and then is treated by
+    * the CPU as rodata, even though the GPU uses it for scratch internally.
+    */
+   off_t off = dev->rodata.heap - dev->rodata.bo->va->addr; /* byte offset of the heap record in the rodata BO */
+   struct agx_heap *map = agx_bo_map(dev->rodata.bo) + off;
+
+   *map = (struct agx_heap){
+      .base = dev->heap->va->addr,
+      .size = size,
+   };
+}
+
 static uint64_t
 hk_heap(struct hk_cmd_buffer *cmd)
 {
   struct hk_device *dev = hk_cmd_buffer_device(cmd);

-   if (unlikely(!dev->heap)) {
-      perf_debug(cmd, "Allocating heap");
-
-      size_t size = 128 * 1024 * 1024;
-      dev->heap = agx_bo_create(&dev->dev, size, 0, 0, "Geometry heap");
-
-      /* The geometry state buffer is initialized here and then is treated by
-       * the CPU as rodata, even though the GPU uses it for scratch internally.
-       */
-      off_t off = dev->rodata.heap - dev->rodata.bo->va->addr;
-      struct agx_heap *map = agx_bo_map(dev->rodata.bo) + off;
-
-      *map = (struct agx_heap){
-         .base = dev->heap->va->addr,
-         .size = size,
-      };
-   }
+   util_call_once_data(&dev->heap_init_once, hk_init_heap, cmd);

   /* We need to free all allocations after each command buffer execution */
   if (!cmd->uses_heap) {
--- a/src/asahi/vulkan/hk_descriptor_set_layout.c
+++ b/src/asahi/vulkan/hk_descriptor_set_layout.c
@ -330,6 +330,7 @@ hk_GetDescriptorSetLayoutSupport(
   uint64_t non_variable_size = 0;
   uint32_t variable_stride = 0;
   uint32_t variable_count = 0;
+   bool variable_is_inline_uniform_block = false;
   uint8_t dynamic_buffer_count = 0;

   for (uint32_t i = 0; i < pCreateInfo->bindingCount; i++) {
@ -362,6 +363,10 @@ hk_GetDescriptorSetLayoutSupport(
             */
            variable_count = MAX2(1, binding->descriptorCount);
            variable_stride = stride;
+
+            variable_is_inline_uniform_block =
+               binding->descriptorType ==
+               VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK;
         } else {
            /* Since we're aligning to the maximum and since this is just a
             * check for whether or not the max buffer size is big enough, we
@ -393,12 +398,21 @@ hk_GetDescriptorSetLayoutSupport(
      switch (ext->sType) {
      case VK_STRUCTURE_TYPE_DESCRIPTOR_SET_VARIABLE_DESCRIPTOR_COUNT_LAYOUT_SUPPORT: {
         VkDescriptorSetVariableDescriptorCountLayoutSupport *vs = (void *)ext;
+         uint32_t max_var_count;
+
         if (variable_stride > 0) {
-            vs->maxVariableDescriptorCount =
+            max_var_count =
               (max_buffer_size - non_variable_size) / variable_stride;
         } else {
-            vs->maxVariableDescriptorCount = 0;
+            max_var_count = 0;
         }
+
+         if (variable_is_inline_uniform_block) {
+            max_var_count =
+               MIN2(max_var_count, HK_MAX_INLINE_UNIFORM_BLOCK_SIZE);
+         }
+
+         vs->maxVariableDescriptorCount = max_var_count;
         break;
      }

--- a/src/asahi/vulkan/hk_device.h
+++ b/src/asahi/vulkan/hk_device.h
@ -92,6 +92,7 @@ struct hk_device {
    * expected to be a legitimate problem. If it is, we can rework later.
    */
   struct agx_bo *heap;
+   util_once_flag heap_init_once;

   struct {
      struct agx_scratch vs, fs, cs;
--- a/src/asahi/vulkan/hk_format.c
+++ b/src/asahi/vulkan/hk_format.c
@ -67,7 +67,7 @@ get_drm_format_modifier_properties_list(
      {
         *out_props = (VkDrmFormatModifierPropertiesEXT){
            .drmFormatModifier = mod,
-            .drmFormatModifierPlaneCount = 1 /* no planar mods */,
+            .drmFormatModifierPlaneCount = vk_format_get_plane_count(vk_format),
            .drmFormatModifierTilingFeatures = flags,
         };
      };
@ -96,7 +96,7 @@ get_drm_format_modifier_properties_list_2(
      {
         *out_props = (VkDrmFormatModifierProperties2EXT){
            .drmFormatModifier = mod,
-            .drmFormatModifierPlaneCount = 1, /* no planar mods */
+            .drmFormatModifierPlaneCount = vk_format_get_plane_count(vk_format),
            .drmFormatModifierTilingFeatures = flags,
         };
      };
--- a/src/asahi/vulkan/hk_image.c
+++ b/src/asahi/vulkan/hk_image.c
@ -1424,6 +1424,13 @@ hk_copy_memory_to_image(struct hk_device *device, struct hk_image *dst_image,
   uint32_t src_height = info->memoryImageHeight ?: extent.height;

   uint32_t blocksize_B = util_format_get_blocksize(layout->format);
+
+   /* Convert width and height to units of format blocks, rounding up */
+   src_width =
+      DIV_ROUND_UP(src_width, util_format_get_blockwidth(layout->format));
+   src_height =
+      DIV_ROUND_UP(src_height, util_format_get_blockheight(layout->format));
+
   uint32_t src_pitch = src_width * blocksize_B;

   unsigned start_layer = (dst_image->vk.image_type == VK_IMAGE_TYPE_3D)
@ -1496,6 +1503,13 @@ hk_copy_image_to_memory(struct hk_device *device, struct hk_image *src_image,
 #endif

   uint32_t blocksize_B = util_format_get_blocksize(layout->format);
+
+   /* Convert width and height to units of format blocks, rounding up */
+   dst_width =
+      DIV_ROUND_UP(dst_width, util_format_get_blockwidth(layout->format));
+   dst_height =
+      DIV_ROUND_UP(dst_height, util_format_get_blockheight(layout->format));
+
   uint32_t dst_pitch = dst_width * blocksize_B;

   unsigned start_layer = (src_image->vk.image_type == VK_IMAGE_TYPE_3D)
@ -1649,11 +1663,6 @@ hk_copy_image_to_image_cpu(struct hk_device *device, struct hk_image *src_image,
            &device->physical_device->ubwc_config);
 #endif
      } else {
-         /* Work tile-by-tile, holding the unswizzled tile in a temporary
-          * buffer.
-          */
-         char temp_tile[16384];
-
         unsigned src_level = info->srcSubresource.mipLevel;
         unsigned dst_level = info->dstSubresource.mipLevel;
         uint32_t block_width = src_layout->tilesize_el[src_level].width_el;
@ -1667,6 +1676,12 @@ hk_copy_image_to_image_cpu(struct hk_device *device, struct hk_image *src_image,
         }

         uint32_t temp_pitch = block_width * src_block_B;
+         size_t temp_tile_size = temp_pitch * (src_offset.y + extent.height);
+
+         /* Work tile-by-tile, holding the unswizzled tile in a temporary
+          * buffer.
+          */
+         char *temp_tile = malloc(temp_tile_size);

         for (unsigned by = src_offset.y / block_height;
              by * block_height < src_offset.y + extent.height; by++) {
@ -1683,14 +1698,14 @@ hk_copy_image_to_image_cpu(struct hk_device *device, struct hk_image *src_image,
                  MIN2((bx + 1) * block_width, src_offset.x + extent.width) -
                  src_x_start;

-               assert(height * temp_pitch <= ARRAY_SIZE(temp_tile));
-
               ail_detile((void *)src, temp_tile, src_layout, src_level,
                          temp_pitch, src_x_start, src_y_start, width, height);
               ail_tile(dst, temp_tile, dst_layout, dst_level, temp_pitch,
                        dst_x_start, dst_y_start, width, height);
            }
         }
+
+         free(temp_tile);
      }
   }
 }
--- a/src/asahi/vulkan/hk_physical_device.c
+++ b/src/asahi/vulkan/hk_physical_device.c
@ -859,7 +859,7 @@ hk_get_device_properties(const struct agx_device *dev,
      .maxSubgroupSize = 32,
      .maxComputeWorkgroupSubgroups = 1024 / 32,
      .requiredSubgroupSizeStages = 0,
-      .maxInlineUniformBlockSize = 1 << 16,
+      .maxInlineUniformBlockSize = HK_MAX_INLINE_UNIFORM_BLOCK_SIZE,
      .maxPerStageDescriptorInlineUniformBlocks = 32,
      .maxPerStageDescriptorUpdateAfterBindInlineUniformBlocks = 32,
      .maxDescriptorSetInlineUniformBlocks = 6 * 32,
@ -953,7 +953,7 @@ hk_get_device_properties(const struct agx_device *dev,
      .robustUniformBufferAccessSizeAlignment = HK_MIN_UBO_ALIGNMENT,

      /* VK_EXT_sample_locations */
-      .sampleLocationSampleCounts = sample_counts,
+      .sampleLocationSampleCounts = sample_counts & ~VK_SAMPLE_COUNT_1_BIT,
      .maxSampleLocationGridSize = (VkExtent2D){1, 1},
      .sampleLocationCoordinateRange[0] = 0.0f,
      .sampleLocationCoordinateRange[1] = 0.9375f,
--- a/src/asahi/vulkan/hk_private.h
+++ b/src/asahi/vulkan/hk_private.h
@ -12,18 +12,19 @@
 #include "vk_log.h"
 #include "vk_util.h"

-#define HK_MAX_SETS                   8
-#define HK_MAX_PUSH_SIZE              256
-#define HK_MAX_DYNAMIC_BUFFERS        64
-#define HK_MAX_RTS                    8
-#define HK_MIN_SSBO_ALIGNMENT         16
-#define HK_MIN_TEXEL_BUFFER_ALIGNMENT 16
-#define HK_MIN_UBO_ALIGNMENT          64
-#define HK_MAX_VIEWPORTS              16
-#define HK_MAX_DESCRIPTOR_SIZE        64
-#define HK_MAX_PUSH_DESCRIPTORS       32
-#define HK_MAX_DESCRIPTOR_SET_SIZE    (1u << 30)
-#define HK_MAX_DESCRIPTORS            (1 << 20)
+#define HK_MAX_SETS                      8
+#define HK_MAX_PUSH_SIZE                 256
+#define HK_MAX_DYNAMIC_BUFFERS           64
+#define HK_MAX_RTS                       8
+#define HK_MIN_SSBO_ALIGNMENT            16
+#define HK_MIN_TEXEL_BUFFER_ALIGNMENT    16
+#define HK_MIN_UBO_ALIGNMENT             64
+#define HK_MAX_VIEWPORTS                 16
+#define HK_MAX_DESCRIPTOR_SIZE           64
+#define HK_MAX_PUSH_DESCRIPTORS          32
+#define HK_MAX_DESCRIPTOR_SET_SIZE       (1u << 30)
+#define HK_MAX_INLINE_UNIFORM_BLOCK_SIZE (1u << 16)
+#define HK_MAX_DESCRIPTORS               (1 << 20)
 #define HK_PUSH_DESCRIPTOR_SET_SIZE                                            \
   (HK_MAX_PUSH_DESCRIPTORS * HK_MAX_DESCRIPTOR_SIZE)
 #define HK_SSBO_BOUNDS_CHECK_ALIGNMENT 4
--- a/src/asahi/vulkan/hk_queue.c
+++ b/src/asahi/vulkan/hk_queue.c
@ -812,11 +812,6 @@ queue_submit(struct hk_device *dev, struct hk_queue *queue,
   /* Now setup the command structs */
   struct util_dynarray payload;
   util_dynarray_init(&payload, NULL);
-   union drm_asahi_cmd *cmds = malloc(sizeof(*cmds) * command_count);
-   if (cmds == NULL) {
-      free(cmds);
-      return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
-   }

   unsigned nr_vdm = 0, nr_cdm = 0;

--- a/src/broadcom/ci/gitlab-ci-inc.yml
+++ b/src/broadcom/ci/gitlab-ci-inc.yml
@ -223,6 +223,7 @@
  tags:
    - farm:igalia
    - dt_gpu:model:$RPI_MODEL
+    - ci-tron:priority:$CI_TRON_JOB_PRIORITY


 # For RPI_KERNEL, see KERNEL in
--- a/src/broadcom/vulkan/v3dv_device.c
+++ b/src/broadcom/vulkan/v3dv_device.c
@ -1582,9 +1582,7 @@ enumerate_devices(struct vk_instance *vk_instance)
         break;
   }

-   assert(primary_fd >= 0);
-
-   if (render_fd < 0)
+   if (render_fd < 0 || primary_fd < 0)
      result = VK_ERROR_INCOMPATIBLE_DRIVER;
   else
      result = create_physical_device(instance, primary_fd, render_fd, display_fd);
--- a/src/c11/impl/threads_posix.c
+++ b/src/c11/impl/threads_posix.c
@ -46,12 +46,13 @@ impl_thrd_routine(void *p)

 /*--------------- 7.25.2 Initialization functions ---------------*/
 // 7.25.2.1
+#ifndef __once_flag_defined
 void
 call_once(once_flag *flag, void (*func)(void))
 {
    pthread_once(flag, func);
 }
-
+#endif

 /*------------- 7.25.3 Condition variable functions -------------*/
 // 7.25.3.1
--- a/src/c11/threads.h
+++ b/src/c11/threads.h
@ -118,8 +118,10 @@ typedef pthread_cond_t  cnd_t;
 typedef pthread_t       thrd_t;
 typedef pthread_key_t   tss_t;
 typedef pthread_mutex_t mtx_t;
+#ifndef __once_flag_defined
 typedef pthread_once_t  once_flag;
 #  define ONCE_FLAG_INIT PTHREAD_ONCE_INIT
+#endif
 #  ifdef PTHREAD_DESTRUCTOR_ITERATIONS
 #    define TSS_DTOR_ITERATIONS PTHREAD_DESTRUCTOR_ITERATIONS
 #  else
--- a/src/compiler/glsl/gl_nir_link_uniform_blocks.c
+++ b/src/compiler/glsl/gl_nir_link_uniform_blocks.c
@ -139,6 +139,7 @@ struct link_uniform_block_active {
   bool has_instance_name;
   bool has_binding;
   bool is_shader_storage;
+   bool block_index_assigned;
 };

 /*
@ -1197,14 +1198,32 @@ link_linked_shader_uniform_blocks(void *mem_ctx,


   if (!prog->data->spirv) {
-      hash_table_foreach(block_hash, entry) {
+      /* Assign block indices in the order they appear in the shader. We could
+       * just loop over the hash table and this would be spec compliant,
+       * however some games seem to incorrectly assume they know the correct
+       * index without checking. So to avoid debugging strange issues anytime
+       * the hash table is modified and the order changes we use this
+       * predictable index allocation instead.
+       */
+      nir_foreach_variable_in_shader(var, shader->Program->nir) {
+         if (block_type == BLOCK_UBO && !nir_variable_is_in_ubo(var))
+            continue;
+
+         if (block_type == BLOCK_SSBO && !nir_variable_is_in_ssbo(var))
+            continue;
+
+         const struct hash_entry *entry =
+            _mesa_hash_table_search(block_hash,
+                                    glsl_get_type_name(var->interface_type));
+
         struct link_uniform_block_active *const b =
            (struct link_uniform_block_active *) entry->data;
+         if (b->block_index_assigned)
+            continue;

         const struct glsl_type *blk_type =
            glsl_without_array(b->var->type) == b->var->interface_type ?
               b->var->type : b->var->interface_type;
-
         if (glsl_type_is_array(blk_type)) {
             char *name =
               ralloc_strdup(NULL,
@ -1221,6 +1240,7 @@ link_linked_shader_uniform_blocks(void *mem_ctx,
                       variables, &variable_index, 0, 0, prog, shader->Stage,
                       block_type);
         }
+         b->block_index_assigned = true;
      }
   } else {
      nir_foreach_variable_in_shader(var, shader->Program->nir) {
--- a/src/compiler/glsl/glcpp/meson.build
+++ b/src/compiler/glsl/glcpp/meson.build
@ -28,10 +28,16 @@ glcpp_lex = custom_target(
  command : [prog_flex, '-o', '@OUTPUT@', '@INPUT@'],
 )

+glcpp_header_gen_deps = declare_dependency(
+  sources : [
+    glcpp_parse[1],
+  ],
+)
+
 libglcpp = static_library(
  'glcpp',
  [glcpp_lex, glcpp_parse, files('glcpp.h', 'pp.c')],
-  dependencies : idep_mesautil,
+  dependencies : [idep_mesautil, glcpp_header_gen_deps],
  include_directories : [inc_include, inc_src, inc_mesa, inc_gallium, inc_gallium_aux],
  c_args : [no_override_init_args, c_msvc_compat_args],
  cpp_args : [cpp_msvc_compat_args],
--- a/src/compiler/glsl/glsl_to_nir.cpp
+++ b/src/compiler/glsl/glsl_to_nir.cpp
@ -2878,12 +2878,25 @@ nir_visitor::visit(ir_dereference_array *ir)
 void
 nir_visitor::visit(ir_barrier *)
 {
-   if (shader->info.stage == MESA_SHADER_COMPUTE) {
+   switch (shader->info.stage) {
+   case MESA_SHADER_COMPUTE:
      nir_barrier(&b, SCOPE_WORKGROUP, SCOPE_WORKGROUP,
-                      NIR_MEMORY_ACQ_REL, nir_var_mem_shared);
-   } else if (shader->info.stage == MESA_SHADER_TESS_CTRL) {
+                  NIR_MEMORY_ACQ_REL, nir_var_mem_shared);
+      break;
+   case MESA_SHADER_TESS_CTRL:
      nir_barrier(&b, SCOPE_WORKGROUP, SCOPE_WORKGROUP,
-                      NIR_MEMORY_ACQ_REL, nir_var_shader_out);
+                  NIR_MEMORY_ACQ_REL, nir_var_shader_out);
+      break;
+   case MESA_SHADER_TASK:
+      nir_barrier(&b, SCOPE_WORKGROUP, SCOPE_WORKGROUP, NIR_MEMORY_ACQ_REL,
+                  nir_var_mem_task_payload | nir_var_mem_shared);
+      break;
+   case MESA_SHADER_MESH:
+      nir_barrier(&b, SCOPE_WORKGROUP, SCOPE_WORKGROUP, NIR_MEMORY_ACQ_REL,
+                  nir_var_shader_out | nir_var_mem_shared);
+      break;
+   default:
+      UNREACHABLE("barrier() not supported in this shader stage");
   }
 }

--- a/src/compiler/nir/nir_control_flow.c
+++ b/src/compiler/nir/nir_control_flow.c
@ -968,7 +968,7 @@ nir_sort_unstructured_blocks(nir_function_impl *impl)

   ralloc_free(blocks);

-   /* Dominance is toast but we indexed blocks as part of this pass. */
-   impl->valid_metadata &= nir_metadata_dominance;
+   /* Most metadata is toast but we indexed blocks as part of this pass. */
+   impl->valid_metadata &= nir_metadata_live_defs;
   impl->valid_metadata |= nir_metadata_block_index;
 }
--- a/src/compiler/nir/nir_divergence_analysis.c
+++ b/src/compiler/nir/nir_divergence_analysis.c
@ -224,6 +224,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
   case nir_intrinsic_load_subgroup_id_shift_ir3:
   case nir_intrinsic_load_base_instance:
   case nir_intrinsic_load_base_vertex:
+   case nir_intrinsic_load_raw_vertex_offset_pan:
   case nir_intrinsic_load_first_vertex:
   case nir_intrinsic_load_draw_id:
   case nir_intrinsic_load_is_indexed_draw:
@ -319,14 +320,10 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
   case nir_intrinsic_load_base_global_invocation_id:
   case nir_intrinsic_load_base_workgroup_id:
   case nir_intrinsic_load_alpha_reference_amd:
-   case nir_intrinsic_load_ubo_uniform_block_intel:
-   case nir_intrinsic_load_ssbo_uniform_block_intel:
-   case nir_intrinsic_load_shared_uniform_block_intel:
   case nir_intrinsic_load_barycentric_optimize_amd:
   case nir_intrinsic_load_poly_line_smooth_enabled:
   case nir_intrinsic_load_rasterization_primitive_amd:
   case nir_intrinsic_unit_test_uniform_amd:
-   case nir_intrinsic_load_global_constant_uniform_block_intel:
   case nir_intrinsic_load_debug_log_desc_amd:
   case nir_intrinsic_load_xfb_state_address_gfx12_amd:
   case nir_intrinsic_cmat_length:
@ -364,6 +361,24 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
      is_divergent = false;
      break;

+   case nir_intrinsic_load_ubo_uniform_block_intel:
+   case nir_intrinsic_load_ssbo_uniform_block_intel:
+   case nir_intrinsic_load_shared_uniform_block_intel:
+   case nir_intrinsic_load_global_constant_uniform_block_intel:
+      if (options & (nir_divergence_across_subgroups |
+                     nir_divergence_multiple_workgroup_per_compute_subgroup)) {
+         unsigned num_srcs = nir_intrinsic_infos[instr->intrinsic].num_srcs;
+         for (unsigned i = 0; i < num_srcs; i++) {
+            if (src_divergent(instr->src[i], state)) {
+               is_divergent = true;
+               break;
+            }
+         }
+      } else {
+         is_divergent = false;
+      }
+      break;
+
   /* This is divergent because it specifically loads sequential values into
    * successive SIMD lanes.
    */
@ -825,6 +840,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
   case nir_intrinsic_load_sample_pos_or_center:
   case nir_intrinsic_load_vertex_id_zero_base:
   case nir_intrinsic_load_vertex_id:
+   case nir_intrinsic_load_raw_vertex_id_pan:
   case nir_intrinsic_load_invocation_id:
   case nir_intrinsic_load_local_invocation_id:
   case nir_intrinsic_load_local_invocation_index:
--- a/src/compiler/nir/nir_lower_io.c
+++ b/src/compiler/nir/nir_lower_io.c
@ -1069,6 +1069,7 @@ nir_get_io_index_src_number(const nir_intrinsic_instr *instr)
   IMG_CASE(atomic):
   IMG_CASE(atomic_swap):
   IMG_CASE(size):
+   IMG_CASE(levels):
   IMG_CASE(samples):
   IMG_CASE(texel_address):
   IMG_CASE(samples_identical):
--- a/src/compiler/nir/nir_lower_printf.c
+++ b/src/compiler/nir/nir_lower_printf.c
@ -76,24 +76,27 @@ lower_printf_intrin(nir_builder *b, nir_intrinsic_instr *prntf, void *_options)
       */
      assert(fmt_str_id - 1 < b->shader->printf_info_count && "must be in-bounds");

+      u_printf_singleton_add(&b->shader->printf_info[fmt_str_id - 1], 1);
      uint32_t hash = u_printf_hash(&b->shader->printf_info[fmt_str_id - 1]);
      fmt_str_id = hash;
   }

-   nir_deref_instr *args = nir_src_as_deref(prntf->src[0]);
-   assert(args->deref_type == nir_deref_type_var);
-
-   /* Atomic add a buffer size counter to determine where to write.  If
+   /* Atomic add a buffer size counter to determine where to write. If
    * overflowed, return -1, otherwise, store the arguments and return 0.
    */
   nir_deref_instr *buffer =
      nir_build_deref_cast(b, buffer_addr, nir_var_mem_global,
-                           glsl_array_type(glsl_uint8_t_type(), 0, 4), 0);
+                           glsl_array_type(glsl_uint8_t_type(), 0, 1), 0);

   /* Align the struct size to 4 */
-   assert(glsl_type_is_struct_or_ifc(args->type));
-   int args_size = align(glsl_get_cl_size(args->type), 4);
+   nir_deref_instr *args = nir_src_as_deref(prntf->src[0]);
+
+   int args_size = 0;
   int fmt_str_id_size = 4;
+   if (args != NULL) {
+      assert(glsl_type_is_struct_or_ifc(args->type));
+      args_size = align(glsl_get_cl_size(args->type), 4);
+   }

   /* Increment the counter at the beginning of the buffer */
   const unsigned counter_size = 4;
@ -130,22 +133,26 @@ lower_printf_intrin(nir_builder *b, nir_intrinsic_instr *prntf, void *_options)
   fmt_str_id_deref->cast.align_mul = 4;
   nir_store_deref(b, fmt_str_id_deref, nir_imm_int(b, fmt_str_id), ~0);

-   /* Write the format args */
-   for (unsigned i = 0; i < glsl_get_length(args->type); ++i) {
-      nir_deref_instr *arg_deref = nir_build_deref_struct(b, args, i);
-      nir_def *arg = nir_load_deref(b, arg_deref);
-      const struct glsl_type *arg_type = arg_deref->type;
+   if (args != NULL) {
+      assert(args->deref_type == nir_deref_type_var);

-      unsigned field_offset = glsl_get_struct_field_offset(args->type, i);
-      nir_def *arg_offset =
-         nir_iadd_imm(b, offset, fmt_str_id_size + field_offset);
-      nir_deref_instr *dst_arg_deref =
-         nir_build_deref_array(b, buffer, arg_offset);
-      dst_arg_deref = nir_build_deref_cast(b, &dst_arg_deref->def,
+      /* Write the format args */
+      for (unsigned i = 0; i < glsl_get_length(args->type); ++i) {
+         nir_deref_instr *arg_deref = nir_build_deref_struct(b, args, i);
+         nir_def *arg = nir_load_deref(b, arg_deref);
+         const struct glsl_type *arg_type = arg_deref->type;
+
+         unsigned field_offset = glsl_get_struct_field_offset(args->type, i);
+         nir_def *arg_offset =
+            nir_iadd_imm(b, offset, fmt_str_id_size + field_offset);
+         nir_deref_instr *dst_arg_deref =
+            nir_build_deref_array(b, buffer, arg_offset);
+         dst_arg_deref = nir_build_deref_cast(b, &dst_arg_deref->def,
                                           nir_var_mem_global, arg_type, 0);
-      assert(field_offset % 4 == 0);
-      dst_arg_deref->cast.align_mul = 4;
-      nir_store_deref(b, dst_arg_deref, arg, ~0);
+         assert(field_offset % 4 == 0);
+         dst_arg_deref->cast.align_mul = 4;
+         nir_store_deref(b, dst_arg_deref, arg, ~0);
+      }
   }

   nir_push_else(b, NULL);
--- a/src/compiler/nir/nir_lower_shader_calls.c
+++ b/src/compiler/nir/nir_lower_shader_calls.c
@ -1228,8 +1228,16 @@ wrap_instr(nir_builder *b, nir_instr *instr, void *data)
 static bool
 wrap_instrs(nir_shader *shader, wrap_instr_callback callback)
 {
-   return nir_shader_instructions_pass(shader, wrap_instr,
-                                       nir_metadata_none, callback);
+   bool progress = nir_shader_instructions_pass(shader, wrap_instr,
+                                                nir_metadata_none, callback);
+   /* Wrapping jump instructions that are located inside ifs can break SSA
+    * invariants because the else block no longer dominates the merge block.
+    * Repair the SSA to make the validator happy again.
+    */
+   if (progress)
+      nir_repair_ssa(shader);
+
+   return progress;
 }

 static bool
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@ -4096,9 +4096,9 @@ distribute_src_mods = [
   (('fneg', ('fmul(is_used_once)', a, b)), ('fmul', ('fneg', a), b)),
   (('fabs', ('fmul(is_used_once)', a, b)), ('fmul', ('fabs', a), ('fabs', b))),

-   (('fneg', ('ffma(is_used_once)', a, b, c)), ('ffma', ('fneg', a), b, ('fneg', c))),
+   (('fneg', ('ffma(is_used_once,nsz)', a, b, c)), ('ffma', ('fneg', a), b, ('fneg', c))),
   (('fneg', ('flrp(is_used_once)', a, b, c)), ('flrp', ('fneg', a), ('fneg', b), c)),
-   (('fneg', ('~fadd(is_used_once)', a, b)), ('fadd', ('fneg', a), ('fneg', b))),
+   (('fneg', ('fadd(is_used_once,nsz)', a, b)), ('fadd', ('fneg', a), ('fneg', b))),

   # Note that fmin <-> fmax.  I don't think there is a way to distribute
   # fabs() into fmin or fmax.
--- a/src/compiler/nir/nir_opt_large_constants.c
+++ b/src/compiler/nir/nir_opt_large_constants.c
@ -481,6 +481,10 @@ nir_opt_large_constants(nir_shader *shader,
      /* Fix up indices after we sorted. */
      info->var->index = i;

+      /* Don't bother with dead variables. */
+      if (info->constant_data_size == 0)
+         info->is_constant = false;
+
      if (!info->is_constant)
         continue;

@ -529,6 +533,17 @@ nir_opt_large_constants(nir_shader *shader,

   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
+         if (instr->type == nir_instr_type_deref) {
+            /* Ensure all derefs accessing the lowered arrays get removed. */
+            nir_deref_instr *deref = nir_instr_as_deref(instr);
+            if (!nir_deref_mode_is(deref, nir_var_function_temp))
+               continue;
+
+            nir_variable *var = nir_deref_instr_get_variable(deref);
+            if (var && var_infos[var->index].is_constant)
+               nir_deref_instr_remove_if_unused(deref);
+         }
+
         if (instr->type != nir_instr_type_intrinsic)
            continue;

--- a/src/compiler/nir/nir_opt_shrink_stores.c
+++ b/src/compiler/nir/nir_opt_shrink_stores.c
@ -82,7 +82,9 @@ opt_shrink_store_instr(nir_builder *b, nir_intrinsic_instr *instr, bool shrink_i

   /* Trim the num_components stored according to the write mask. */
   unsigned write_mask = nir_intrinsic_write_mask(instr);
-   unsigned last_bit = util_last_bit(write_mask);
+   /* Don't trim down to an invalid number of components, though. */
+   unsigned last_bit = nir_round_up_components(util_last_bit(write_mask));
+
   if (last_bit < instr->num_components) {
      nir_def *def = nir_trim_vector(b, instr->src[0].ssa, last_bit);
      nir_src_rewrite(&instr->src[0], def);
--- a/src/compiler/nir/nir_precompiled.h
+++ b/src/compiler/nir/nir_precompiled.h
@ -652,6 +652,7 @@ nir_precompiled_build_variant(const nir_function *libfunc,

   assert(libfunc->workgroup_size[0] != 0 && "must set workgroup size");

+   b.shader->info.workgroup_size_variable = false;
   b.shader->info.workgroup_size[0] = libfunc->workgroup_size[0];
   b.shader->info.workgroup_size[1] = libfunc->workgroup_size[1];
   b.shader->info.workgroup_size[2] = libfunc->workgroup_size[2];
--- a/src/compiler/nir/nir_sweep.c
+++ b/src/compiler/nir/nir_sweep.c
@ -164,6 +164,12 @@ sweep_impl(nir_shader *nir, nir_function_impl *impl)

   /* Wipe out all the metadata, if any. */
   nir_progress(true, impl, nir_metadata_none);
+
+   /* These will be reallocated if needed. NULL them out so we don't
+    * use-after-free later.
+    */
+   impl->dom_lca_info.table.table = NULL;
+   impl->dom_lca_info.block_from_idx = NULL;
 }

 static void
--- a/src/compiler/spirv/spirv_to_nir.c
+++ b/src/compiler/spirv/spirv_to_nir.c
@ -827,6 +827,7 @@ vtn_handle_debug_printf(struct vtn_builder *b, SpvOp ext_opcode,

   struct vtn_value *format = vtn_value(b, w[5], vtn_value_type_string);

+   b->shader->info.uses_printf = true;
   b->shader->printf_info_count++;
   b->shader->printf_info = reralloc(b->shader,
                                     b->shader->printf_info,
@ -844,7 +845,7 @@ vtn_handle_debug_printf(struct vtn_builder *b, SpvOp ext_opcode,
      .string_size = strlen(format->str) + 1,
   };

-   uint32_t info_index = b->shader->printf_info_count - 1;
+   uint32_t info_index = b->shader->printf_info_count;

   if (argc) {
      glsl_struct_field *fields = calloc(argc, sizeof(glsl_struct_field));
@ -4791,22 +4792,30 @@ vtn_vector_construct(struct vtn_builder *b, unsigned num_components,
   return &vec->def;
 }

+/*
+ * Creates a copy of `src`, reinterpreting it as `dest_type`.
+ */
 static struct vtn_ssa_value *
-vtn_composite_copy(struct vtn_builder *b, struct vtn_ssa_value *src)
+vtn_composite_copy_logical(struct vtn_builder *b, struct vtn_ssa_value *src, struct vtn_type* dest_type)
 {
   assert(!src->is_variable);

   struct vtn_ssa_value *dest = vtn_zalloc(b, struct vtn_ssa_value);
-   dest->type = src->type;
+   dest->type = glsl_get_bare_type(dest_type->type);

-   if (glsl_type_is_vector_or_scalar(src->type)) {
+   if (glsl_type_is_vector_or_scalar(dest_type->type)) {
      dest->def = src->def;
   } else {
-      unsigned elems = glsl_get_length(src->type);
-
+      unsigned elems = glsl_get_length(dest_type->type);
      dest->elems = vtn_alloc_array(b, struct vtn_ssa_value *, elems);
-      for (unsigned i = 0; i < elems; i++)
-         dest->elems[i] = vtn_composite_copy(b, src->elems[i]);
+
+      if (glsl_type_is_struct(dest_type->type) || glsl_type_is_interface(dest_type->type)) {
+         for (unsigned i = 0; i < elems; i++)
+            dest->elems[i] = vtn_composite_copy_logical(b, src->elems[i], dest_type->members[i]);
+      } else {
+         for (unsigned i = 0; i < elems; i++)
+            dest->elems[i] = vtn_composite_copy_logical(b, src->elems[i], dest_type->array_element);
+      }
   }

   return dest;
@ -4814,13 +4823,14 @@ vtn_composite_copy(struct vtn_builder *b, struct vtn_ssa_value *src)

 static struct vtn_ssa_value *
 vtn_composite_insert(struct vtn_builder *b, struct vtn_ssa_value *src,
-                     struct vtn_ssa_value *insert, const uint32_t *indices,
-                     unsigned num_indices)
+                     struct vtn_type *src_type, struct vtn_ssa_value *insert,
+                     const uint32_t *indices, unsigned num_indices)
 {
   if (glsl_type_is_cmat(src->type))
      return vtn_cooperative_matrix_insert(b, src, insert, indices, num_indices);

-   struct vtn_ssa_value *dest = vtn_composite_copy(b, src);
+   /* Straight copy, use the source type as the destination type. */
+   struct vtn_ssa_value *dest = vtn_composite_copy_logical(b, src, src_type);

   struct vtn_ssa_value *cur = dest;
   unsigned i;
@ -4963,15 +4973,15 @@ vtn_handle_composite(struct vtn_builder *b, SpvOp opcode,

   case SpvOpCompositeInsert:
      ssa = vtn_composite_insert(b, vtn_ssa_value(b, w[4]),
+                                 vtn_get_value_type(b, w[4]),
                                 vtn_ssa_value(b, w[3]),
                                 w + 5, count - 5);
      break;

   case SpvOpCopyLogical: {
-      ssa = vtn_composite_copy(b, vtn_ssa_value(b, w[3]));
-      struct vtn_type *dst_type = vtn_get_value_type(b, w[2]);
-      vtn_assert(vtn_types_compatible(b, type, dst_type));
-      ssa->type = glsl_get_bare_type(dst_type->type);
+      struct vtn_type *dest_type = vtn_get_value_type(b, w[2]);
+      vtn_assert(vtn_types_compatible(b, vtn_get_value_type(b, w[3]), dest_type));
+      ssa = vtn_composite_copy_logical(b, vtn_ssa_value(b, w[3]), dest_type);
      break;
   }
   case SpvOpCopyObject:
--- a/src/compiler/spirv/vtn_bindgen2.c
+++ b/src/compiler/spirv/vtn_bindgen2.c
@ -329,6 +329,7 @@ main(int argc, char **argv)
   }

   glsl_type_singleton_init_or_ref();
+   u_printf_singleton_init_or_ref();

   for (unsigned i = 0; i < 2; ++i) {
      FILE *fp = i ? fp_c : fp_h;
@ -456,6 +457,7 @@ main(int argc, char **argv)
   fprintf(fp_c, "   static vtn_bindgen_dummy vtn_bindgen_dummy_instance;\n");
   fprintf(fp_c, "}\n");

+   u_printf_singleton_decref();
   glsl_type_singleton_decref();
   fclose(fp_c);
   fclose(fp_h);
--- a/src/compiler/spirv/vtn_variables.c
+++ b/src/compiler/spirv/vtn_variables.c
@ -506,8 +506,8 @@ vtn_pointer_dereference(struct vtn_builder *b,
            type = type->array_element;
         }
         tail = nir_build_deref_array(&b->nb, tail, arr_index);
+         tail->arr.in_bounds = deref_chain->in_bounds;
      }
-      tail->arr.in_bounds = deref_chain->in_bounds;

      access |= type->access;
   }
--- a/src/egl/drivers/dri2/egl_dri2.c
+++ b/src/egl/drivers/dri2/egl_dri2.c
@ -542,8 +542,9 @@ dri2_detect_swrast_kopper(_EGLDisplay *disp)
 {
   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);

+   /* Kopper won't work on Android without extra platform level support. */
   dri2_dpy->kopper = dri2_dpy->driver_name && !strcmp(dri2_dpy->driver_name, "zink") &&
-                      !debug_get_bool_option("LIBGL_KOPPER_DISABLE", false);
+                      !debug_get_bool_option("LIBGL_KOPPER_DISABLE", false) && disp->Platform != _EGL_PLATFORM_ANDROID;
   dri2_dpy->swrast = (disp->Options.ForceSoftware && !dri2_dpy->kopper && strcmp(dri2_dpy->driver_name, "vmwgfx")) ||
                      !dri2_dpy->driver_name || strstr(dri2_dpy->driver_name, "swrast");
   dri2_dpy->swrast_not_kms = dri2_dpy->swrast && (!dri2_dpy->driver_name || strcmp(dri2_dpy->driver_name, "kms_swrast"));
--- a/src/etnaviv/ci/gitlab-ci-inc.yml
+++ b/src/etnaviv/ci/gitlab-ci-inc.yml
@ -58,6 +58,7 @@
    FDO_CI_CONCURRENT: 2
  tags:
    - farm:$RUNNER_FARM_LOCATION
+    - ci-tron:priority:$CI_TRON_JOB_PRIORITY
    - cpu:cores:2
    - $VIVANTE_MODEL_TAG
    - $VIVANTE_REVISION_TAG
@ -70,6 +71,7 @@
    FDO_CI_CONCURRENT: 4
  tags:
    - farm:$RUNNER_FARM_LOCATION
+    - ci-tron:priority:$CI_TRON_JOB_PRIORITY
    - cpu:cores:4
    - $VIVANTE_MODEL_TAG
    - $VIVANTE_REVISION_TAG
@ -86,6 +88,7 @@
    GPU_VERSION: "etnaviv-gc3000-r5450"
  tags:
    - farm:$RUNNER_FARM_LOCATION
+    - ci-tron:priority:$CI_TRON_JOB_PRIORITY
    - $VIVANTE_MODEL_TAG
    - $VIVANTE_REVISION_TAG

@ -101,6 +104,7 @@
    GPU_VERSION: "etnaviv-gc7000-r6214"
  tags:
    - farm:$RUNNER_FARM_LOCATION
+    - ci-tron:priority:$CI_TRON_JOB_PRIORITY
    - $VIVANTE_MODEL_TAG
    - $VIVANTE_REVISION_TAG

@ -115,4 +119,3 @@
    BOOT_METHOD: barebox
    RUNNER_TAG: mesa-ci-x86-64-lava-imx8mp-tqma8mpql-mba8mpxl
    VISIBILITY_GROUP: "mesa-ci"
-
--- a/src/freedreno/fdl/fd6_tiled_memcpy.cc
+++ b/src/freedreno/fdl/fd6_tiled_memcpy.cc
@ -564,7 +564,7 @@ tiled_to_linear_2cpp(char *_tiled, char *_linear, uint32_t linear_pitch)
            "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
   }
 #else
-   memcpy_small<2, LINEAR_TO_TILED, FDL_MACROTILE_4_CHANNEL>(
+   memcpy_small<2, TILED_TO_LINEAR, FDL_MACROTILE_4_CHANNEL>(
      0, 0, 32, 4, _tiled, _linear, linear_pitch, 0, 0, 0);
 #endif
 }
--- a/src/freedreno/ir3/ir3_ra.c
+++ b/src/freedreno/ir3/ir3_ra.c
@ -2300,6 +2300,17 @@ insert_live_out_moves(struct ra_ctx *ctx)
   insert_file_live_out_moves(ctx, &ctx->shared);
 }

+static bool
+has_merge_set_preferred_reg(struct ir3_register *reg)
+{
+   assert(reg->merge_set);
+   assert(reg->num != INVALID_REG);
+
+   return reg->merge_set->preferred_reg != (physreg_t)~0 &&
+          ra_reg_get_physreg(reg) ==
+             reg->merge_set->preferred_reg + reg->merge_set_offset;
+}
+
 static void
 handle_block(struct ra_ctx *ctx, struct ir3_block *block)
 {
@ -2338,17 +2349,15 @@ handle_block(struct ra_ctx *ctx, struct ir3_block *block)
         struct ir3_register *dst = input->dsts[0];
         assert(dst->num != INVALID_REG);

-         physreg_t dst_start = ra_reg_get_physreg(dst);
         physreg_t dst_end;

-         if (dst->merge_set) {
+         if (dst->merge_set && has_merge_set_preferred_reg(dst)) {
            /* Take the whole merge set into account to prevent its range being
             * allocated for defs not part of the merge set.
             */
-            assert(dst_start >= dst->merge_set_offset);
-            dst_end = dst_start - dst->merge_set_offset + dst->merge_set->size;
+            dst_end = dst->merge_set->preferred_reg + dst->merge_set->size;
         } else {
-            dst_end = dst_start + reg_size(dst);
+            dst_end = ra_reg_get_physreg(dst) + reg_size(dst);
         }

         struct ra_file *file = ra_get_file(ctx, dst);
--- a/src/freedreno/vulkan/tu_clear_blit.cc
+++ b/src/freedreno/vulkan/tu_clear_blit.cc
@ -1461,6 +1461,15 @@ r3d_dst_gmem(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
      gmem_offset = tu_attachment_gmem_offset(cmd, att, layer);
   }

+   /* On a7xx we must always use FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8. See
+    * blit_base_format().
+    */
+   if (CHIP >= A7XX && att->format == VK_FORMAT_D24_UNORM_S8_UINT) {
+      RB_MRT_BUF_INFO = pkt_field_set(A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT,
+                                      RB_MRT_BUF_INFO,
+                                      FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8);
+   }
+
   tu_cs_emit_regs(cs,
                   RB_MRT_BUF_INFO(CHIP, 0, .dword = RB_MRT_BUF_INFO),
                   A6XX_RB_MRT_PITCH(0, 0),
@ -1533,7 +1542,8 @@ r3d_setup(struct tu_cmd_buffer *cmd,
      tu_cs_emit_call(cs, cmd->device->dbg_renderpass_stomp_cs);
   }

-   enum a6xx_format fmt = blit_base_format<CHIP>(dst_format, ubwc, false);
+   enum a6xx_format fmt = blit_base_format<CHIP>(dst_format, ubwc, 
+                                                 blit_param & R3D_DST_GMEM);
   fixup_dst_format(src_format, &dst_format, &fmt);

   if (!cmd->state.pass) {
@ -4638,7 +4648,7 @@ clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
   enum pipe_format format = vk_format_to_pipe_format(vk_format);
   const struct tu_framebuffer *fb = cmd->state.framebuffer;
   const struct tu_image_view *iview = cmd->state.attachments[a];
-   const uint32_t clear_views = cmd->state.pass->attachments[a].clear_views;
+   const uint32_t clear_views = cmd->state.pass->attachments[a].used_views;
   const struct blit_ops *ops = &r2d_ops<CHIP>;
   const VkClearValue *value = &cmd->state.clear_values[a];
   if (cmd->state.pass->attachments[a].samples > 1)
@ -4734,7 +4744,7 @@ tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,

   tu_emit_clear_gmem_attachment<CHIP>(cmd, cs, resolve_group, a, 0,
                                 cmd->state.framebuffer->layers,
-                                 attachment->clear_views,
+                                 attachment->used_views,
                                 attachment->clear_mask,
                                 &cmd->state.clear_values[a], NULL);
 }
@ -4755,7 +4765,7 @@ tu7_generic_clear_attachment(struct tu_cmd_buffer *cmd,
                             iview->view.ubwc_enabled, att->samples);

   enum pipe_format format = vk_format_to_pipe_format(att->format);
-   for_each_layer(i, att->clear_views, cmd->state.framebuffer->layers) {
+   for_each_layer(i, att->used_views, cmd->state.framebuffer->layers) {
      uint32_t layer = i + 0;
      uint32_t mask =
         aspect_write_mask_generic_clear(format, att->clear_mask);
@ -4836,7 +4846,7 @@ tu_emit_blit(struct tu_cmd_buffer *cmd,
   uint32_t buffer_id = tu_resolve_group_include_buffer<CHIP>(resolve_group, format);
   event_blit_setup(cs, buffer_id, attachment, blit_event_type, clear_mask);

-   for_each_layer(i, attachment->clear_views, cmd->state.framebuffer->layers) {
+   for_each_layer(i, attachment->used_views, cmd->state.framebuffer->layers) {
      event_blit_dst_view blt_view = blt_view_from_tu_view(iview, i);
      event_blit_run<CHIP>(cmd, cs, attachment, &blt_view, separate_stencil);
   }
@ -4951,7 +4961,7 @@ load_3d_blit(struct tu_cmd_buffer *cmd,
   /* Wait for CACHE_INVALIDATE to land */
   tu_cs_emit_wfi(cs);

-   for_each_layer(i, att->clear_views, cmd->state.framebuffer->layers) {
+   for_each_layer(i, att->used_views, cmd->state.framebuffer->layers) {
      if (cmd->state.pass->has_fdm) {
         struct apply_load_coords_state state = {
            .view = i,
--- a/src/freedreno/vulkan/tu_cmd_buffer.cc
+++ b/src/freedreno/vulkan/tu_cmd_buffer.cc
@ -588,7 +588,7 @@ tu6_emit_mrt(struct tu_cmd_buffer *cmd,
   }

   u_foreach_bit (i, ~written) {
-      if (i >= subpass->color_count)
+      if (i >= MAX_RTS)
         break;

      /* From the VkPipelineRenderingCreateInfo definition:
@ -602,6 +602,10 @@ tu6_emit_mrt(struct tu_cmd_buffer *cmd,
       * here should prevent them from writing to anything. This also seems
       * to also be required for alpha-to-coverage which can use the alpha
       * value for an otherwise-unused attachment.
+       *
+       * With VK_EXT_dynamic_rendering_unused_attachments, pipelines may also
+       * write to attachments beyond those that exist in the render pass, so
+       * we must do this for all attachments not written, up to MAX_RTS.
       */
       tu_cs_emit_regs(cs,
         RB_MRT_BUF_INFO(CHIP, i),
@ -1616,7 +1620,7 @@ tu6_emit_gmem_stores(struct tu_cmd_buffer *cmd,
            scissor_emitted = true;
         }
         tu_store_gmem_attachment<CHIP>(cmd, cs, resolve_group, a, a,
-                                  fb->layers, subpass->multiview_mask,
+                                  fb->layers, att->used_views,
                                  cond_exec_allowed);
      }
   }
@ -6868,7 +6872,7 @@ tu6_draw_common(struct tu_cmd_buffer *cmd,
   struct tu_render_pass_state *rp = &cmd->state.rp;

   trace_start_draw(
-      &cmd->trace, &cmd->draw_cs, cmd, draw_count,
+      &cmd->rp_trace, &cmd->draw_cs, cmd, draw_count,
      cmd->state.program.stage_sha1[MESA_SHADER_VERTEX],
      cmd->state.program.stage_sha1[MESA_SHADER_TESS_CTRL],
      cmd->state.program.stage_sha1[MESA_SHADER_TESS_EVAL],
@ -7316,7 +7320,7 @@ tu_CmdDraw(VkCommandBuffer commandBuffer,
   tu_cs_emit(cs, instanceCount);
   tu_cs_emit(cs, vertexCount);

-   trace_end_draw(&cmd->trace, cs);
+   trace_end_draw(&cmd->rp_trace, cs);
 }
 TU_GENX(tu_CmdDraw);

@ -7365,7 +7369,7 @@ tu_CmdDrawMultiEXT(VkCommandBuffer commandBuffer,
   }

   if (i != 0)
-      trace_end_draw(&cmd->trace, cs);
+      trace_end_draw(&cmd->rp_trace, cs);
 }
 TU_GENX(tu_CmdDrawMultiEXT);

@ -7393,7 +7397,7 @@ tu_CmdDrawIndexed(VkCommandBuffer commandBuffer,
   tu_cs_emit_qw(cs, cmd->state.index_va);
   tu_cs_emit(cs, cmd->state.max_index_count);

-   trace_end_draw(&cmd->trace, cs);
+   trace_end_draw(&cmd->rp_trace, cs);
 }
 TU_GENX(tu_CmdDrawIndexed);

@ -7447,7 +7451,7 @@ tu_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer,
   }

   if (i != 0)
-      trace_end_draw(&cmd->trace, cs);
+      trace_end_draw(&cmd->rp_trace, cs);
 }
 TU_GENX(tu_CmdDrawMultiIndexedEXT);

@ -7492,7 +7496,7 @@ tu_CmdDrawIndirect(VkCommandBuffer commandBuffer,
   tu_cs_emit_qw(cs, vk_buffer_address(&buf->vk, offset));
   tu_cs_emit(cs, stride);

-   trace_end_draw(&cmd->trace, cs);
+   trace_end_draw(&cmd->rp_trace, cs);
 }
 TU_GENX(tu_CmdDrawIndirect);

@ -7525,7 +7529,7 @@ tu_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
   tu_cs_emit_qw(cs, vk_buffer_address(&buf->vk, offset));
   tu_cs_emit(cs, stride);

-   trace_end_draw(&cmd->trace, cs);
+   trace_end_draw(&cmd->rp_trace, cs);
 }
 TU_GENX(tu_CmdDrawIndexedIndirect);

@ -7564,7 +7568,7 @@ tu_CmdDrawIndirectCount(VkCommandBuffer commandBuffer,
   tu_cs_emit_qw(cs, vk_buffer_address(&count_buf->vk, countBufferOffset));
   tu_cs_emit(cs, stride);

-   trace_end_draw(&cmd->trace, cs);
+   trace_end_draw(&cmd->rp_trace, cs);
 }
 TU_GENX(tu_CmdDrawIndirectCount);

@ -7600,7 +7604,7 @@ tu_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer,
   tu_cs_emit_qw(cs, vk_buffer_address(&count_buf->vk, countBufferOffset));
   tu_cs_emit(cs, stride);

-   trace_end_draw(&cmd->trace, cs);
+   trace_end_draw(&cmd->rp_trace, cs);
 }
 TU_GENX(tu_CmdDrawIndexedIndirectCount);

@ -7644,7 +7648,7 @@ tu_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer,
   tu_cs_emit(cs, counterOffset);
   tu_cs_emit(cs, vertexStride);

-   trace_end_draw(&cmd->trace, cs);
+   trace_end_draw(&cmd->rp_trace, cs);
 }
 TU_GENX(tu_CmdDrawIndirectByteCountEXT);

--- a/src/freedreno/vulkan/tu_descriptor_set.cc
+++ b/src/freedreno/vulkan/tu_descriptor_set.cc
@ -208,8 +208,8 @@ tu_CreateDescriptorSetLayout(
      if (binding->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
         set_layout->has_inline_uniforms = true;

-      if (variable_flags && binding->binding < variable_flags->bindingCount &&
-          (variable_flags->pBindingFlags[binding->binding] &
+      if (variable_flags && j < variable_flags->bindingCount &&
+          (variable_flags->pBindingFlags[j] &
           VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT)) {
         assert(!binding->pImmutableSamplers); /* Terribly ill defined  how
                                                  many samplers are valid */
@ -377,7 +377,7 @@ tu_GetDescriptorSetLayoutSupport(
      uint64_t max_count = MAX_SET_SIZE;
      unsigned descriptor_count = binding->descriptorCount;
      if (binding->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
-         max_count = MAX_SET_SIZE - size;
+         max_count = MAX_INLINE_UBO_RANGE - size;
         descriptor_count = descriptor_sz;
         descriptor_sz = 1;
      } else if (descriptor_sz) {
@ -388,9 +388,9 @@ tu_GetDescriptorSetLayoutSupport(
         supported = false;
      }

-      if (variable_flags && binding->binding < variable_flags->bindingCount &&
+      if (variable_flags && i < variable_flags->bindingCount &&
          variable_count &&
-          (variable_flags->pBindingFlags[binding->binding] &
+          (variable_flags->pBindingFlags[i] &
           VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT)) {
         variable_count->maxVariableDescriptorCount =
            MIN2(UINT32_MAX, max_count);
--- a/src/freedreno/vulkan/tu_pass.cc
+++ b/src/freedreno/vulkan/tu_pass.cc
@ -417,7 +417,8 @@ tu_render_pass_patch_input_gmem(struct tu_render_pass *pass)
         uint32_t a = subpass->input_attachments[j].attachment;
         if (a == VK_ATTACHMENT_UNUSED)
            continue;
-         subpass->input_attachments[j].patch_input_gmem = written[a];
+         subpass->input_attachments[j].patch_input_gmem =
+            written[a] && pass->attachments[a].gmem;
      }

      for (unsigned j = 0; j < subpass->color_count; j++) {
@ -884,7 +885,7 @@ tu_subpass_use_attachment(struct tu_render_pass *pass, int i, uint32_t a, const

   att->gmem = true;
   update_samples(subpass, pCreateInfo->pAttachments[a].samples);
-   att->clear_views |= subpass->multiview_mask;
+   att->used_views |= subpass->multiview_mask;

   /* Loads and clears are emitted at the start of the subpass that needs them. */
   att->first_subpass_idx = MIN2(i, att->first_subpass_idx);
@ -1126,6 +1127,7 @@ tu_CreateRenderPass2(VkDevice _device,
      if (!att->gmem) {
         att->clear_mask = 0;
         att->load = false;
+         att->load_stencil = false;
      }
   }

@ -1235,7 +1237,7 @@ tu_setup_dynamic_render_pass(struct tu_cmd_buffer *cmd_buffer,
      VK_FROM_HANDLE(tu_image_view, view, att_info->imageView);
      tu_setup_dynamic_attachment(att, view);
      att->gmem = true;
-      att->clear_views = info->viewMask;
+      att->used_views = info->viewMask;
      attachment_set_ops(device, att, att_info->loadOp,
                         VK_ATTACHMENT_LOAD_OP_DONT_CARE, att_info->storeOp,
                         VK_ATTACHMENT_STORE_OP_DONT_CARE);
@ -1279,7 +1281,7 @@ tu_setup_dynamic_render_pass(struct tu_cmd_buffer *cmd_buffer,
         struct tu_render_pass_attachment *att = &pass->attachments[a];
         tu_setup_dynamic_attachment(att, view);
         att->gmem = true;
-         att->clear_views = info->viewMask;
+         att->used_views = info->viewMask;
         subpass->depth_stencil_attachment.attachment = a++;
         subpass->input_attachments[0].attachment =
            subpass->depth_stencil_attachment.attachment;
--- a/src/freedreno/vulkan/tu_pass.h
+++ b/src/freedreno/vulkan/tu_pass.h
@ -94,7 +94,19 @@ struct tu_render_pass_attachment
   VkSampleCountFlagBits samples;
   uint32_t cpp;
   VkImageAspectFlags clear_mask;
-   uint32_t clear_views;
+
+   /* All views that are used with the attachment in all subpasses. Used to
+    * determine which views to apply loadOp/storeOp to.
+    */
+   uint32_t used_views;
+   /* The internal MSRTSS attachment to clear when the user says to clear
+    * this attachment. Clear values must be remapped to this attachment.
+    */
+   uint32_t remapped_clear_att;
+   /* For internal attachments created for MSRTSS, the original user attachment
+    * which it is resolved/unresolved to.
+    */
+   uint32_t user_att;
   bool load;
   bool store;
   bool gmem;
--- a/src/freedreno/vulkan/tu_pipeline.cc
+++ b/src/freedreno/vulkan/tu_pipeline.cc
@ -3157,8 +3157,6 @@ tu6_emit_blend(struct tu_cs *cs,

   bool dual_src_blend = tu_blend_state_is_dual_src(cb);

-   tu_cs_emit_regs(cs, A6XX_SP_PS_MRT_CNTL(.mrt = num_rts));
-   tu_cs_emit_regs(cs, A6XX_RB_PS_MRT_CNTL(.mrt = num_rts));
   tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL(.enable_blend = blend_enable_mask,
                                          .unk8 = true,
                                          .dual_color_in_enable =
@ -3180,10 +3178,12 @@ tu6_emit_blend(struct tu_cs *cs,
                                          .alpha_to_one = alpha_to_one_enable,
                                          .sample_mask = sample_mask));

+   unsigned num_remapped_rts = 0;
   for (unsigned i = 0; i < num_rts; i++) {
      if (cal->color_map[i] == MESA_VK_ATTACHMENT_UNUSED)
         continue;
      unsigned remapped_idx = cal->color_map[i];
+      num_remapped_rts = MAX2(num_remapped_rts, remapped_idx + 1);
      const struct vk_color_blend_attachment_state *att = &cb->attachments[i];
      if ((cb->color_write_enables & (1u << i)) && i < cb->attachment_count) {
         const enum a3xx_rb_blend_opcode color_op = tu6_blend_op(att->color_blend_op);
@ -3227,6 +3227,8 @@ tu6_emit_blend(struct tu_cs *cs,
                            A6XX_RB_MRT_BLEND_CONTROL(remapped_idx,));
      }
   }
+   tu_cs_emit_regs(cs, A6XX_SP_PS_MRT_CNTL(.mrt = num_remapped_rts));
+   tu_cs_emit_regs(cs, A6XX_RB_PS_MRT_CNTL(.mrt = num_remapped_rts));
 }

 static const enum mesa_vk_dynamic_graphics_state tu_blend_constants_state[] = {
@ -4813,6 +4815,7 @@ fail:
   if (shader)
      vk_pipeline_cache_object_unref(&dev->vk, &shader->base);

+   ralloc_free(pipeline->base.executables_mem_ctx);
   ralloc_free(pipeline_mem_ctx);

   vk_object_free(&dev->vk, pAllocator, pipeline);
--- a/src/gallium/auxiliary/driver_noop/noop_state.c
+++ b/src/gallium/auxiliary/driver_noop/noop_state.c
@ -185,12 +185,6 @@ static void noop_set_vertex_buffers(struct pipe_context *ctx,
                                    unsigned count,
                                    const struct pipe_vertex_buffer *buffers)
 {
-   for (unsigned i = 0; i < count; i++) {
-      if (!buffers[i].is_user_buffer) {
-         struct pipe_resource *buf = buffers[i].buffer.resource;
-         pipe_resource_reference(&buf, NULL);
-      }
-   }
 }

 static void *noop_create_vertex_elements(struct pipe_context *ctx,
--- a/src/gallium/auxiliary/gallivm/lp_bld.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld.h
@ -88,10 +88,9 @@
 #define LLVMCreateBuilder ILLEGAL_LLVM_FUNCTION

 typedef struct lp_context_ref {
-#if GALLIVM_USE_ORCJIT
-   LLVMOrcThreadSafeContextRef ref;
-#else
   LLVMContextRef ref;
+#if GALLIVM_USE_ORCJIT
+   LLVMOrcThreadSafeContextRef tsref;
 #endif
   bool owned;
 } lp_context_ref;
@ -101,18 +100,21 @@ lp_context_create(lp_context_ref *context)
 {
   assert(context != NULL);
 #if GALLIVM_USE_ORCJIT
-   context->ref = LLVMOrcCreateNewThreadSafeContext();
+#if LLVM_VERSION_MAJOR >= 21
+   context->ref = LLVMContextCreate();
+   /* Ownership of ref is then transferred to tsref */
+   context->tsref = LLVMOrcCreateNewThreadSafeContextFromLLVMContext(context->ref);
+#else
+   context->tsref = LLVMOrcCreateNewThreadSafeContext();
+   context->ref = LLVMOrcThreadSafeContextGetContext(context->tsref);
+#endif
 #else
   context->ref = LLVMContextCreate();
 #endif
   context->owned = true;
 #if LLVM_VERSION_MAJOR == 15
   if (context->ref) {
-#if GALLIVM_USE_ORCJIT
-      LLVMContextSetOpaquePointers(LLVMOrcThreadSafeContextGetContext(context->ref), false);
-#else
      LLVMContextSetOpaquePointers(context->ref, false);
-#endif
   }
 #endif
 }
@ -123,7 +125,7 @@ lp_context_destroy(lp_context_ref *context)
   assert(context != NULL);
   if (context->owned) {
 #if GALLIVM_USE_ORCJIT
-      LLVMOrcDisposeThreadSafeContext(context->ref);
+      LLVMOrcDisposeThreadSafeContext(context->tsref);
 #else
      LLVMContextDispose(context->ref);
 #endif
--- a/src/gallium/auxiliary/gallivm/lp_bld_init_orc.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init_orc.cpp
@ -555,8 +555,8 @@ init_gallivm_state(struct gallivm_state *gallivm, const char *name,

   gallivm->cache = cache;

-   gallivm->_ts_context = context->ref;
-   gallivm->context = LLVMContextCreate();
+   gallivm->_ts_context = context->tsref;
+   gallivm->context = context->ref;

   gallivm->module_name = LPJit::get_unique_name(name);
   gallivm->module = LLVMModuleCreateWithNameInContext(gallivm->module_name,
--- a/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c
@ -3163,7 +3163,7 @@ do_int_divide(struct lp_build_nir_soa_context *bld,

 static LLVMValueRef
 do_int_mod(struct lp_build_nir_soa_context *bld,
-           bool is_unsigned, unsigned src_bit_size,
+           bool is_unsigned, bool use_src2_sign, unsigned src_bit_size,
           LLVMValueRef src, LLVMValueRef src2)
 {
   struct gallivm_state *gallivm = bld->base.gallivm;
@ -3180,8 +3180,18 @@ do_int_mod(struct lp_build_nir_soa_context *bld,
      divisor = get_signed_divisor(gallivm, int_bld, mask_bld,
                                   src_bit_size, src, divisor);
   }
-   LLVMValueRef result = lp_build_mod(int_bld, src, divisor);
-   return LLVMBuildOr(builder, div_mask, result, "");
+   LLVMValueRef rem = lp_build_mod(int_bld, src, divisor);
+   rem = LLVMBuildOr(builder, div_mask, rem, "");
+
+   if (use_src2_sign) {
+      LLVMValueRef add_src2 = LLVMBuildICmp(builder, LLVMIntNE, rem, int_bld->zero, "");
+      LLVMValueRef signs_different = LLVMBuildXor(builder, LLVMBuildICmp(builder, LLVMIntSLT, src, int_bld->zero, ""),
+                                                  LLVMBuildICmp(builder, LLVMIntSLT, src2, int_bld->zero, ""), "");
+      add_src2 = LLVMBuildAnd(builder, add_src2, signs_different, "");
+      rem = LLVMBuildSelect(builder, add_src2, LLVMBuildAdd(builder, rem, src2, ""), rem, "");
+   }
+
+   return rem;
 }

 static LLVMValueRef
@ -3493,7 +3503,7 @@ do_alu_action(struct lp_build_nir_soa_context *bld,
      break;
   case nir_op_imod:
   case nir_op_irem:
-      result = do_int_mod(bld, false, src_bit_size[0], src[0], src[1]);
+      result = do_int_mod(bld, false, instr->op == nir_op_imod, src_bit_size[0], src[0], src[1]);
      break;
   case nir_op_ishl: {
      if (src_bit_size[0] == 64)
@ -3592,7 +3602,7 @@ do_alu_action(struct lp_build_nir_soa_context *bld,
      result = lp_build_min(uint_bld, src[0], src[1]);
      break;
   case nir_op_umod:
-      result = do_int_mod(bld, true, src_bit_size[0], src[0], src[1]);
+      result = do_int_mod(bld, true, false, src_bit_size[0], src[0], src[1]);
      break;
   case nir_op_umul_high: {
      LLVMValueRef hi_bits;
--- a/src/gallium/auxiliary/pipe-loader/driinfo_gallium.h
+++ b/src/gallium/auxiliary/pipe-loader/driinfo_gallium.h
@ -64,6 +64,7 @@ DRI_CONF_SECTION_END
 DRI_CONF_SECTION_MISCELLANEOUS
   DRI_CONF_ALWAYS_HAVE_DEPTH_BUFFER(false)
   DRI_CONF_GLSL_ZERO_INIT(false)
+   DRI_CONF_VERTEX_PROGRAM_DEFAULT_OUT(false)
   DRI_CONF_VS_POSITION_ALWAYS_INVARIANT(false)
   DRI_CONF_VS_POSITION_ALWAYS_PRECISE(false)
   DRI_CONF_ALLOW_RGB10_CONFIGS(true)
--- a/src/gallium/auxiliary/util/u_driconf.c
+++ b/src/gallium/auxiliary/util/u_driconf.c
@ -76,6 +76,7 @@ u_driconf_fill_st_options(struct st_config_options *options,
   query_string_option(force_gl_renderer);
   query_string_option(mesa_extension_override);
   query_bool_option(allow_multisampled_copyteximage);
+   query_bool_option(vertex_program_default_out);

   driComputeOptionsSha1(optionCache, options->config_options_sha1);
 }
--- a/Show more
+++ b/Show more