VERSION: bump for 25.3.0-rc4

Signed-off-by: Dylan Baker <dylan.c.baker@intel.com>
radv: do not report wave32 in gl_SubgroupSize for Doom Dark Ages
2025-12-29 03:40:10 +01:00 · 2025-11-06 09:02:18 -08:00 · 2025-11-05 10:18:26 -08:00 · 2025-11-05 10:16:55 -08:00 · 2025-11-04 14:12:21 -08:00 · 2025-11-04 14:12:20 -08:00
147 changed files with 9140 additions and 1853 deletions
--- a/.pick_status.json
+++ b/.pick_status.json
--- a/2
+++ b/2
@ -1 +1 @@
-25.3.0-devel
+25.3.0-rc4
--- a/docs/ci/LAVA.rst
+++ b/docs/ci/LAVA.rst
@ -122,9 +122,8 @@ Enable the site and restart nginx:
   # Second download should be cached.
   wget http://localhost/cache/?uri=https://s3.freedesktop.org/mesa-tracie-public/itoral-gl-terrain-demo/demo-v2.trace

-Now, set ``download-url`` in your ``traces-*.yml`` entry to something like
-``http://caching-proxy/cache/?uri=https://s3.freedesktop.org/mesa-tracie-public``
-and you should have cached downloads for traces.  Add it to
-``FDO_HTTP_CACHE_URI=`` in your ``config.toml`` runner environment lines and you
-can use it for cached artifact downloads instead of going all the way to
-freedesktop.org on each job.
+The trace runner script automatically sets the caching proxy, so there's no
+need to modify anything in the Mesa CI YAML files.
+Add ``LAVA_HTTP_CACHE_URI=http://localhost/cache/?uri=`` to your ``config.toml``
+runner environment lines and you can use it for cached artifact downloads
+instead of going all the way to freedesktop.org on each job.
--- a/include/drm-uapi/amdgpu_drm.h
+++ b/include/drm-uapi/amdgpu_drm.h
@ -1489,8 +1489,6 @@ struct drm_amdgpu_info_hw_ip {
 	__u32  available_rings;
 	/** version info: bits 23:16 major, 15:8 minor, 7:0 revision */
 	__u32  ip_discovery_version;
-	/* Userq available slots */
-	__u32  userq_num_slots;
 };

 /* GFX metadata BO sizes and alignment info (in bytes) */
--- a/src/amd/common/ac_gpu_info.c
+++ b/src/amd/common/ac_gpu_info.c
@ -315,8 +315,6 @@ ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
         info->ip[ip_type].num_queues = 1;
      } else if (ip_info.available_rings) {
         info->ip[ip_type].num_queues = util_bitcount(ip_info.available_rings);
-      } else if (ip_info.userq_num_slots) {
-         info->ip[ip_type].num_queue_slots = ip_info.userq_num_slots;
      } else {
         continue;
      }
@ -1696,11 +1694,11 @@ void ac_print_gpu_info(const struct radeon_info *info, FILE *f)
   fprintf(f, "    clock_crystal_freq = %i KHz\n", info->clock_crystal_freq);

   for (unsigned i = 0; i < AMD_NUM_IP_TYPES; i++) {
-      if (info->ip[i].num_queues || info->ip[i].num_queue_slots) {
-         fprintf(f, "    IP %-7s %2u.%u \tqueues:%u \tqueue_slots:%u \talign:%u \tpad_dw:0x%x\n",
+      if (info->ip[i].num_queues) {
+         fprintf(f, "    IP %-7s %2u.%u \tqueues:%u \talign:%u \tpad_dw:0x%x\n",
                 ac_get_ip_type_string(info, i),
                 info->ip[i].ver_major, info->ip[i].ver_minor, info->ip[i].num_queues,
-                 info->ip[i].num_queue_slots,info->ip[i].ib_alignment, info->ip[i].ib_pad_dw_mask);
+                 info->ip[i].ib_alignment, info->ip[i].ib_pad_dw_mask);
      }
   }

--- a/src/amd/common/ac_gpu_info.h
+++ b/src/amd/common/ac_gpu_info.h
@ -26,7 +26,6 @@ struct amd_ip_info {
   uint8_t ver_minor;
   uint8_t ver_rev;
   uint8_t num_queues;
-   uint8_t num_queue_slots;
   uint8_t num_instances;
   uint32_t ib_alignment;
   uint32_t ib_pad_dw_mask;
--- a/src/amd/common/ac_linux_drm.h
+++ b/src/amd/common/ac_linux_drm.h
@ -194,7 +194,6 @@ struct drm_amdgpu_info_hw_ip {
   uint32_t ib_size_alignment;
   uint32_t available_rings;
   uint32_t ip_discovery_version;
-   uint32_t userq_num_slots;
 };

 struct drm_amdgpu_info_uq_fw_areas_gfx {
--- a/src/amd/common/nir/ac_nir_lower_mem_access_bit_sizes.c
+++ b/src/amd/common/nir/ac_nir_lower_mem_access_bit_sizes.c
@ -109,23 +109,37 @@ lower_mem_access_cb(nir_intrinsic_op intrin, uint8_t bytes, uint8_t bit_size, ui
   nir_mem_access_size_align res;

   if (intrin == nir_intrinsic_load_shared || intrin == nir_intrinsic_store_shared) {
-      /* Split unsupported shared access. */
-      res.bit_size = MIN2(bit_size, combined_align * 8ull);
-      res.align = res.bit_size / 8;
      /* Don't use >64-bit LDS loads for performance reasons. */
      unsigned max_bytes = intrin == nir_intrinsic_store_shared && cb_data->gfx_level >= GFX7 ? 16 : 8;
      bytes = MIN3(bytes, combined_align, max_bytes);
      bytes = bytes == 12 ? bytes : round_down_to_power_of_2(bytes);
+
+      /* Split unsupported shared access. */
+      res.bit_size = MIN2(bit_size, bytes * 8ull);
+      res.align = res.bit_size / 8;
      res.num_components = bytes / res.align;
      res.shift = nir_mem_access_shift_method_bytealign_amd;
      return res;
   }

+   const bool is_buffer_load = intrin == nir_intrinsic_load_ubo ||
+                               intrin == nir_intrinsic_load_ssbo ||
+                               intrin == nir_intrinsic_load_constant;
+
   if (is_smem) {
+      const bool supported_subdword = cb_data->gfx_level >= GFX12 &&
+                                      intrin != nir_intrinsic_load_push_constant &&
+                                      (!cb_data->use_llvm || intrin != nir_intrinsic_load_ubo);
+
      /* Round up subdword loads if unsupported. */
-      const bool supported_subdword = cb_data->gfx_level >= GFX12 && intrin != nir_intrinsic_load_push_constant;
-      if (bit_size < 32 && (bytes >= 3 || !supported_subdword))
+      if (bytes <= 2 && combined_align % bytes == 0 && supported_subdword) {
+         bit_size = bytes * 8;
+      } else if (bytes % 4 || combined_align % 4) {
+         if (is_buffer_load)
+            bytes += 4 - MIN2(combined_align, 4);
         bytes = align(bytes, 4);
+         bit_size = 32;
+      }

      /* Generally, require an alignment of 4. */
      res.align = MIN2(4, bytes);
@ -138,9 +152,6 @@ lower_mem_access_cb(nir_intrinsic_op intrin, uint8_t bytes, uint8_t bit_size, ui
      if (!util_is_power_of_two_nonzero(bytes) && (cb_data->gfx_level < GFX12 || bytes != 12)) {
         const uint8_t larger = util_next_power_of_two(bytes);
         const uint8_t smaller = larger / 2;
-         const bool is_buffer_load = intrin == nir_intrinsic_load_ubo ||
-                                     intrin == nir_intrinsic_load_ssbo ||
-                                     intrin == nir_intrinsic_load_constant;
         const bool is_aligned = align_mul % smaller == 0;

         /* Overfetch up to 1 dword if this is a bounds-checked buffer load or the access is aligned. */
@ -185,8 +196,8 @@ lower_mem_access_cb(nir_intrinsic_op intrin, uint8_t bytes, uint8_t bit_size, ui

   const uint32_t max_pad = 4 - MIN2(combined_align, 4);

-   /* Global loads don't have bounds checking, so increasing the size might not be safe. */
-   if (intrin == nir_intrinsic_load_global || intrin == nir_intrinsic_load_global_constant) {
+   /* Global/scratch loads don't have bounds checking, so increasing the size might not be safe. */
+   if (!is_buffer_load) {
      if (align_mul < 4) {
         /* If we split the load, only lower it to 32-bit if this is a SMEM load. */
         const unsigned chunk_bytes = align(bytes, 4) - max_pad;
--- a/src/amd/common/nir/ac_nir_lower_ngg_mesh.c
+++ b/src/amd/common/nir/ac_nir_lower_ngg_mesh.c
@ -508,6 +508,8 @@ lower_ms_intrinsic(nir_builder *b, nir_instr *instr, void *state)
      return update_ms_barrier(b, intrin, s);
   case nir_intrinsic_load_workgroup_index:
      return lower_ms_load_workgroup_index(b, intrin, s);
+   case nir_intrinsic_load_num_subgroups:
+      return nir_imm_int(b, DIV_ROUND_UP(s->api_workgroup_size, s->wave_size));
   case nir_intrinsic_set_vertex_and_primitive_count:
      return lower_ms_set_vertex_and_primitive_count(b, intrin, s);
   default:
@ -529,6 +531,7 @@ filter_ms_intrinsic(const nir_instr *instr,
          intrin->intrinsic == nir_intrinsic_store_per_primitive_output ||
          intrin->intrinsic == nir_intrinsic_barrier ||
          intrin->intrinsic == nir_intrinsic_load_workgroup_index ||
+          intrin->intrinsic == nir_intrinsic_load_num_subgroups ||
          intrin->intrinsic == nir_intrinsic_set_vertex_and_primitive_count;
 }

--- a/src/amd/compiler/instruction_selection/aco_select_rt_prolog.cpp
+++ b/src/amd/compiler/instruction_selection/aco_select_rt_prolog.cpp
@ -214,6 +214,8 @@ select_rt_prolog(Program* program, ac_shader_config* config,
   bld.sop2(Builder::s_cselect, Definition(vcc, bld.lm),
            Operand::c32_or_c64(-1u, program->wave_size == 64),
            Operand::c32_or_c64(0, program->wave_size == 64), Operand(scc, s1));
+   bld.sop2(aco_opcode::s_cselect_b32, Definition(out_launch_size_y, s1),
+            Operand(out_launch_size_y, s1), Operand::c32(1), Operand(scc, s1));
   bld.vop2(aco_opcode::v_cndmask_b32, Definition(out_launch_ids[0], v1),
            Operand(tmp_invocation_idx, v1), Operand(out_launch_ids[0], v1), Operand(vcc, bld.lm));
   bld.vop2(aco_opcode::v_cndmask_b32, Definition(out_launch_ids[1], v1), Operand::zero(),
--- a/src/amd/drm-shim/amdgpu_devices.c
+++ b/src/amd/drm-shim/amdgpu_devices.c
@ -1329,7 +1329,6 @@ const struct amdgpu_device amdgpu_devices[] = {
         .ib_size_alignment = 32,
         .available_rings = 0x1,
         .ip_discovery_version = 0xb0000,
-         .userq_num_slots = 2,
      },
      .hw_ip_compute = {
         .hw_ip_version_major = 11,
@ -1339,7 +1338,6 @@ const struct amdgpu_device amdgpu_devices[] = {
         .ib_size_alignment = 32,
         .available_rings = 0xf,
         .ip_discovery_version = 0xb0000,
-         .userq_num_slots = 16,
      },
      .fw_gfx_me = {
         .ver = 1486,
@ -1460,7 +1458,6 @@ const struct amdgpu_device amdgpu_devices[] = {
         .ib_size_alignment = 32,
         .available_rings = 0x1,
         .ip_discovery_version = 0xb0002,
-         .userq_num_slots = 0x0,
      },
      .hw_ip_compute = {
         .hw_ip_version_major = 11,
@ -1470,7 +1467,6 @@ const struct amdgpu_device amdgpu_devices[] = {
         .ib_size_alignment = 32,
         .available_rings = 0xf,
         .ip_discovery_version = 0xb0002,
-         .userq_num_slots = 0x0,
      },
      .fw_gfx_me = {
         .ver = 2390,
@ -2070,7 +2066,6 @@ const struct amdgpu_device amdgpu_devices[] = {
         .ib_size_alignment = 32,
         .available_rings = 0x1,
         .ip_discovery_version = 0xb0500,
-         .userq_num_slots = 2,
      },
      .hw_ip_compute = {
         .hw_ip_version_major = 11,
@ -2080,7 +2075,6 @@ const struct amdgpu_device amdgpu_devices[] = {
         .ib_size_alignment = 32,
         .available_rings = 0xf,
         .ip_discovery_version = 0xb0500,
-         .userq_num_slots = 16,
      },
      .fw_gfx_me = {
         .ver = 29,
@ -2201,7 +2195,6 @@ const struct amdgpu_device amdgpu_devices[] = {
         .ib_size_alignment = 32,
         .available_rings = 0x1,
         .ip_discovery_version = 0xc0001,
-         .userq_num_slots = 8,
      },
      .hw_ip_compute = {
         .hw_ip_version_major = 12,
@ -2211,7 +2204,6 @@ const struct amdgpu_device amdgpu_devices[] = {
         .ib_size_alignment = 32,
         .available_rings = 0xf,
         .ip_discovery_version = 0xc0001,
-         .userq_num_slots = 8,
      },
      .fw_gfx_me = {
         .ver = 2590,
--- a/src/amd/drm-shim/amdgpu_dump_states.c
+++ b/src/amd/drm-shim/amdgpu_dump_states.c
@ -379,7 +379,6 @@ amdgpu_dump_hw_ips(int fd)
      printf("   .ib_size_alignment = %u,\n", info.ib_size_alignment);
      printf("   .available_rings = 0x%x,\n", info.available_rings);
      printf("   .ip_discovery_version = 0x%04x,\n", info.ip_discovery_version);
-      printf("   .userq_num_slots = 0x%x,\n", info.userq_num_slots);
      printf("},\n");
   }
 }
--- a/src/amd/vulkan/layers/radv_no_mans_sky.c
+++ b/src/amd/vulkan/layers/radv_no_mans_sky.c
@ -0,0 +1,35 @@
+/*
+ * Copyright © 2025 Valve Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "radv_device.h"
+#include "radv_entrypoints.h"
+#include "radv_image_view.h"
+
+VKAPI_ATTR VkResult VKAPI_CALL
+no_mans_sky_CreateImageView(VkDevice _device, const VkImageViewCreateInfo *pCreateInfo,
+                            const VkAllocationCallbacks *pAllocator, VkImageView *pView)
+{
+   VK_FROM_HANDLE(radv_device, device, _device);
+   VkResult result;
+
+   result = device->layer_dispatch.app.CreateImageView(_device, pCreateInfo, pAllocator, pView);
+   if (result != VK_SUCCESS)
+      return result;
+
+   VK_FROM_HANDLE(radv_image_view, iview, *pView);
+
+   if ((iview->vk.aspects == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) &&
+       (iview->vk.usage &
+        (VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT))) {
+      /* No Man's Sky creates descriptors with depth/stencil aspects (only when Intel XESS is
+       * enabled apparently). and this is illegal in Vulkan. Ignore them by using NULL descriptors
+       * to workaroud GPU hangs.
+       */
+      memset(&iview->descriptor, 0, sizeof(iview->descriptor));
+   }
+
+   return result;
+}
--- a/src/amd/vulkan/meson.build
+++ b/src/amd/vulkan/meson.build
@ -21,6 +21,7 @@ radv_entrypoints_gen_command += [
  '--device-prefix', 'metro_exodus',
  '--device-prefix', 'rage2',
  '--device-prefix', 'quantic_dream',
+  '--device-prefix', 'no_mans_sky',

  # Command buffer annotation layer entrypoints
  '--device-prefix', 'annotate',
@ -40,6 +41,7 @@ libradv_files = files(
  'layers/radv_metro_exodus.c',
  'layers/radv_rage2.c',
  'layers/radv_quantic_dream.c',
+  'layers/radv_no_mans_sky.c',
  'layers/radv_rmv_layer.c',
  'layers/radv_rra_layer.c',
  'layers/radv_sqtt_layer.c',
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@ -6111,6 +6111,13 @@ radv_emit_tess_domain_origin_state(struct radv_cmd_buffer *cmd_buffer)
   radeon_end();
 }

+static bool
+radv_is_dual_src_enabled(const struct radv_dynamic_state *dynamic_state)
+{
+   /* Dual-source blending must be ignored if blending isn't enabled for MRT0. */
+   return dynamic_state->blend_eq.mrt0_is_dual_src && !!(dynamic_state->color_blend_enable & 1u);
+}
+
 static struct radv_shader_part *
 lookup_ps_epilog(struct radv_cmd_buffer *cmd_buffer)
 {
@ -6144,7 +6151,7 @@ lookup_ps_epilog(struct radv_cmd_buffer *cmd_buffer)

   state.color_write_mask = d->color_write_mask;
   state.color_blend_enable = d->color_blend_enable;
-   state.mrt0_is_dual_src = d->blend_eq.mrt0_is_dual_src;
+   state.mrt0_is_dual_src = radv_is_dual_src_enabled(&cmd_buffer->state.dynamic);

   if (d->vk.ms.alpha_to_coverage_enable) {
      /* Select a color export format with alpha when alpha to coverage is enabled. */
@ -8114,6 +8121,8 @@ radv_mark_descriptors_dirty(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPo
   struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);

   descriptors_state->dirty |= descriptors_state->valid;
+   if (descriptors_state->dynamic_offset_count)
+      descriptors_state->dirty_dynamic = true;
 }

 static void
@ -8642,7 +8651,6 @@ radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipeline

      if (cmd_buffer->state.compute_pipeline == compute_pipeline)
         return;
-      radv_mark_descriptors_dirty(cmd_buffer, pipelineBindPoint);

      radv_bind_shader(cmd_buffer, compute_pipeline->base.shaders[MESA_SHADER_COMPUTE], MESA_SHADER_COMPUTE);

@ -8656,7 +8664,6 @@ radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipeline

      if (cmd_buffer->state.rt_pipeline == rt_pipeline)
         return;
-      radv_mark_descriptors_dirty(cmd_buffer, pipelineBindPoint);

      radv_bind_shader(cmd_buffer, rt_pipeline->base.base.shaders[MESA_SHADER_INTERSECTION], MESA_SHADER_INTERSECTION);
      radv_bind_rt_prolog(cmd_buffer, rt_pipeline->prolog);
@ -8690,7 +8697,6 @@ radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipeline

      if (cmd_buffer->state.graphics_pipeline == graphics_pipeline)
         return;
-      radv_mark_descriptors_dirty(cmd_buffer, pipelineBindPoint);

      radv_foreach_stage (
         stage, (cmd_buffer->state.active_stages | graphics_pipeline->active_stages) & RADV_GRAPHICS_STAGE_BITS) {
@ -8744,6 +8750,8 @@ radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipeline
   cmd_buffer->descriptors[vk_to_bind_point(pipelineBindPoint)].dynamic_offset_count = pipeline->dynamic_offset_count;
   cmd_buffer->descriptors[vk_to_bind_point(pipelineBindPoint)].need_indirect_descriptors =
      pipeline->need_indirect_descriptors;
+
+   radv_mark_descriptors_dirty(cmd_buffer, pipelineBindPoint);
 }

 VKAPI_ATTR void VKAPI_CALL
@ -11688,7 +11696,7 @@ radv_emit_cb_render_state(struct radv_cmd_buffer *cmd_buffer)
   const struct radv_rendering_state *render = &cmd_buffer->state.render;
   const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
   unsigned cb_blend_control[MAX_RTS], sx_mrt_blend_opt[MAX_RTS];
-   const bool mrt0_is_dual_src = d->blend_eq.mrt0_is_dual_src;
+   const bool mrt0_is_dual_src = radv_is_dual_src_enabled(&cmd_buffer->state.dynamic);
   uint32_t cb_color_control = 0;

   const uint32_t cb_target_mask = d->color_write_enable & d->color_write_mask;
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@ -792,6 +792,8 @@ init_dispatch_tables(struct radv_device *device, struct radv_physical_device *pd
      add_entrypoints(&b, &rage2_device_entrypoints, RADV_APP_DISPATCH_TABLE);
   } else if (!strcmp(instance->drirc.debug.app_layer, "quanticdream")) {
      add_entrypoints(&b, &quantic_dream_device_entrypoints, RADV_APP_DISPATCH_TABLE);
+   } else if (!strcmp(instance->drirc.debug.app_layer, "no_mans_sky")) {
+      add_entrypoints(&b, &no_mans_sky_device_entrypoints, RADV_APP_DISPATCH_TABLE);
   }

   if (instance->vk.trace_mode & RADV_TRACE_MODE_RGP)
--- a/src/amd/vulkan/radv_instance.c
+++ b/src/amd/vulkan/radv_instance.c
@ -200,6 +200,7 @@ static const driOptionDescription radv_dri_options[] = {
      DRI_CONF_RADV_EMULATE_RT(false)
      DRI_CONF_RADV_ENABLE_FLOAT16_GFX8(false)
      DRI_CONF_RADV_COOPERATIVE_MATRIX2_NV(false)
+      DRI_CONF_RADV_NO_IMPLICIT_VARYING_SUBGROUP_SIZE(false)
   DRI_CONF_SECTION_END
 };
 // clang-format on
@ -236,6 +237,8 @@ radv_init_dri_debug_options(struct radv_instance *instance)
   drirc->debug.ssbo_non_uniform = driQueryOptionb(&drirc->options, "radv_ssbo_non_uniform");
   drirc->debug.tex_non_uniform = driQueryOptionb(&drirc->options, "radv_tex_non_uniform");
   drirc->debug.zero_vram = driQueryOptionb(&drirc->options, "radv_zero_vram");
+   drirc->debug.no_implicit_varying_subgroup_size =
+      driQueryOptionb(&drirc->options, "radv_no_implicit_varying_subgroup_size");
   drirc->debug.app_layer = driQueryOptionstr(&drirc->options, "radv_app_layer");

   drirc->debug.override_uniform_offset_alignment =
--- a/src/amd/vulkan/radv_instance.h
+++ b/src/amd/vulkan/radv_instance.h
@ -57,6 +57,7 @@ struct radv_drirc {
      bool ssbo_non_uniform;
      bool tex_non_uniform;
      bool zero_vram;
+      bool no_implicit_varying_subgroup_size;
      char *app_layer;
      int override_uniform_offset_alignment;
   } debug;
--- a/src/amd/vulkan/radv_physical_device.c
+++ b/src/amd/vulkan/radv_physical_device.c
@ -252,6 +252,7 @@ radv_physical_device_init_cache_key(struct radv_physical_device *pdev)
   key->use_llvm = pdev->use_llvm;
   key->use_ngg = pdev->use_ngg;
   key->use_ngg_culling = pdev->use_ngg_culling;
+   key->no_implicit_varying_subgroup_size = instance->drirc.debug.no_implicit_varying_subgroup_size;
 }

 static int
--- a/src/amd/vulkan/radv_physical_device.h
+++ b/src/amd/vulkan/radv_physical_device.h
@ -64,8 +64,9 @@ struct radv_physical_device_cache_key {
   uint32_t use_llvm : 1;
   uint32_t use_ngg : 1;
   uint32_t use_ngg_culling : 1;
+   uint32_t no_implicit_varying_subgroup_size : 1;

-   uint32_t reserved : 10;
+   uint32_t reserved : 9;
 };

 enum radv_video_enc_hw_ver {
--- a/src/amd/vulkan/radv_query.c
+++ b/src/amd/vulkan/radv_query.c
@ -2383,8 +2383,9 @@ radv_GetQueryPoolResults(VkDevice _device, VkQueryPool queryPool, uint32_t first
         break;
      }
      case VK_QUERY_TYPE_VIDEO_ENCODE_FEEDBACK_KHR: {
+         const bool write_memory = radv_video_write_memory_supported(pdev) == RADV_VIDEO_WRITE_MEMORY_SUPPORT_FULL;
         uint32_t *src32 = (uint32_t *)src;
-         uint32_t ready_idx = radv_video_write_memory_supported(pdev) ? RADV_ENC_FEEDBACK_STATUS_IDX : 1;
+         uint32_t ready_idx = write_memory ? RADV_ENC_FEEDBACK_STATUS_IDX : 1;
         uint32_t value;
         do {
            value = p_atomic_read(&src32[ready_idx]);
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@ -367,6 +367,10 @@ radv_shader_choose_subgroup_size(struct radv_device *device, nir_shader *nir,
      .requiredSubgroupSize = stage_key->subgroup_required_size * 32,
   };

+   /* Do not allow for the SPIR-V 1.6 varying subgroup size rules. */
+   if (pdev->cache_key.no_implicit_varying_subgroup_size)
+      spirv_version = 0x10000;
+
   vk_set_subgroup_size(&device->vk, nir, spirv_version, rss_info.requiredSubgroupSize ? &rss_info : NULL,
                        stage_key->subgroup_allow_varying, stage_key->subgroup_require_full);

--- a/src/amd/vulkan/radv_sqtt.c
+++ b/src/amd/vulkan/radv_sqtt.c
@ -508,7 +508,9 @@ radv_begin_sqtt(struct radv_queue *queue)
      device->sqtt.start_cs[family] = NULL;
   }

-   cs.b = ws->cs_create(ws, radv_queue_ring(queue), false);
+   radv_init_cmd_stream(&cs, radv_queue_ring(queue));
+
+   cs.b = ws->cs_create(ws, cs.hw_ip, false);
   if (!cs.b)
      return false;

@ -585,7 +587,9 @@ radv_end_sqtt(struct radv_queue *queue)
      device->sqtt.stop_cs[family] = NULL;
   }

-   cs.b = ws->cs_create(ws, radv_queue_ring(queue), false);
+   radv_init_cmd_stream(&cs, radv_queue_ring(queue));
+
+   cs.b = ws->cs_create(ws, cs.hw_ip, false);
   if (!cs.b)
      return false;

--- a/src/amd/vulkan/radv_video.c
+++ b/src/amd/vulkan/radv_video.c
@ -149,10 +149,16 @@ radv_vcn_write_memory(struct radv_cmd_buffer *cmd_buffer, uint64_t va, unsigned
   struct radv_physical_device *pdev = radv_device_physical(device);
   struct rvcn_sq_var sq;
   struct radv_cmd_stream *cs = cmd_buffer->cs;
+   enum radv_video_write_memory_support support = radv_video_write_memory_supported(pdev);

-   if (!radv_video_write_memory_supported(pdev))
+   if (support == RADV_VIDEO_WRITE_MEMORY_SUPPORT_NONE)
      return;

+   if (support == RADV_VIDEO_WRITE_MEMORY_SUPPORT_PCIE_ATOMICS) {
+      fprintf(stderr, "radv: VCN WRITE_MEMORY requires PCIe atomics support. Expect issues "
+                      "if PCIe atomics are not enabled on current device.\n");
+   }
+
   bool separate_queue = pdev->vid_decode_ip != AMD_IP_VCN_UNIFIED;
   if (cmd_buffer->qf == RADV_QUEUE_VIDEO_DEC && separate_queue && pdev->vid_dec_reg.data2) {
      radeon_check_space(device->ws, cs->b, 8);
@ -819,6 +825,32 @@ radv_GetPhysicalDeviceVideoCapabilitiesKHR(VkPhysicalDevice physicalDevice, cons
   if (cap && !cap->valid)
      cap = NULL;

+   if (cap) {
+      pCapabilities->maxCodedExtent.width = cap->max_width;
+      pCapabilities->maxCodedExtent.height = cap->max_height;
+   } else {
+      switch (pVideoProfile->videoCodecOperation) {
+      case VK_VIDEO_CODEC_OPERATION_DECODE_H264_BIT_KHR:
+         pCapabilities->maxCodedExtent.width = (pdev->info.family < CHIP_TONGA) ? 2048 : 4096;
+         pCapabilities->maxCodedExtent.height = (pdev->info.family < CHIP_TONGA) ? 1152 : 4096;
+         break;
+      case VK_VIDEO_CODEC_OPERATION_DECODE_H265_BIT_KHR:
+         pCapabilities->maxCodedExtent.width =
+            (pdev->info.family < CHIP_RENOIR) ? ((pdev->info.family < CHIP_TONGA) ? 2048 : 4096) : 8192;
+         pCapabilities->maxCodedExtent.height =
+            (pdev->info.family < CHIP_RENOIR) ? ((pdev->info.family < CHIP_TONGA) ? 1152 : 4096) : 4352;
+         break;
+      case VK_VIDEO_CODEC_OPERATION_DECODE_VP9_BIT_KHR:
+         pCapabilities->maxCodedExtent.width =
+            (pdev->info.family < CHIP_RENOIR) ? ((pdev->info.family < CHIP_TONGA) ? 2048 : 4096) : 8192;
+         pCapabilities->maxCodedExtent.height =
+            (pdev->info.family < CHIP_RENOIR) ? ((pdev->info.family < CHIP_TONGA) ? 1152 : 4096) : 4352;
+         break;
+      default:
+         break;
+      }
+   }
+
   pCapabilities->flags = 0;
   pCapabilities->pictureAccessGranularity.width = VK_VIDEO_H264_MACROBLOCK_WIDTH;
   pCapabilities->pictureAccessGranularity.height = VK_VIDEO_H264_MACROBLOCK_HEIGHT;
@ -1126,32 +1158,6 @@ radv_GetPhysicalDeviceVideoCapabilitiesKHR(VkPhysicalDevice physicalDevice, cons
      break;
   }

-   if (cap) {
-      pCapabilities->maxCodedExtent.width = cap->max_width;
-      pCapabilities->maxCodedExtent.height = cap->max_height;
-   } else {
-      switch (pVideoProfile->videoCodecOperation) {
-      case VK_VIDEO_CODEC_OPERATION_DECODE_H264_BIT_KHR:
-         pCapabilities->maxCodedExtent.width = (pdev->info.family < CHIP_TONGA) ? 2048 : 4096;
-         pCapabilities->maxCodedExtent.height = (pdev->info.family < CHIP_TONGA) ? 1152 : 4096;
-         break;
-      case VK_VIDEO_CODEC_OPERATION_DECODE_H265_BIT_KHR:
-         pCapabilities->maxCodedExtent.width =
-            (pdev->info.family < CHIP_RENOIR) ? ((pdev->info.family < CHIP_TONGA) ? 2048 : 4096) : 8192;
-         pCapabilities->maxCodedExtent.height =
-            (pdev->info.family < CHIP_RENOIR) ? ((pdev->info.family < CHIP_TONGA) ? 1152 : 4096) : 4352;
-         break;
-      case VK_VIDEO_CODEC_OPERATION_DECODE_VP9_BIT_KHR:
-         pCapabilities->maxCodedExtent.width =
-            (pdev->info.family < CHIP_RENOIR) ? ((pdev->info.family < CHIP_TONGA) ? 2048 : 4096) : 8192;
-         pCapabilities->maxCodedExtent.height =
-            (pdev->info.family < CHIP_RENOIR) ? ((pdev->info.family < CHIP_TONGA) ? 1152 : 4096) : 4352;
-         break;
-      default:
-         break;
-      }
-   }
-
   return VK_SUCCESS;
 }

--- a/src/amd/vulkan/radv_video.h
+++ b/src/amd/vulkan/radv_video.h
@ -73,6 +73,19 @@ struct radv_video_session {
   bool session_initialized;
 };

+/**
+ *  WRITE_MEMORY support in FW.
+ *
+ *  none: Not supported at all. Old VCN FW and all UVD.
+ *  pcie_atomics: Supported, relies on PCIe atomics.
+ *  full: Supported, works also without PCIe atomics.
+ */
+enum radv_video_write_memory_support {
+   RADV_VIDEO_WRITE_MEMORY_SUPPORT_NONE = 0,
+   RADV_VIDEO_WRITE_MEMORY_SUPPORT_PCIE_ATOMICS,
+   RADV_VIDEO_WRITE_MEMORY_SUPPORT_FULL,
+};
+
 VK_DEFINE_NONDISP_HANDLE_CASTS(radv_video_session, vk.base, VkVideoSessionKHR, VK_OBJECT_TYPE_VIDEO_SESSION_KHR)

 void radv_init_physical_device_decoder(struct radv_physical_device *pdev);
@ -98,7 +111,7 @@ void radv_video_get_enc_dpb_image(struct radv_device *device, const struct VkVid
 bool radv_video_decode_vp9_supported(const struct radv_physical_device *pdev);
 bool radv_video_encode_av1_supported(const struct radv_physical_device *pdev);
 bool radv_video_encode_qp_map_supported(const struct radv_physical_device *pdev);
-bool radv_video_write_memory_supported(const struct radv_physical_device *pdev);
+enum radv_video_write_memory_support radv_video_write_memory_supported(const struct radv_physical_device *pdev);
 uint32_t radv_video_get_qp_map_texel_size(VkVideoCodecOperationFlagBitsKHR codec);
 bool radv_check_vcn_fw_version(const struct radv_physical_device *pdev, uint32_t dec, uint32_t enc, uint32_t rev);

--- a/src/amd/vulkan/radv_video_enc.c
+++ b/src/amd/vulkan/radv_video_enc.c
@ -890,7 +890,6 @@ radv_enc_slice_header(struct radv_cmd_buffer *cmd_buffer, const VkVideoEncodeInf
   uint32_t num_bits[RENCODE_SLICE_HEADER_TEMPLATE_MAX_NUM_INSTRUCTIONS] = {0};
   const struct VkVideoEncodeH264PictureInfoKHR *h264_picture_info =
      vk_find_struct_const(enc_info->pNext, VIDEO_ENCODE_H264_PICTURE_INFO_KHR);
-   int slice_count = h264_picture_info->naluSliceEntryCount;
   const StdVideoEncodeH264PictureInfo *pic = h264_picture_info->pStdPictureInfo;
   const StdVideoH264SequenceParameterSet *sps =
      vk_video_find_h264_enc_std_sps(cmd_buffer->video.params, pic->seq_parameter_set_id);
@ -903,8 +902,6 @@ radv_enc_slice_header(struct radv_cmd_buffer *cmd_buffer, const VkVideoEncodeInf
   unsigned int cdw_filled = 0;
   unsigned int bits_copied = 0;

-   assert(slice_count <= 1);
-
   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
   const struct radv_physical_device *pdev = radv_device_physical(device);
   struct radv_cmd_stream *cs = cmd_buffer->cs;
@ -2861,7 +2858,8 @@ radv_vcn_encode_video(struct radv_cmd_buffer *cmd_buffer, const VkVideoEncodeInf

   if (pdev->enc_hw_ver >= RADV_VIDEO_ENC_HW_2) {
      radv_vcn_sq_tail(cs, &cmd_buffer->video.sq);
-      radv_vcn_write_memory(cmd_buffer, feedback_query_va + RADV_ENC_FEEDBACK_STATUS_IDX * sizeof(uint32_t), 1);
+      if (radv_video_write_memory_supported(pdev) == RADV_VIDEO_WRITE_MEMORY_SUPPORT_FULL)
+         radv_vcn_write_memory(cmd_buffer, feedback_query_va + RADV_ENC_FEEDBACK_STATUS_IDX * sizeof(uint32_t), 1);
   }
 }

@ -3166,6 +3164,36 @@ radv_video_patch_encode_session_parameters(struct radv_device *device, struct vk
      }
      break;
   case VK_VIDEO_CODEC_OPERATION_ENCODE_H265_BIT_KHR: {
+      for (unsigned i = 0; i < params->h265_enc.h265_sps_count; i++) {
+         uint32_t pic_width_in_luma_samples =
+             params->h265_enc.h265_sps[i].base.pic_width_in_luma_samples;
+         uint32_t pic_height_in_luma_samples =
+             params->h265_enc.h265_sps[i].base.pic_height_in_luma_samples;
+         uint32_t aligned_pic_width = align(pic_width_in_luma_samples, 64);
+         uint32_t aligned_pic_height = align(pic_height_in_luma_samples, 16);
+
+         /* Override the unaligned pic_{width,height} and make up for it with conformance window
+          * cropping */
+         params->h265_enc.h265_sps[i].base.pic_width_in_luma_samples = aligned_pic_width;
+         params->h265_enc.h265_sps[i].base.pic_height_in_luma_samples = aligned_pic_height;
+
+         if (aligned_pic_width != pic_width_in_luma_samples ||
+             aligned_pic_height != pic_height_in_luma_samples) {
+            params->h265_enc.h265_sps[i].base.flags.conformance_window_flag = 1;
+            params->h265_enc.h265_sps[i].base.conf_win_right_offset +=
+               (aligned_pic_width - pic_width_in_luma_samples) / 2;
+            params->h265_enc.h265_sps[i].base.conf_win_bottom_offset +=
+               (aligned_pic_height - pic_height_in_luma_samples) / 2;
+         }
+
+         /* VCN supports only the following block sizes (resulting in 64x64 CTBs with any coding
+          * block size) */
+         params->h265_enc.h265_sps[i].base.log2_min_luma_coding_block_size_minus3 = 0;
+         params->h265_enc.h265_sps[i].base.log2_diff_max_min_luma_coding_block_size = 3;
+         params->h265_enc.h265_sps[i].base.log2_min_luma_transform_block_size_minus2 = 0;
+         params->h265_enc.h265_sps[i].base.log2_diff_max_min_luma_transform_block_size = 3;
+      }
+
      for (unsigned i = 0; i < params->h265_enc.h265_pps_count; i++) {
         /* cu_qp_delta needs to be enabled if rate control is enabled. VCN2 and newer can also enable
          * it with rate control disabled. Since we don't know what rate control will be used, we
@ -3268,6 +3296,14 @@ radv_GetEncodedVideoSessionParametersKHR(VkDevice device,
         assert(sps);
         char *data_ptr = pData ? (char *)pData + vps_size : NULL;
         vk_video_encode_h265_sps(sps, size_limit, &sps_size, data_ptr);
+
+         if (pFeedbackInfo) {
+            struct VkVideoEncodeH265SessionParametersFeedbackInfoKHR *h265_feedback_info =
+               vk_find_struct(pFeedbackInfo->pNext, VIDEO_ENCODE_H265_SESSION_PARAMETERS_FEEDBACK_INFO_KHR);
+            pFeedbackInfo->hasOverrides = VK_TRUE;
+            if (h265_feedback_info)
+               h265_feedback_info->hasStdSPSOverrides = VK_TRUE;
+         }
      }
      if (h265_get_info->writeStdPPS) {
         const StdVideoH265PictureParameterSet *pps = vk_video_find_h265_enc_std_pps(templ, h265_get_info->stdPPSId);
@ -3421,17 +3457,20 @@ radv_video_encode_qp_map_supported(const struct radv_physical_device *pdev)
   return true;
 }

-bool
+enum radv_video_write_memory_support
 radv_video_write_memory_supported(const struct radv_physical_device *pdev)
 {
-   if (pdev->info.vcn_ip_version >= VCN_5_0_0)
-      return true;
-   else if (pdev->info.vcn_ip_version >= VCN_4_0_0)
-      return pdev->info.vcn_enc_minor_version >= 22;
-   else if (pdev->info.vcn_ip_version >= VCN_3_0_0)
-      return pdev->info.vcn_enc_minor_version >= 33;
-   else if (pdev->info.vcn_ip_version >= VCN_2_0_0)
-      return pdev->info.vcn_enc_minor_version >= 24;
-   else /* VCN 1 and UVD */
-      return false;
+   if (pdev->info.vcn_ip_version >= VCN_5_0_0) {
+      return RADV_VIDEO_WRITE_MEMORY_SUPPORT_PCIE_ATOMICS;
+   } else if (pdev->info.vcn_ip_version >= VCN_4_0_0) {
+      if (pdev->info.vcn_enc_minor_version >= 22)
+         return RADV_VIDEO_WRITE_MEMORY_SUPPORT_PCIE_ATOMICS;
+   } else if (pdev->info.vcn_ip_version >= VCN_3_0_0) {
+      if (pdev->info.vcn_enc_minor_version >= 33)
+         return RADV_VIDEO_WRITE_MEMORY_SUPPORT_PCIE_ATOMICS;
+   } else if (pdev->info.vcn_ip_version >= VCN_2_0_0) {
+      if (pdev->info.vcn_enc_minor_version >= 24)
+         return RADV_VIDEO_WRITE_MEMORY_SUPPORT_PCIE_ATOMICS;
+   }
+   return RADV_VIDEO_WRITE_MEMORY_SUPPORT_NONE;
 }
--- a/src/asahi/ci/asahi-g13g-fails.txt
+++ b/src/asahi/ci/asahi-g13g-fails.txt
--- a/src/asahi/layout/layout.c
+++ b/src/asahi/layout/layout.c
@ -24,7 +24,8 @@ ail_initialize_linear(struct ail_layout *layout)
   layout->layer_stride_B = align64(
      (uint64_t)layout->linear_stride_B * layout->height_px, AIL_CACHELINE);

-   layout->size_B = layout->layer_stride_B * layout->depth_px;
+   layout->size_B =
+      layout->level_offsets_B[0] + (layout->layer_stride_B * layout->depth_px);
 }

 /*
@ -341,6 +342,7 @@ ail_make_miptree(struct ail_layout *layout)
      assert(layout->linear_stride_B == 0 && "Invalid nonlinear layout");
      assert(layout->levels >= 1 && "Invalid dimensions");
      assert(layout->sample_count_sa >= 1 && "Invalid sample count");
+      assert(layout->level_offsets_B[0] == 0 && "Invalid offset");
   }

   assert(!(layout->writeable_image && layout->compressed) &&
--- a/src/asahi/lib/agx_device_virtio.c
+++ b/src/asahi/lib/agx_device_virtio.c
@ -133,6 +133,7 @@ agx_virtio_bo_bind(struct agx_device *dev, struct drm_asahi_gem_bind_op *ops,
   memcpy(req->payload, ops, payload_size);

   int ret = vdrm_send_req(dev->vdrm, &req->hdr, false);
+   free(req);
   if (ret) {
      fprintf(stderr, "ASAHI_CCMD_GEM_BIND failed: %d\n", ret);
   }
--- a/src/asahi/vulkan/hk_cmd_draw.c
+++ b/src/asahi/vulkan/hk_cmd_draw.c
@ -992,28 +992,34 @@ hk_CmdEndRendering(VkCommandBuffer commandBuffer)
   }
 }

+static void
+hk_init_heap(const void *data) {
+   struct hk_cmd_buffer *cmd = (struct hk_cmd_buffer *) data;
+   struct hk_device *dev = hk_cmd_buffer_device(cmd);
+
+   perf_debug(cmd, "Allocating heap");
+
+   size_t size = 128 * 1024 * 1024;
+   dev->heap = agx_bo_create(&dev->dev, size, 0, 0, "Geometry heap");
+
+   /* The geometry state buffer is initialized here and then is treated by
+    * the CPU as rodata, even though the GPU uses it for scratch internally.
+    */
+   off_t off = dev->rodata.heap - dev->rodata.bo->va->addr;
+   struct agx_heap *map = agx_bo_map(dev->rodata.bo) + off;
+
+   *map = (struct agx_heap){
+      .base = dev->heap->va->addr,
+      .size = size,
+   };
+}
+
 static uint64_t
 hk_heap(struct hk_cmd_buffer *cmd)
 {
   struct hk_device *dev = hk_cmd_buffer_device(cmd);

-   if (unlikely(!dev->heap)) {
-      perf_debug(cmd, "Allocating heap");
-
-      size_t size = 128 * 1024 * 1024;
-      dev->heap = agx_bo_create(&dev->dev, size, 0, 0, "Geometry heap");
-
-      /* The geometry state buffer is initialized here and then is treated by
-       * the CPU as rodata, even though the GPU uses it for scratch internally.
-       */
-      off_t off = dev->rodata.heap - dev->rodata.bo->va->addr;
-      struct agx_heap *map = agx_bo_map(dev->rodata.bo) + off;
-
-      *map = (struct agx_heap){
-         .base = dev->heap->va->addr,
-         .size = size,
-      };
-   }
+   util_call_once_data(&dev->heap_init_once, hk_init_heap, cmd);

   /* We need to free all allocations after each command buffer execution */
   if (!cmd->uses_heap) {
--- a/src/asahi/vulkan/hk_descriptor_set_layout.c
+++ b/src/asahi/vulkan/hk_descriptor_set_layout.c
@ -330,6 +330,7 @@ hk_GetDescriptorSetLayoutSupport(
   uint64_t non_variable_size = 0;
   uint32_t variable_stride = 0;
   uint32_t variable_count = 0;
+   bool variable_is_inline_uniform_block = false;
   uint8_t dynamic_buffer_count = 0;

   for (uint32_t i = 0; i < pCreateInfo->bindingCount; i++) {
@ -362,6 +363,10 @@ hk_GetDescriptorSetLayoutSupport(
             */
            variable_count = MAX2(1, binding->descriptorCount);
            variable_stride = stride;
+
+            variable_is_inline_uniform_block =
+               binding->descriptorType ==
+               VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK;
         } else {
            /* Since we're aligning to the maximum and since this is just a
             * check for whether or not the max buffer size is big enough, we
@ -393,12 +398,21 @@ hk_GetDescriptorSetLayoutSupport(
      switch (ext->sType) {
      case VK_STRUCTURE_TYPE_DESCRIPTOR_SET_VARIABLE_DESCRIPTOR_COUNT_LAYOUT_SUPPORT: {
         VkDescriptorSetVariableDescriptorCountLayoutSupport *vs = (void *)ext;
+         uint32_t max_var_count;
+
         if (variable_stride > 0) {
-            vs->maxVariableDescriptorCount =
+            max_var_count =
               (max_buffer_size - non_variable_size) / variable_stride;
         } else {
-            vs->maxVariableDescriptorCount = 0;
+            max_var_count = 0;
         }
+
+         if (variable_is_inline_uniform_block) {
+            max_var_count =
+               MIN2(max_var_count, HK_MAX_INLINE_UNIFORM_BLOCK_SIZE);
+         }
+
+         vs->maxVariableDescriptorCount = max_var_count;
         break;
      }

--- a/src/asahi/vulkan/hk_device.h
+++ b/src/asahi/vulkan/hk_device.h
@ -92,6 +92,7 @@ struct hk_device {
    * expected to be a legitimate problem. If it is, we can rework later.
    */
   struct agx_bo *heap;
+   util_once_flag heap_init_once;

   struct {
      struct agx_scratch vs, fs, cs;
--- a/src/asahi/vulkan/hk_format.c
+++ b/src/asahi/vulkan/hk_format.c
@ -67,7 +67,7 @@ get_drm_format_modifier_properties_list(
      {
         *out_props = (VkDrmFormatModifierPropertiesEXT){
            .drmFormatModifier = mod,
-            .drmFormatModifierPlaneCount = 1 /* no planar mods */,
+            .drmFormatModifierPlaneCount = vk_format_get_plane_count(vk_format),
            .drmFormatModifierTilingFeatures = flags,
         };
      };
@ -96,7 +96,7 @@ get_drm_format_modifier_properties_list_2(
      {
         *out_props = (VkDrmFormatModifierProperties2EXT){
            .drmFormatModifier = mod,
-            .drmFormatModifierPlaneCount = 1, /* no planar mods */
+            .drmFormatModifierPlaneCount = vk_format_get_plane_count(vk_format),
            .drmFormatModifierTilingFeatures = flags,
         };
      };
--- a/src/asahi/vulkan/hk_image.c
+++ b/src/asahi/vulkan/hk_image.c
@ -1424,6 +1424,13 @@ hk_copy_memory_to_image(struct hk_device *device, struct hk_image *dst_image,
   uint32_t src_height = info->memoryImageHeight ?: extent.height;

   uint32_t blocksize_B = util_format_get_blocksize(layout->format);
+
+   /* Align width and height to block */
+   src_width =
+      DIV_ROUND_UP(src_width, util_format_get_blockwidth(layout->format));
+   src_height =
+      DIV_ROUND_UP(src_height, util_format_get_blockheight(layout->format));
+
   uint32_t src_pitch = src_width * blocksize_B;

   unsigned start_layer = (dst_image->vk.image_type == VK_IMAGE_TYPE_3D)
@ -1496,6 +1503,13 @@ hk_copy_image_to_memory(struct hk_device *device, struct hk_image *src_image,
 #endif

   uint32_t blocksize_B = util_format_get_blocksize(layout->format);
+
+   /* Align width and height to block */
+   dst_width =
+      DIV_ROUND_UP(dst_width, util_format_get_blockwidth(layout->format));
+   dst_height =
+      DIV_ROUND_UP(dst_height, util_format_get_blockheight(layout->format));
+
   uint32_t dst_pitch = dst_width * blocksize_B;

   unsigned start_layer = (src_image->vk.image_type == VK_IMAGE_TYPE_3D)
@ -1649,11 +1663,6 @@ hk_copy_image_to_image_cpu(struct hk_device *device, struct hk_image *src_image,
            &device->physical_device->ubwc_config);
 #endif
      } else {
-         /* Work tile-by-tile, holding the unswizzled tile in a temporary
-          * buffer.
-          */
-         char temp_tile[16384];
-
         unsigned src_level = info->srcSubresource.mipLevel;
         unsigned dst_level = info->dstSubresource.mipLevel;
         uint32_t block_width = src_layout->tilesize_el[src_level].width_el;
@ -1667,6 +1676,12 @@ hk_copy_image_to_image_cpu(struct hk_device *device, struct hk_image *src_image,
         }

         uint32_t temp_pitch = block_width * src_block_B;
+         size_t temp_tile_size = temp_pitch * (src_offset.y + extent.height);
+
+         /* Work tile-by-tile, holding the unswizzled tile in a temporary
+          * buffer.
+          */
+         char *temp_tile = malloc(temp_tile_size);

         for (unsigned by = src_offset.y / block_height;
              by * block_height < src_offset.y + extent.height; by++) {
@ -1683,14 +1698,14 @@ hk_copy_image_to_image_cpu(struct hk_device *device, struct hk_image *src_image,
                  MIN2((bx + 1) * block_width, src_offset.x + extent.width) -
                  src_x_start;

-               assert(height * temp_pitch <= ARRAY_SIZE(temp_tile));
-
               ail_detile((void *)src, temp_tile, src_layout, src_level,
                          temp_pitch, src_x_start, src_y_start, width, height);
               ail_tile(dst, temp_tile, dst_layout, dst_level, temp_pitch,
                        dst_x_start, dst_y_start, width, height);
            }
         }
+
+         free(temp_tile);
      }
   }
 }
--- a/src/asahi/vulkan/hk_physical_device.c
+++ b/src/asahi/vulkan/hk_physical_device.c
@ -859,7 +859,7 @@ hk_get_device_properties(const struct agx_device *dev,
      .maxSubgroupSize = 32,
      .maxComputeWorkgroupSubgroups = 1024 / 32,
      .requiredSubgroupSizeStages = 0,
-      .maxInlineUniformBlockSize = 1 << 16,
+      .maxInlineUniformBlockSize = HK_MAX_INLINE_UNIFORM_BLOCK_SIZE,
      .maxPerStageDescriptorInlineUniformBlocks = 32,
      .maxPerStageDescriptorUpdateAfterBindInlineUniformBlocks = 32,
      .maxDescriptorSetInlineUniformBlocks = 6 * 32,
@ -953,7 +953,7 @@ hk_get_device_properties(const struct agx_device *dev,
      .robustUniformBufferAccessSizeAlignment = HK_MIN_UBO_ALIGNMENT,

      /* VK_EXT_sample_locations */
-      .sampleLocationSampleCounts = sample_counts,
+      .sampleLocationSampleCounts = sample_counts & ~VK_SAMPLE_COUNT_1_BIT,
      .maxSampleLocationGridSize = (VkExtent2D){1, 1},
      .sampleLocationCoordinateRange[0] = 0.0f,
      .sampleLocationCoordinateRange[1] = 0.9375f,
--- a/src/asahi/vulkan/hk_private.h
+++ b/src/asahi/vulkan/hk_private.h
@ -12,18 +12,19 @@
 #include "vk_log.h"
 #include "vk_util.h"

-#define HK_MAX_SETS                   8
-#define HK_MAX_PUSH_SIZE              256
-#define HK_MAX_DYNAMIC_BUFFERS        64
-#define HK_MAX_RTS                    8
-#define HK_MIN_SSBO_ALIGNMENT         16
-#define HK_MIN_TEXEL_BUFFER_ALIGNMENT 16
-#define HK_MIN_UBO_ALIGNMENT          64
-#define HK_MAX_VIEWPORTS              16
-#define HK_MAX_DESCRIPTOR_SIZE        64
-#define HK_MAX_PUSH_DESCRIPTORS       32
-#define HK_MAX_DESCRIPTOR_SET_SIZE    (1u << 30)
-#define HK_MAX_DESCRIPTORS            (1 << 20)
+#define HK_MAX_SETS                      8
+#define HK_MAX_PUSH_SIZE                 256
+#define HK_MAX_DYNAMIC_BUFFERS           64
+#define HK_MAX_RTS                       8
+#define HK_MIN_SSBO_ALIGNMENT            16
+#define HK_MIN_TEXEL_BUFFER_ALIGNMENT    16
+#define HK_MIN_UBO_ALIGNMENT             64
+#define HK_MAX_VIEWPORTS                 16
+#define HK_MAX_DESCRIPTOR_SIZE           64
+#define HK_MAX_PUSH_DESCRIPTORS          32
+#define HK_MAX_DESCRIPTOR_SET_SIZE       (1u << 30)
+#define HK_MAX_INLINE_UNIFORM_BLOCK_SIZE (1u << 16)
+#define HK_MAX_DESCRIPTORS               (1 << 20)
 #define HK_PUSH_DESCRIPTOR_SET_SIZE                                            \
   (HK_MAX_PUSH_DESCRIPTORS * HK_MAX_DESCRIPTOR_SIZE)
 #define HK_SSBO_BOUNDS_CHECK_ALIGNMENT 4
--- a/src/asahi/vulkan/hk_queue.c
+++ b/src/asahi/vulkan/hk_queue.c
@ -812,11 +812,6 @@ queue_submit(struct hk_device *dev, struct hk_queue *queue,
   /* Now setup the command structs */
   struct util_dynarray payload;
   util_dynarray_init(&payload, NULL);
-   union drm_asahi_cmd *cmds = malloc(sizeof(*cmds) * command_count);
-   if (cmds == NULL) {
-      free(cmds);
-      return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
-   }

   unsigned nr_vdm = 0, nr_cdm = 0;

--- a/src/compiler/nir/nir_divergence_analysis.c
+++ b/src/compiler/nir/nir_divergence_analysis.c
@ -319,14 +319,10 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
   case nir_intrinsic_load_base_global_invocation_id:
   case nir_intrinsic_load_base_workgroup_id:
   case nir_intrinsic_load_alpha_reference_amd:
-   case nir_intrinsic_load_ubo_uniform_block_intel:
-   case nir_intrinsic_load_ssbo_uniform_block_intel:
-   case nir_intrinsic_load_shared_uniform_block_intel:
   case nir_intrinsic_load_barycentric_optimize_amd:
   case nir_intrinsic_load_poly_line_smooth_enabled:
   case nir_intrinsic_load_rasterization_primitive_amd:
   case nir_intrinsic_unit_test_uniform_amd:
-   case nir_intrinsic_load_global_constant_uniform_block_intel:
   case nir_intrinsic_load_debug_log_desc_amd:
   case nir_intrinsic_load_xfb_state_address_gfx12_amd:
   case nir_intrinsic_cmat_length:
@ -364,6 +360,24 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
      is_divergent = false;
      break;

+   case nir_intrinsic_load_ubo_uniform_block_intel:
+   case nir_intrinsic_load_ssbo_uniform_block_intel:
+   case nir_intrinsic_load_shared_uniform_block_intel:
+   case nir_intrinsic_load_global_constant_uniform_block_intel:
+      if (options & (nir_divergence_across_subgroups |
+                     nir_divergence_multiple_workgroup_per_compute_subgroup)) {
+         unsigned num_srcs = nir_intrinsic_infos[instr->intrinsic].num_srcs;
+         for (unsigned i = 0; i < num_srcs; i++) {
+            if (src_divergent(instr->src[i], state)) {
+               is_divergent = true;
+               break;
+            }
+         }
+      } else {
+         is_divergent = false;
+      }
+      break;
+
   /* This is divergent because it specifically loads sequential values into
    * successive SIMD lanes.
    */
--- a/src/compiler/nir/nir_lower_io.c
+++ b/src/compiler/nir/nir_lower_io.c
@ -1069,6 +1069,7 @@ nir_get_io_index_src_number(const nir_intrinsic_instr *instr)
   IMG_CASE(atomic):
   IMG_CASE(atomic_swap):
   IMG_CASE(size):
+   IMG_CASE(levels):
   IMG_CASE(samples):
   IMG_CASE(texel_address):
   IMG_CASE(samples_identical):
--- a/src/compiler/nir/nir_lower_shader_calls.c
+++ b/src/compiler/nir/nir_lower_shader_calls.c
@ -1228,8 +1228,16 @@ wrap_instr(nir_builder *b, nir_instr *instr, void *data)
 static bool
 wrap_instrs(nir_shader *shader, wrap_instr_callback callback)
 {
-   return nir_shader_instructions_pass(shader, wrap_instr,
-                                       nir_metadata_none, callback);
+   bool progress = nir_shader_instructions_pass(shader, wrap_instr,
+                                                nir_metadata_none, callback);
+   /* Wrapping jump instructions that are located inside ifs can break SSA
+    * invariants because the else block no longer dominates the merge block.
+    * Repair the SSA to make the validator happy again.
+    */
+   if (progress)
+      nir_repair_ssa(shader);
+
+   return progress;
 }

 static bool
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@ -4096,9 +4096,9 @@ distribute_src_mods = [
   (('fneg', ('fmul(is_used_once)', a, b)), ('fmul', ('fneg', a), b)),
   (('fabs', ('fmul(is_used_once)', a, b)), ('fmul', ('fabs', a), ('fabs', b))),

-   (('fneg', ('ffma(is_used_once)', a, b, c)), ('ffma', ('fneg', a), b, ('fneg', c))),
+   (('fneg', ('ffma(is_used_once,nsz)', a, b, c)), ('ffma', ('fneg', a), b, ('fneg', c))),
   (('fneg', ('flrp(is_used_once)', a, b, c)), ('flrp', ('fneg', a), ('fneg', b), c)),
-   (('fneg', ('~fadd(is_used_once)', a, b)), ('fadd', ('fneg', a), ('fneg', b))),
+   (('fneg', ('fadd(is_used_once,nsz)', a, b)), ('fadd', ('fneg', a), ('fneg', b))),

   # Note that fmin <-> fmax.  I don't think there is a way to distribute
   # fabs() into fmin or fmax.
--- a/src/compiler/nir/nir_opt_shrink_stores.c
+++ b/src/compiler/nir/nir_opt_shrink_stores.c
@ -82,7 +82,9 @@ opt_shrink_store_instr(nir_builder *b, nir_intrinsic_instr *instr, bool shrink_i

   /* Trim the num_components stored according to the write mask. */
   unsigned write_mask = nir_intrinsic_write_mask(instr);
-   unsigned last_bit = util_last_bit(write_mask);
+   /* Don't trim down to an invalid number of components, though. */
+   unsigned last_bit = nir_round_up_components(util_last_bit(write_mask));
+
   if (last_bit < instr->num_components) {
      nir_def *def = nir_trim_vector(b, instr->src[0].ssa, last_bit);
      nir_src_rewrite(&instr->src[0], def);
--- a/src/compiler/nir/nir_precompiled.h
+++ b/src/compiler/nir/nir_precompiled.h
@ -652,6 +652,7 @@ nir_precompiled_build_variant(const nir_function *libfunc,

   assert(libfunc->workgroup_size[0] != 0 && "must set workgroup size");

+   b.shader->info.workgroup_size_variable = false;
   b.shader->info.workgroup_size[0] = libfunc->workgroup_size[0];
   b.shader->info.workgroup_size[1] = libfunc->workgroup_size[1];
   b.shader->info.workgroup_size[2] = libfunc->workgroup_size[2];
--- a/src/compiler/spirv/vtn_variables.c
+++ b/src/compiler/spirv/vtn_variables.c
@ -506,8 +506,8 @@ vtn_pointer_dereference(struct vtn_builder *b,
            type = type->array_element;
         }
         tail = nir_build_deref_array(&b->nb, tail, arr_index);
+         tail->arr.in_bounds = deref_chain->in_bounds;
      }
-      tail->arr.in_bounds = deref_chain->in_bounds;

      access |= type->access;
   }
--- a/src/freedreno/fdl/fd6_tiled_memcpy.cc
+++ b/src/freedreno/fdl/fd6_tiled_memcpy.cc
@ -564,7 +564,7 @@ tiled_to_linear_2cpp(char *_tiled, char *_linear, uint32_t linear_pitch)
            "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
   }
 #else
-   memcpy_small<2, LINEAR_TO_TILED, FDL_MACROTILE_4_CHANNEL>(
+   memcpy_small<2, TILED_TO_LINEAR, FDL_MACROTILE_4_CHANNEL>(
      0, 0, 32, 4, _tiled, _linear, linear_pitch, 0, 0, 0);
 #endif
 }
--- a/src/freedreno/ir3/ir3_ra.c
+++ b/src/freedreno/ir3/ir3_ra.c
@ -2300,6 +2300,17 @@ insert_live_out_moves(struct ra_ctx *ctx)
   insert_file_live_out_moves(ctx, &ctx->shared);
 }

+static bool
+has_merge_set_preferred_reg(struct ir3_register *reg)
+{
+   assert(reg->merge_set);
+   assert(reg->num != INVALID_REG);
+
+   return reg->merge_set->preferred_reg != (physreg_t)~0 &&
+          ra_reg_get_physreg(reg) ==
+             reg->merge_set->preferred_reg + reg->merge_set_offset;
+}
+
 static void
 handle_block(struct ra_ctx *ctx, struct ir3_block *block)
 {
@ -2338,17 +2349,15 @@ handle_block(struct ra_ctx *ctx, struct ir3_block *block)
         struct ir3_register *dst = input->dsts[0];
         assert(dst->num != INVALID_REG);

-         physreg_t dst_start = ra_reg_get_physreg(dst);
         physreg_t dst_end;

-         if (dst->merge_set) {
+         if (dst->merge_set && has_merge_set_preferred_reg(dst)) {
            /* Take the whole merge set into account to prevent its range being
             * allocated for defs not part of the merge set.
             */
-            assert(dst_start >= dst->merge_set_offset);
-            dst_end = dst_start - dst->merge_set_offset + dst->merge_set->size;
+            dst_end = dst->merge_set->preferred_reg + dst->merge_set->size;
         } else {
-            dst_end = dst_start + reg_size(dst);
+            dst_end = ra_reg_get_physreg(dst) + reg_size(dst);
         }

         struct ra_file *file = ra_get_file(ctx, dst);
--- a/src/freedreno/vulkan/tu_clear_blit.cc
+++ b/src/freedreno/vulkan/tu_clear_blit.cc
@ -1461,6 +1461,15 @@ r3d_dst_gmem(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
      gmem_offset = tu_attachment_gmem_offset(cmd, att, layer);
   }

+   /* On a7xx we must always use FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8. See
+    * blit_base_format().
+    */
+   if (CHIP >= A7XX && att->format == VK_FORMAT_D24_UNORM_S8_UINT) {
+      RB_MRT_BUF_INFO = pkt_field_set(A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT,
+                                      RB_MRT_BUF_INFO,
+                                      FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8);
+   }
+
   tu_cs_emit_regs(cs,
                   RB_MRT_BUF_INFO(CHIP, 0, .dword = RB_MRT_BUF_INFO),
                   A6XX_RB_MRT_PITCH(0, 0),
@ -1533,7 +1542,8 @@ r3d_setup(struct tu_cmd_buffer *cmd,
      tu_cs_emit_call(cs, cmd->device->dbg_renderpass_stomp_cs);
   }

-   enum a6xx_format fmt = blit_base_format<CHIP>(dst_format, ubwc, false);
+   enum a6xx_format fmt = blit_base_format<CHIP>(dst_format, ubwc, 
+                                                 blit_param & R3D_DST_GMEM);
   fixup_dst_format(src_format, &dst_format, &fmt);

   if (!cmd->state.pass) {
@ -4638,7 +4648,7 @@ clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
   enum pipe_format format = vk_format_to_pipe_format(vk_format);
   const struct tu_framebuffer *fb = cmd->state.framebuffer;
   const struct tu_image_view *iview = cmd->state.attachments[a];
-   const uint32_t clear_views = cmd->state.pass->attachments[a].clear_views;
+   const uint32_t clear_views = cmd->state.pass->attachments[a].used_views;
   const struct blit_ops *ops = &r2d_ops<CHIP>;
   const VkClearValue *value = &cmd->state.clear_values[a];
   if (cmd->state.pass->attachments[a].samples > 1)
@ -4734,7 +4744,7 @@ tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,

   tu_emit_clear_gmem_attachment<CHIP>(cmd, cs, resolve_group, a, 0,
                                 cmd->state.framebuffer->layers,
-                                 attachment->clear_views,
+                                 attachment->used_views,
                                 attachment->clear_mask,
                                 &cmd->state.clear_values[a], NULL);
 }
@ -4755,7 +4765,7 @@ tu7_generic_clear_attachment(struct tu_cmd_buffer *cmd,
                             iview->view.ubwc_enabled, att->samples);

   enum pipe_format format = vk_format_to_pipe_format(att->format);
-   for_each_layer(i, att->clear_views, cmd->state.framebuffer->layers) {
+   for_each_layer(i, att->used_views, cmd->state.framebuffer->layers) {
      uint32_t layer = i + 0;
      uint32_t mask =
         aspect_write_mask_generic_clear(format, att->clear_mask);
@ -4836,7 +4846,7 @@ tu_emit_blit(struct tu_cmd_buffer *cmd,
   uint32_t buffer_id = tu_resolve_group_include_buffer<CHIP>(resolve_group, format);
   event_blit_setup(cs, buffer_id, attachment, blit_event_type, clear_mask);

-   for_each_layer(i, attachment->clear_views, cmd->state.framebuffer->layers) {
+   for_each_layer(i, attachment->used_views, cmd->state.framebuffer->layers) {
      event_blit_dst_view blt_view = blt_view_from_tu_view(iview, i);
      event_blit_run<CHIP>(cmd, cs, attachment, &blt_view, separate_stencil);
   }
@ -4951,7 +4961,7 @@ load_3d_blit(struct tu_cmd_buffer *cmd,
   /* Wait for CACHE_INVALIDATE to land */
   tu_cs_emit_wfi(cs);

-   for_each_layer(i, att->clear_views, cmd->state.framebuffer->layers) {
+   for_each_layer(i, att->used_views, cmd->state.framebuffer->layers) {
      if (cmd->state.pass->has_fdm) {
         struct apply_load_coords_state state = {
            .view = i,
--- a/src/freedreno/vulkan/tu_cmd_buffer.cc
+++ b/src/freedreno/vulkan/tu_cmd_buffer.cc
@ -1616,7 +1616,7 @@ tu6_emit_gmem_stores(struct tu_cmd_buffer *cmd,
            scissor_emitted = true;
         }
         tu_store_gmem_attachment<CHIP>(cmd, cs, resolve_group, a, a,
-                                  fb->layers, subpass->multiview_mask,
+                                  fb->layers, att->used_views,
                                  cond_exec_allowed);
      }
   }
--- a/src/freedreno/vulkan/tu_descriptor_set.cc
+++ b/src/freedreno/vulkan/tu_descriptor_set.cc
@ -208,8 +208,8 @@ tu_CreateDescriptorSetLayout(
      if (binding->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
         set_layout->has_inline_uniforms = true;

-      if (variable_flags && binding->binding < variable_flags->bindingCount &&
-          (variable_flags->pBindingFlags[binding->binding] &
+      if (variable_flags && j < variable_flags->bindingCount &&
+          (variable_flags->pBindingFlags[j] &
           VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT)) {
         assert(!binding->pImmutableSamplers); /* Terribly ill defined  how
                                                  many samplers are valid */
@ -377,7 +377,7 @@ tu_GetDescriptorSetLayoutSupport(
      uint64_t max_count = MAX_SET_SIZE;
      unsigned descriptor_count = binding->descriptorCount;
      if (binding->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
-         max_count = MAX_SET_SIZE - size;
+         max_count = MAX_INLINE_UBO_RANGE - size;
         descriptor_count = descriptor_sz;
         descriptor_sz = 1;
      } else if (descriptor_sz) {
@ -388,9 +388,9 @@ tu_GetDescriptorSetLayoutSupport(
         supported = false;
      }

-      if (variable_flags && binding->binding < variable_flags->bindingCount &&
+      if (variable_flags && i < variable_flags->bindingCount &&
          variable_count &&
-          (variable_flags->pBindingFlags[binding->binding] &
+          (variable_flags->pBindingFlags[i] &
           VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT)) {
         variable_count->maxVariableDescriptorCount =
            MIN2(UINT32_MAX, max_count);
--- a/src/freedreno/vulkan/tu_pass.cc
+++ b/src/freedreno/vulkan/tu_pass.cc
@ -417,7 +417,8 @@ tu_render_pass_patch_input_gmem(struct tu_render_pass *pass)
         uint32_t a = subpass->input_attachments[j].attachment;
         if (a == VK_ATTACHMENT_UNUSED)
            continue;
-         subpass->input_attachments[j].patch_input_gmem = written[a];
+         subpass->input_attachments[j].patch_input_gmem =
+            written[a] && pass->attachments[a].gmem;
      }

      for (unsigned j = 0; j < subpass->color_count; j++) {
@ -884,7 +885,7 @@ tu_subpass_use_attachment(struct tu_render_pass *pass, int i, uint32_t a, const

   att->gmem = true;
   update_samples(subpass, pCreateInfo->pAttachments[a].samples);
-   att->clear_views |= subpass->multiview_mask;
+   att->used_views |= subpass->multiview_mask;

   /* Loads and clears are emitted at the start of the subpass that needs them. */
   att->first_subpass_idx = MIN2(i, att->first_subpass_idx);
@ -1126,6 +1127,7 @@ tu_CreateRenderPass2(VkDevice _device,
      if (!att->gmem) {
         att->clear_mask = 0;
         att->load = false;
+         att->load_stencil = false;
      }
   }

@ -1235,7 +1237,7 @@ tu_setup_dynamic_render_pass(struct tu_cmd_buffer *cmd_buffer,
      VK_FROM_HANDLE(tu_image_view, view, att_info->imageView);
      tu_setup_dynamic_attachment(att, view);
      att->gmem = true;
-      att->clear_views = info->viewMask;
+      att->used_views = info->viewMask;
      attachment_set_ops(device, att, att_info->loadOp,
                         VK_ATTACHMENT_LOAD_OP_DONT_CARE, att_info->storeOp,
                         VK_ATTACHMENT_STORE_OP_DONT_CARE);
@ -1279,7 +1281,7 @@ tu_setup_dynamic_render_pass(struct tu_cmd_buffer *cmd_buffer,
         struct tu_render_pass_attachment *att = &pass->attachments[a];
         tu_setup_dynamic_attachment(att, view);
         att->gmem = true;
-         att->clear_views = info->viewMask;
+         att->used_views = info->viewMask;
         subpass->depth_stencil_attachment.attachment = a++;
         subpass->input_attachments[0].attachment =
            subpass->depth_stencil_attachment.attachment;
--- a/src/freedreno/vulkan/tu_pass.h
+++ b/src/freedreno/vulkan/tu_pass.h
@ -94,7 +94,19 @@ struct tu_render_pass_attachment
   VkSampleCountFlagBits samples;
   uint32_t cpp;
   VkImageAspectFlags clear_mask;
-   uint32_t clear_views;
+
+   /* All views that are used with the attachment in all subpasses. Used to
+    * determine which views to apply loadOp/storeOp to.
+    */
+   uint32_t used_views;
+   /* The internal MSRTSS attachment to clear when the user says to clear
+    * this attachment. Clear values must be remapped to this attachment.
+    */
+   uint32_t remapped_clear_att;
+   /* For internal attachments created for MSRTSS, the original user attachment
+    * which it is resolved/unresolved to.
+    */
+   uint32_t user_att;
   bool load;
   bool store;
   bool gmem;
--- a/src/freedreno/vulkan/tu_pipeline.cc
+++ b/src/freedreno/vulkan/tu_pipeline.cc
@ -3157,8 +3157,6 @@ tu6_emit_blend(struct tu_cs *cs,

   bool dual_src_blend = tu_blend_state_is_dual_src(cb);

-   tu_cs_emit_regs(cs, A6XX_SP_PS_MRT_CNTL(.mrt = num_rts));
-   tu_cs_emit_regs(cs, A6XX_RB_PS_MRT_CNTL(.mrt = num_rts));
   tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL(.enable_blend = blend_enable_mask,
                                          .unk8 = true,
                                          .dual_color_in_enable =
@ -3180,10 +3178,12 @@ tu6_emit_blend(struct tu_cs *cs,
                                          .alpha_to_one = alpha_to_one_enable,
                                          .sample_mask = sample_mask));

+   unsigned num_remapped_rts = 0;
   for (unsigned i = 0; i < num_rts; i++) {
      if (cal->color_map[i] == MESA_VK_ATTACHMENT_UNUSED)
         continue;
      unsigned remapped_idx = cal->color_map[i];
+      num_remapped_rts = MAX2(num_remapped_rts, remapped_idx + 1);
      const struct vk_color_blend_attachment_state *att = &cb->attachments[i];
      if ((cb->color_write_enables & (1u << i)) && i < cb->attachment_count) {
         const enum a3xx_rb_blend_opcode color_op = tu6_blend_op(att->color_blend_op);
@ -3227,6 +3227,8 @@ tu6_emit_blend(struct tu_cs *cs,
                            A6XX_RB_MRT_BLEND_CONTROL(remapped_idx,));
      }
   }
+   tu_cs_emit_regs(cs, A6XX_SP_PS_MRT_CNTL(.mrt = num_remapped_rts));
+   tu_cs_emit_regs(cs, A6XX_RB_PS_MRT_CNTL(.mrt = num_remapped_rts));
 }

 static const enum mesa_vk_dynamic_graphics_state tu_blend_constants_state[] = {
--- a/src/gallium/auxiliary/gallivm/lp_bld.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld.h
@ -88,10 +88,9 @@
 #define LLVMCreateBuilder ILLEGAL_LLVM_FUNCTION

 typedef struct lp_context_ref {
-#if GALLIVM_USE_ORCJIT
-   LLVMOrcThreadSafeContextRef ref;
-#else
   LLVMContextRef ref;
+#if GALLIVM_USE_ORCJIT
+   LLVMOrcThreadSafeContextRef tsref;
 #endif
   bool owned;
 } lp_context_ref;
@ -101,18 +100,21 @@ lp_context_create(lp_context_ref *context)
 {
   assert(context != NULL);
 #if GALLIVM_USE_ORCJIT
-   context->ref = LLVMOrcCreateNewThreadSafeContext();
+#if LLVM_VERSION_MAJOR >= 21
+   context->ref = LLVMContextCreate();
+   /* Ownership of ref is then transferred to tsref */
+   context->tsref = LLVMOrcCreateNewThreadSafeContextFromLLVMContext(context->ref);
+#else
+   context->tsref = LLVMOrcCreateNewThreadSafeContext();
+   context->ref = LLVMOrcThreadSafeContextGetContext(context->tsref);
+#endif
 #else
   context->ref = LLVMContextCreate();
 #endif
   context->owned = true;
 #if LLVM_VERSION_MAJOR == 15
   if (context->ref) {
-#if GALLIVM_USE_ORCJIT
-      LLVMContextSetOpaquePointers(LLVMOrcThreadSafeContextGetContext(context->ref), false);
-#else
      LLVMContextSetOpaquePointers(context->ref, false);
-#endif
   }
 #endif
 }
@ -123,7 +125,7 @@ lp_context_destroy(lp_context_ref *context)
   assert(context != NULL);
   if (context->owned) {
 #if GALLIVM_USE_ORCJIT
-      LLVMOrcDisposeThreadSafeContext(context->ref);
+      LLVMOrcDisposeThreadSafeContext(context->tsref);
 #else
      LLVMContextDispose(context->ref);
 #endif
--- a/src/gallium/auxiliary/gallivm/lp_bld_init_orc.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init_orc.cpp
@ -555,8 +555,8 @@ init_gallivm_state(struct gallivm_state *gallivm, const char *name,

   gallivm->cache = cache;

-   gallivm->_ts_context = context->ref;
-   gallivm->context = LLVMContextCreate();
+   gallivm->_ts_context = context->tsref;
+   gallivm->context = context->ref;

   gallivm->module_name = LPJit::get_unique_name(name);
   gallivm->module = LLVMModuleCreateWithNameInContext(gallivm->module_name,
--- a/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_nir_soa.c
@ -3163,7 +3163,7 @@ do_int_divide(struct lp_build_nir_soa_context *bld,

 static LLVMValueRef
 do_int_mod(struct lp_build_nir_soa_context *bld,
-           bool is_unsigned, unsigned src_bit_size,
+           bool is_unsigned, bool use_src2_sign, unsigned src_bit_size,
           LLVMValueRef src, LLVMValueRef src2)
 {
   struct gallivm_state *gallivm = bld->base.gallivm;
@ -3180,8 +3180,18 @@ do_int_mod(struct lp_build_nir_soa_context *bld,
      divisor = get_signed_divisor(gallivm, int_bld, mask_bld,
                                   src_bit_size, src, divisor);
   }
-   LLVMValueRef result = lp_build_mod(int_bld, src, divisor);
-   return LLVMBuildOr(builder, div_mask, result, "");
+   LLVMValueRef rem = lp_build_mod(int_bld, src, divisor);
+   rem = LLVMBuildOr(builder, div_mask, rem, "");
+
+   if (use_src2_sign) {
+      LLVMValueRef add_src2 = LLVMBuildICmp(builder, LLVMIntNE, rem, int_bld->zero, "");
+      LLVMValueRef signs_different = LLVMBuildXor(builder, LLVMBuildICmp(builder, LLVMIntSLT, src, int_bld->zero, ""),
+                                                  LLVMBuildICmp(builder, LLVMIntSLT, src2, int_bld->zero, ""), "");
+      add_src2 = LLVMBuildAnd(builder, add_src2, signs_different, "");
+      rem = LLVMBuildSelect(builder, add_src2, LLVMBuildAdd(builder, rem, src2, ""), rem, "");
+   }
+
+   return rem;
 }

 static LLVMValueRef
@ -3493,7 +3503,7 @@ do_alu_action(struct lp_build_nir_soa_context *bld,
      break;
   case nir_op_imod:
   case nir_op_irem:
-      result = do_int_mod(bld, false, src_bit_size[0], src[0], src[1]);
+      result = do_int_mod(bld, false, instr->op == nir_op_imod, src_bit_size[0], src[0], src[1]);
      break;
   case nir_op_ishl: {
      if (src_bit_size[0] == 64)
@ -3592,7 +3602,7 @@ do_alu_action(struct lp_build_nir_soa_context *bld,
      result = lp_build_min(uint_bld, src[0], src[1]);
      break;
   case nir_op_umod:
-      result = do_int_mod(bld, true, src_bit_size[0], src[0], src[1]);
+      result = do_int_mod(bld, true, false, src_bit_size[0], src[0], src[1]);
      break;
   case nir_op_umul_high: {
      LLVMValueRef hi_bits;
--- a/src/gallium/drivers/asahi/agx_batch.c
+++ b/src/gallium/drivers/asahi/agx_batch.c
@ -634,8 +634,8 @@ asahi_add_attachment(struct attachments *att, struct agx_resource *rsrc)
   assert(att->count < MAX_ATTACHMENTS);

   att->list[att->count++] = (struct drm_asahi_attachment){
-      .size = rsrc->layout.size_B,
-      .pointer = rsrc->bo->va->addr,
+      .size = rsrc->layout.size_B - rsrc->layout.level_offsets_B[0],
+      .pointer = agx_map_gpu(rsrc),
   };
 }

--- a/src/gallium/drivers/asahi/agx_pipe.c
+++ b/src/gallium/drivers/asahi/agx_pipe.c
@ -210,13 +210,13 @@ agx_resource_from_handle(struct pipe_screen *pscreen,

   if (rsc->layout.tiling == AIL_TILING_LINEAR) {
      rsc->layout.linear_stride_B = whandle->stride;
-   } else if (whandle->stride != ail_get_wsi_stride_B(&rsc->layout, 0)) {
+      rsc->layout.level_offsets_B[0] = whandle->offset;
+   } else if (whandle->stride != ail_get_wsi_stride_B(&rsc->layout, 0) ||
+              whandle->offset != 0) {
      FREE(rsc);
      return NULL;
   }

-   assert(whandle->offset == 0);
-
   ail_make_miptree(&rsc->layout);

   if (prsc->target == PIPE_BUFFER) {
@ -301,7 +301,8 @@ agx_resource_get_param(struct pipe_screen *pscreen, struct pipe_context *pctx,
                       enum pipe_resource_param param, unsigned usage,
                       uint64_t *value)
 {
-   struct agx_resource *rsrc = (struct agx_resource *)prsc;
+   struct agx_resource *rsrc =
+      (struct agx_resource *)util_resource_at_index(prsc, plane);

   switch (param) {
   case PIPE_RESOURCE_PARAM_STRIDE:
@ -1292,7 +1293,7 @@ agx_cmdbuf(struct agx_device *dev, struct drm_asahi_cmd_render *c,

         if (zres->layout.compressed) {
            c->depth.comp_base =
-               agx_map_texture_gpu(zres, 0) + zres->layout.metadata_offset_B +
+               agx_map_gpu(zres) + zres->layout.metadata_offset_B +
               (first_layer * zres->layout.compression_layer_stride_B) +
               zres->layout.level_offsets_compressed_B[level];

@ -1329,7 +1330,7 @@ agx_cmdbuf(struct agx_device *dev, struct drm_asahi_cmd_render *c,

         if (sres->layout.compressed) {
            c->stencil.comp_base =
-               agx_map_texture_gpu(sres, 0) + sres->layout.metadata_offset_B +
+               agx_map_gpu(sres) + sres->layout.metadata_offset_B +
               (first_layer * sres->layout.compression_layer_stride_B) +
               sres->layout.level_offsets_compressed_B[level];

--- a/src/gallium/drivers/asahi/agx_query.c
+++ b/src/gallium/drivers/asahi/agx_query.c
@ -503,7 +503,7 @@ agx_get_query_result_resource_gpu(struct agx_context *ctx,
                                                         : 0;

   libagx_copy_query_gl(batch, agx_1d(1), AGX_BARRIER_ALL, query->ptr.gpu,
-                        rsrc->bo->va->addr + offset, result_type, bool_size);
+                        agx_map_gpu(rsrc) + offset, result_type, bool_size);
   return true;
 }

--- a/src/gallium/drivers/asahi/agx_state.c
+++ b/src/gallium/drivers/asahi/agx_state.c
@ -726,7 +726,7 @@ agx_pack_texture(void *out, struct agx_resource *rsrc,

      if (rsrc->layout.compressed) {
         cfg.acceleration_buffer =
-            agx_map_texture_gpu(rsrc, 0) + rsrc->layout.metadata_offset_B +
+            agx_map_gpu(rsrc) + rsrc->layout.metadata_offset_B +
            (first_layer * rsrc->layout.compression_layer_stride_B);
      }

@ -1262,7 +1262,7 @@ agx_batch_upload_pbe(struct agx_batch *batch, struct agx_pbe_packed *out,
         cfg.extended = true;

         cfg.acceleration_buffer =
-            agx_map_texture_gpu(tex, 0) + tex->layout.metadata_offset_B +
+            agx_map_gpu(tex) + tex->layout.metadata_offset_B +
            (layer * tex->layout.compression_layer_stride_B);
      }

@ -3756,8 +3756,9 @@ agx_index_buffer_rsrc_ptr(struct agx_batch *batch,
   struct agx_resource *rsrc = agx_resource(info->index.resource);
   agx_batch_reads(batch, rsrc);

-   *extent = ALIGN_POT(rsrc->layout.size_B, 4);
-   return rsrc->bo->va->addr;
+   *extent =
+      ALIGN_POT(rsrc->layout.size_B - rsrc->layout.level_offsets_B[0], 4);
+   return agx_map_gpu(rsrc);
 }

 static uint64_t
@ -3948,7 +3949,7 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer,
      params.xfb_size[i] = size;

      if (rsrc) {
-         params.xfb_offs_ptrs[i] = rsrc->bo->va->addr;
+         params.xfb_offs_ptrs[i] = agx_map_gpu(rsrc);
         agx_batch_writes(batch, rsrc, 0);
         batch->incoherent_writes = true;
      }
@ -4054,7 +4055,7 @@ agx_indirect_buffer_ptr(struct agx_batch *batch,

   struct agx_resource *rsrc = agx_resource(indirect->buffer);
   agx_batch_reads(batch, rsrc);
-   return rsrc->bo->va->addr + indirect->offset;
+   return agx_map_gpu(rsrc) + indirect->offset;
 }

 static void
@ -5388,7 +5389,7 @@ agx_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
   if (info->indirect) {
      struct agx_resource *rsrc = agx_resource(info->indirect);
      agx_batch_reads(batch, rsrc);
-      indirect = rsrc->bo->va->addr + info->indirect_offset;
+      indirect = agx_map_gpu(rsrc) + info->indirect_offset;
   }

   /* Increment the pipeline stats query.
@ -5493,7 +5494,7 @@ agx_set_global_binding(struct pipe_context *pipe, unsigned first,
         struct agx_resource *rsrc = agx_resource(resources[i]);

         memcpy(&addr, handles[i], sizeof(addr));
-         addr += rsrc->bo->va->addr;
+         addr += agx_map_gpu(rsrc);
         memcpy(handles[i], &addr, sizeof(addr));
      } else {
         pipe_resource_reference(res, NULL);
@ -5534,7 +5535,7 @@ agx_decompress_inplace(struct agx_batch *batch, struct pipe_surface *surf,
                                 surf->last_layer - surf->first_layer + 1);

   libagx_decompress(batch, grid, AGX_BARRIER_ALL, layout, surf->first_layer,
-                     level, agx_map_texture_gpu(rsrc, 0), images.gpu);
+                     level, agx_map_gpu(rsrc), images.gpu);
 }

 void
--- a/src/gallium/drivers/asahi/agx_state.h
+++ b/src/gallium/drivers/asahi/agx_state.h
@ -970,10 +970,16 @@ agx_map_texture_cpu(struct agx_resource *rsrc, unsigned level, unsigned z)
          ail_get_layer_level_B(&rsrc->layout, z, level);
 }

+static inline uint64_t
+agx_map_gpu(struct agx_resource *rsrc)
+{
+   return rsrc->bo->va->addr + rsrc->layout.level_offsets_B[0];
+}
+
 static inline uint64_t
 agx_map_texture_gpu(struct agx_resource *rsrc, unsigned z)
 {
-   return rsrc->bo->va->addr +
+   return agx_map_gpu(rsrc) +
          (uint64_t)ail_get_layer_offset_B(&rsrc->layout, z);
 }

--- a/src/gallium/drivers/asahi/agx_streamout.c
+++ b/src/gallium/drivers/asahi/agx_streamout.c
@ -116,7 +116,7 @@ agx_batch_get_so_address(struct agx_batch *batch, unsigned buffer,
                          target->buffer_size);

   *size = target->buffer_size;
-   return rsrc->bo->va->addr + target->buffer_offset;
+   return agx_map_gpu(rsrc) + target->buffer_offset;
 }

 void
--- a/src/gallium/drivers/asahi/agx_uniforms.c
+++ b/src/gallium/drivers/asahi/agx_uniforms.c
@ -3,12 +3,9 @@
 * SPDX-License-Identifier: MIT
 */
 #include <stdio.h>
-#include "asahi/genxml/agx_pack.h"
 #include "pipe/p_state.h"
-#include "util/format/u_format.h"
 #include "util/half_float.h"
 #include "util/macros.h"
-#include "agx_abi.h"
 #include "agx_device.h"
 #include "agx_state.h"
 #include "pool.h"
@ -19,8 +16,7 @@ agx_const_buffer_ptr(struct agx_batch *batch, struct pipe_constant_buffer *cb)
   if (cb->buffer) {
      struct agx_resource *rsrc = agx_resource(cb->buffer);
      agx_batch_reads(batch, rsrc);
-
-      return rsrc->bo->va->addr + cb->buffer_offset;
+      return agx_map_gpu(rsrc) + cb->buffer_offset;
   } else {
      return 0;
   }
@ -42,8 +38,9 @@ agx_upload_vbos(struct agx_batch *batch)
         struct agx_resource *rsrc = agx_resource(vb.buffer.resource);
         agx_batch_reads(batch, rsrc);

-         buffers[vbo] = rsrc->bo->va->addr + vb.buffer_offset;
-         buf_sizes[vbo] = rsrc->layout.size_B - vb.buffer_offset;
+         buffers[vbo] = agx_map_gpu(rsrc) + vb.buffer_offset;
+         buf_sizes[vbo] = rsrc->layout.size_B - vb.buffer_offset -
+                          rsrc->layout.level_offsets_B[0];
      }
   }

@ -144,7 +141,7 @@ agx_set_ssbo_uniforms(struct agx_batch *batch, mesa_shader_stage stage)
            agx_batch_reads(batch, rsrc);
         }

-         unif->ssbo_base[cb] = rsrc->bo->va->addr + sb->buffer_offset;
+         unif->ssbo_base[cb] = agx_map_gpu(rsrc) + sb->buffer_offset;
         unif->ssbo_size[cb] = st->ssbo[cb].buffer_size;
      } else {
         /* Invalid, so use the sink */
--- a/src/gallium/drivers/iris/iris_program.c
+++ b/src/gallium/drivers/iris/iris_program.c
@ -223,9 +223,9 @@ iris_apply_brw_tes_prog_data(struct iris_compiled_shader *shader,

   iris_apply_brw_vue_prog_data(&brw->base, &iris->base);

-   iris->partitioning         = brw->partitioning;
-   iris->output_topology      = brw->output_topology;
-   iris->domain               = brw->domain;
+   iris->partitioning         = brw_tess_info_partitioning(brw->tess_info);
+   iris->output_topology      = brw_tess_info_output_topology(brw->tess_info);
+   iris->domain               = brw_tess_info_domain(brw->tess_info);
   iris->include_primitive_id = brw->include_primitive_id;
 }

--- a/src/gallium/drivers/llvmpipe/lp_bld_interp.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
@ -338,7 +338,6 @@ attribs_update_simple(struct lp_build_interp_soa_context *bld,
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_build_context *coeff_bld = &bld->coeff_bld;
   struct lp_build_context *setup_bld = &bld->setup_bld;
-   LLVMValueRef oow = NULL;
   LLVMValueRef pixoffx;
   LLVMValueRef pixoffy;
   LLVMValueRef ptr;
@ -425,25 +424,23 @@ attribs_update_simple(struct lp_build_interp_soa_context *bld,
               }

               if (interp == LP_INTERP_PERSPECTIVE) {
-                  if (oow == NULL) {
-                     LLVMValueRef w;
-                     assert(attrib != 0);
-                     assert(bld->mask[0] & TGSI_WRITEMASK_W);
-                     if (bld->coverage_samples > 1 &&
-                         (loc == TGSI_INTERPOLATE_LOC_SAMPLE ||
-                          loc == TGSI_INTERPOLATE_LOC_CENTROID)) {
-                        /*
-                         * We can't use the precalculated 1/w since we didn't know
-                         * the actual position yet (we were assuming center).
-                         */
-                        LLVMValueRef indexw = lp_build_const_int32(gallivm, 3);
-                        w = interp_attrib_linear(bld, 0, indexw, chan_pixoffx, chan_pixoffy);
-                     }
-                     else {
-                        w = bld->attribs[0][3];
-                     }
-                     oow = lp_build_rcp(coeff_bld, w);
+                  LLVMValueRef w;
+                  assert(attrib != 0);
+                  assert(bld->mask[0] & TGSI_WRITEMASK_W);
+                  if (bld->coverage_samples > 1 &&
+                      (loc == TGSI_INTERPOLATE_LOC_SAMPLE ||
+                       loc == TGSI_INTERPOLATE_LOC_CENTROID)) {
+                     /*
+                      * We can't use the precalculated 1/w since we didn't know
+                      * the actual position yet (we were assuming center).
+                      */
+                     LLVMValueRef indexw = lp_build_const_int32(gallivm, 3);
+                     w = interp_attrib_linear(bld, 0, indexw, chan_pixoffx, chan_pixoffy);
                  }
+                  else {
+                     w = bld->attribs[0][3];
+                  }
+                  LLVMValueRef oow = lp_build_rcp(coeff_bld, w);
                  a = lp_build_mul(coeff_bld, a, oow);
               }

--- a/src/gallium/drivers/r300/ci/r300-rs740-fails.txt
+++ b/src/gallium/drivers/r300/ci/r300-rs740-fails.txt
@ -1236,44 +1236,6 @@ spec@nv_texture_env_combine4@nv_texture_env_combine4-combine,Fail

 spec@oes_texture_float@oes_texture_float half,Fail

-# Remaining fallout from 9d359c6d10adb1cd2978a0e13714a3f98544aae8
-spec@arb_texture_compression@fbo-generatemipmap-formats,Fail
-spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_RGB,Fail
-spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_RGB NPOT,Fail
-spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA,Fail
-spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA NPOT,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_rgb_s3tc_dxt1_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_rgba_s3tc_dxt1_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_rgba_s3tc_dxt3_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_rgba_s3tc_dxt5_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_srgb_alpha_s3tc_dxt1_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_srgb_alpha_s3tc_dxt3_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_srgb_alpha_s3tc_dxt5_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_srgb_s3tc_dxt1_ext,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT1_EXT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT1_EXT NPOT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT3_EXT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT3_EXT NPOT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT5_EXT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT5_EXT NPOT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGB_S3TC_DXT1_EXT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGB_S3TC_DXT1_EXT NPOT,Fail
-spec@ext_texture_compression_s3tc@gen-compressed-teximage,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB NPOT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA NPOT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT NPOT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT NPOT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT NPOT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_S3TC_DXT1_EXT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_S3TC_DXT1_EXT NPOT,Fail
-
 # uprev Piglit in Mesa
 spec@!opengl 1.1@teximage-scale-bias,Fail
 spec@glsl-1.10@execution@glsl-fs-texture2d-mipmap-const-bias-01,Fail
--- a/src/gallium/drivers/r300/ci/r300-rv410-fails.txt
+++ b/src/gallium/drivers/r300/ci/r300-rv410-fails.txt
@ -1280,44 +1280,6 @@ spec@nv_texture_env_combine4@nv_texture_env_combine4-combine,Fail

 spec@oes_texture_float@oes_texture_float half,Fail

-# Remaining fallout from 9d359c6d10adb1cd2978a0e13714a3f98544aae8
-spec@arb_texture_compression@fbo-generatemipmap-formats,Fail
-spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_RGB,Fail
-spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_RGB NPOT,Fail
-spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA,Fail
-spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA NPOT,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_rgb_s3tc_dxt1_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_rgba_s3tc_dxt1_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_rgba_s3tc_dxt3_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_rgba_s3tc_dxt5_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_srgb_alpha_s3tc_dxt1_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_srgb_alpha_s3tc_dxt3_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_srgb_alpha_s3tc_dxt5_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_srgb_s3tc_dxt1_ext,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT1_EXT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT1_EXT NPOT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT3_EXT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT3_EXT NPOT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT5_EXT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT5_EXT NPOT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGB_S3TC_DXT1_EXT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGB_S3TC_DXT1_EXT NPOT,Fail
-spec@ext_texture_compression_s3tc@gen-compressed-teximage,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB NPOT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA NPOT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT NPOT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT NPOT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT NPOT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_S3TC_DXT1_EXT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_S3TC_DXT1_EXT NPOT,Fail
-
 # uprev Piglit in Mesa
 spec@!opengl 1.1@teximage-scale-bias,Fail
 spec@glsl-1.10@execution@glsl-fs-texture2d-mipmap-const-bias-01,Fail
--- a/src/gallium/drivers/r300/ci/r300-rv530-nohiz-fails.txt
+++ b/src/gallium/drivers/r300/ci/r300-rv530-nohiz-fails.txt
@ -778,78 +778,6 @@ dEQP-GLES2.functional.texture.mipmap.2d.projected.linear_nearest_repeat,Fail
 dEQP-GLES2.functional.texture.mipmap.2d.projected.linear_nearest_mirror,Fail
 dEQP-GLES2.functional.texture.mipmap.2d.projected.linear_nearest_clamp,Fail

-# Remaining fallout from 9d359c6d10adb1cd2978a0e13714a3f98544aae8
-spec@arb_texture_compression@fbo-generatemipmap-formats,Fail
-spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_LUMINANCE,Fail
-spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_LUMINANCE NPOT,Fail
-spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_LUMINANCE_ALPHA,Fail
-spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_LUMINANCE_ALPHA NPOT,Fail
-spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_RGB,Fail
-spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_RGB NPOT,Fail
-spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA,Fail
-spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA NPOT,Fail
-spec@ati_texture_compression_3dc@fbo-generatemipmap-formats,Fail
-spec@ati_texture_compression_3dc@fbo-generatemipmap-formats@GL_COMPRESSED_LUMINANCE_ALPHA_3DC_ATI,Fail
-spec@ati_texture_compression_3dc@fbo-generatemipmap-formats@GL_COMPRESSED_LUMINANCE_ALPHA_3DC_ATI NPOT,Fail
-spec@ext_texture_compression_latc@fbo-generatemipmap-formats,Fail
-spec@ext_texture_compression_latc@fbo-generatemipmap-formats-signed,Fail
-spec@ext_texture_compression_latc@fbo-generatemipmap-formats-signed@GL_COMPRESSED_SIGNED_LUMINANCE_ALPHA_LATC2_EXT,Fail
-spec@ext_texture_compression_latc@fbo-generatemipmap-formats-signed@GL_COMPRESSED_SIGNED_LUMINANCE_ALPHA_LATC2_EXT NPOT,Fail
-spec@ext_texture_compression_latc@fbo-generatemipmap-formats-signed@GL_COMPRESSED_SIGNED_LUMINANCE_LATC1_EXT,Fail
-spec@ext_texture_compression_latc@fbo-generatemipmap-formats-signed@GL_COMPRESSED_SIGNED_LUMINANCE_LATC1_EXT NPOT,Fail
-spec@ext_texture_compression_latc@fbo-generatemipmap-formats@GL_COMPRESSED_LUMINANCE_ALPHA_LATC2_EXT,Fail
-spec@ext_texture_compression_latc@fbo-generatemipmap-formats@GL_COMPRESSED_LUMINANCE_ALPHA_LATC2_EXT NPOT,Fail
-spec@ext_texture_compression_latc@fbo-generatemipmap-formats@GL_COMPRESSED_LUMINANCE_LATC1_EXT,Fail
-spec@ext_texture_compression_latc@fbo-generatemipmap-formats@GL_COMPRESSED_LUMINANCE_LATC1_EXT NPOT,Fail
-spec@ext_texture_compression_rgtc@compressedteximage gl_compressed_red_green_rgtc2_ext,Fail
-spec@ext_texture_compression_rgtc@compressedteximage gl_compressed_signed_red_green_rgtc2_ext,Fail
-spec@ext_texture_compression_rgtc@compressedteximage gl_compressed_signed_red_rgtc1_ext,Fail
-spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats,Fail
-spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats-signed,Fail
-spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats-signed@GL_COMPRESSED_SIGNED_RED_RGTC1,Fail
-spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats-signed@GL_COMPRESSED_SIGNED_RED_RGTC1 NPOT,Fail
-spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats-signed@GL_COMPRESSED_SIGNED_RG_RGTC2,Fail
-spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats-signed@GL_COMPRESSED_SIGNED_RG_RGTC2 NPOT,Fail
-spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats@GL_COMPRESSED_RED,Fail
-spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats@GL_COMPRESSED_RED NPOT,Fail
-spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats@GL_COMPRESSED_RED_RGTC1,Fail
-spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats@GL_COMPRESSED_RED_RGTC1 NPOT,Fail
-spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats@GL_COMPRESSED_RG,Fail
-spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats@GL_COMPRESSED_RG_RGTC2,Fail
-spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats@GL_COMPRESSED_RG_RGTC2 NPOT,Fail
-spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats@GL_COMPRESSED_RG NPOT,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_rgb_s3tc_dxt1_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_rgba_s3tc_dxt1_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_rgba_s3tc_dxt3_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_rgba_s3tc_dxt5_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_srgb_alpha_s3tc_dxt1_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_srgb_alpha_s3tc_dxt3_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_srgb_alpha_s3tc_dxt5_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_srgb_s3tc_dxt1_ext,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT1_EXT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT1_EXT NPOT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT3_EXT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT3_EXT NPOT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT5_EXT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT5_EXT NPOT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGB_S3TC_DXT1_EXT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGB_S3TC_DXT1_EXT NPOT,Fail
-spec@ext_texture_compression_s3tc@gen-compressed-teximage,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB NPOT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA NPOT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT NPOT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT NPOT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT NPOT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_S3TC_DXT1_EXT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_S3TC_DXT1_EXT NPOT,Fail
-
 # uprev Piglit in Mesa
 spec@!opengl 1.1@teximage-scale-bias,Fail
 spec@ext_framebuffer_multisample@accuracy all_samples color depthstencil linear,Fail
--- a/src/gallium/drivers/r300/r300_state.c
+++ b/src/gallium/drivers/r300/r300_state.c
@ -947,6 +947,32 @@ r300_set_framebuffer_state(struct pipe_context* pipe,
    util_framebuffer_init(pipe, state, r300->fb_cbufs, &r300->fb_zsbuf);
    util_copy_framebuffer_state(r300->fb_state.state, state);

+    /* DXTC blits require that blocks are 2x1 or 4x1 pixels, but
+     * pipe_surface_width sets the framebuffer width as if blocks were 1x1
+     * pixels. Override the width to correct that.
+     */
+    if (state->nr_cbufs == 1 && state->cbufs[0].texture &&
+        state->cbufs[0].format == PIPE_FORMAT_R8G8B8A8_UNORM &&
+        util_format_is_compressed(state->cbufs[0].texture->format)) {
+        struct pipe_framebuffer_state *fb =
+            (struct pipe_framebuffer_state*)r300->fb_state.state;
+        const struct util_format_description *desc =
+            util_format_description(state->cbufs[0].texture->format);
+        unsigned width = u_minify(state->cbufs[0].texture->width0,
+                                  state->cbufs[0].level);
+
+        assert(desc->block.width == 4 && desc->block.height == 4);
+
+        /* Each 64-bit DXT block is 2x1 pixels, and each 128-bit DXT
+         * block is 4x1 pixels when blitting.
+         */
+        width = align(width, 4); /* align to the DXT block width. */
+        if (desc->block.bits == 64)
+           width = DIV_ROUND_UP(width, 2);
+
+        fb->width = width;
+    }
+
    /* Remove trailing NULL colorbuffers. */
    while (current_state->nr_cbufs && !current_state->cbufs[current_state->nr_cbufs-1].texture)
        current_state->nr_cbufs--;
--- a/src/gallium/drivers/r600/r600_pipe_common.c
+++ b/src/gallium/drivers/r600/r600_pipe_common.c
@ -201,6 +201,7 @@ void r600_draw_rectangle(struct blitter_context *blitter,
 	rctx->b.set_vertex_buffers(&rctx->b, 1, &vbuffer);
 	util_draw_arrays_instanced(&rctx->b, R600_PRIM_RECTANGLE_LIST, 0, 3,
 				   0, num_instances);
+	pipe_resource_reference(&buf, NULL);
 }

 static void r600_dma_emit_wait_idle(struct r600_common_context *rctx)
--- a/src/gallium/drivers/r600/r600_texture.c
+++ b/src/gallium/drivers/r600/r600_texture.c
@ -14,6 +14,7 @@
 #include "util/u_memory.h"
 #include "util/u_pack_color.h"
 #include "util/u_surface.h"
+#include "util/u_resource.h"
 #include "util/os_time.h"
 #include "frontend/winsys_handle.h"
 #include <errno.h>
@ -442,7 +443,7 @@ static bool r600_texture_get_param(struct pipe_screen *screen,

 	switch (param) {
 	case PIPE_RESOURCE_PARAM_NPLANES:
-		*value = 1;
+		*value = util_resource_num(resource);
 		return true;

 	case PIPE_RESOURCE_PARAM_STRIDE:
--- a/src/gallium/drivers/r600/sfn/sfn_instr_alugroup.cpp
+++ b/src/gallium/drivers/r600/sfn/sfn_instr_alugroup.cpp
@ -20,6 +20,16 @@ AluGroup::AluGroup()
   m_free_slots = has_t() ? 0x1f : 0xf;
 }

+void
+AluGroup::apply_add_instr(AluInstr *instr)
+{
+   instr->set_parent_group(this);
+   instr->pin_dest_to_chan();
+   m_has_kill_op |= instr->is_kill();
+   m_has_pred_update |= instr->has_alu_flag(alu_update_exec);
+   assert(!(m_has_kill_op && m_has_pred_update));
+}
+
 bool
 AluGroup::add_instruction(AluInstr *instr)
 {
@ -32,17 +42,13 @@ AluGroup::add_instruction(AluInstr *instr)
      ASSERTED auto opinfo = alu_ops.find(instr->opcode());
      assert(opinfo->second.can_channel(AluOp::t, s_chip_class));
      if (add_trans_instructions(instr)) {
-         instr->set_parent_group(this);
-         instr->pin_dest_to_chan();
-         m_has_kill_op |= instr->is_kill();
+         apply_add_instr(instr);
         return true;
      }
   }

   if (add_vec_instructions(instr) && !instr->has_alu_flag(alu_is_trans)) {
-      instr->set_parent_group(this);
-      instr->pin_dest_to_chan();
-      m_has_kill_op |= instr->is_kill();
+      apply_add_instr(instr);
      return true;
   }

@ -51,9 +57,7 @@ AluGroup::add_instruction(AluInstr *instr)

   if (s_max_slots > 4 && opinfo->second.can_channel(AluOp::t, s_chip_class) &&
       add_trans_instructions(instr)) {
-      instr->set_parent_group(this);
-      instr->pin_dest_to_chan();
-      m_has_kill_op |= instr->is_kill();
+      apply_add_instr(instr);
      return true;
   }

@ -128,6 +132,8 @@ AluGroup::add_trans_instructions(AluInstr *instr)
          * make sure the corresponding vector channel is used */
         assert(instr->has_alu_flag(alu_is_trans) || m_slots[instr->dest_chan()]);
         m_has_kill_op |= instr->is_kill();
+         m_has_pred_update |= instr->has_alu_flag(alu_update_exec);
+
         m_slot_assignemnt_order[m_next_slot_assignemnt++] = 4;
         return true;
      }
@ -170,17 +176,12 @@ AluGroup::add_vec_instructions(AluInstr *instr)
   if (!m_slots[preferred_chan]) {
      if (instr->bank_swizzle() != alu_vec_unknown) {
         if (try_readport(instr, instr->bank_swizzle())) {
-            m_has_kill_op |= instr->is_kill();
-            m_slot_assignemnt_order[m_next_slot_assignemnt++] = preferred_chan;
            return true;
         }
      } else {
         for (AluBankSwizzle i = alu_vec_012; i != alu_vec_unknown; ++i) {
-            if (try_readport(instr, i)) {
-               m_has_kill_op |= instr->is_kill();
-               m_slot_assignemnt_order[m_next_slot_assignemnt++] = preferred_chan;
+            if (try_readport(instr, i))
               return true;
-            }
         }
      }
   } else {
@ -209,18 +210,12 @@ AluGroup::add_vec_instructions(AluInstr *instr)
            sfn_log << SfnLog::schedule << "V: Try force channel " << free_chan << "\n";
            dest->set_chan(free_chan);
            if (instr->bank_swizzle() != alu_vec_unknown) {
-               if (try_readport(instr, instr->bank_swizzle())) {
-                  m_has_kill_op |= instr->is_kill();
-                  m_slot_assignemnt_order[m_next_slot_assignemnt++] = free_chan;
+               if (try_readport(instr, instr->bank_swizzle()))
                  return true;
-               }
            } else {
               for (AluBankSwizzle i = alu_vec_012; i != alu_vec_unknown; ++i) {
-                  if (try_readport(instr, i)) {
-                     m_has_kill_op |= instr->is_kill();
-                     m_slot_assignemnt_order[m_next_slot_assignemnt++] = free_chan;
+                  if (try_readport(instr, i))
                     return true;
-                  }
               }
            }
         }
@ -318,6 +313,9 @@ AluGroup::try_readport(AluInstr *instr, AluBankSwizzle cycle)
         else if (dest->pin() == pin_group)
            dest->set_pin(pin_chgr);
      }
+      m_has_kill_op |= instr->is_kill();
+      m_has_pred_update |= instr->has_alu_flag(alu_update_exec);
+      m_slot_assignemnt_order[m_next_slot_assignemnt++] = preferred_chan;
      return true;
   }
   return false;
--- a/src/gallium/drivers/r600/sfn/sfn_instr_alugroup.h
+++ b/src/gallium/drivers/r600/sfn/sfn_instr_alugroup.h
@ -21,6 +21,7 @@ public:
   using iterator = Slots::iterator;
   using const_iterator = Slots::const_iterator;

+   void extracted(AluInstr *& instr);
   bool add_instruction(AluInstr *instr);
   bool add_trans_instructions(AluInstr *instr);
   bool add_vec_instructions(AluInstr *instr);
@ -82,6 +83,7 @@ public:

   bool addr_for_src() const { return m_addr_for_src; }
   bool has_kill_op() const { return m_has_kill_op; }
+   bool has_update_exec() const { return m_has_pred_update; }

   void set_origin(AluInstr *o) { m_origin = o;}

@ -100,6 +102,8 @@ private:
   bool update_indirect_access(AluInstr *instr);
   bool try_readport(AluInstr *instr, AluBankSwizzle cycle);

+   void apply_add_instr(AluInstr *instr);
+
   Slots m_slots;
   uint8_t m_next_slot_assignemnt{0};
   std::array<int8_t, 5> m_slot_assignemnt_order{-1, -1, -1, -1, -1};
@ -119,6 +123,7 @@ private:
   bool m_addr_is_index{false};
   bool m_addr_for_src{false};
   bool m_has_kill_op{false};
+   bool m_has_pred_update{false};
   AluInstr *m_origin{nullptr};

   uint8_t m_free_slots;
--- a/src/gallium/drivers/r600/sfn/sfn_scheduler.cpp
+++ b/src/gallium/drivers/r600/sfn/sfn_scheduler.cpp
@ -869,8 +869,8 @@ BlockScheduler::schedule_alu_to_group_vec(AluGroup *group)
   bool success = false;
   auto i = alu_vec_ready.begin();
   auto e = alu_vec_ready.end();
-   bool group_has_kill = false;
-   bool group_has_update_pred = false;
+   bool group_has_kill = group->has_kill_op();
+   bool group_has_update_pred = group->has_update_exec();
   while (i != e) {
      sfn_log << SfnLog::schedule << "Try schedule to vec " << **i;

@ -945,6 +945,7 @@ BlockScheduler::schedule_alu_to_group_vec(AluGroup *group)
         success = true;

         group_has_kill |= is_kill;
+         group_has_update_pred |= does_update_pred;

         sfn_log << SfnLog::schedule << " success\n";
      } else {
@ -965,8 +966,20 @@ BlockScheduler::schedule_alu_multislot_to_group_vec(AluGroup *group, ValueFactor
   auto i = alu_multi_slot_ready.begin();
   auto e = alu_multi_slot_ready.end();

+   bool group_has_kill = group->has_kill_op();
+
   while (i != e && util_bitcount(group->free_slot_mask()) > 1) {

+      /* A kill instruction and a predicate update in the same
+       * group don't mix well, so skip adding a predicate changing
+       * multi-slot op if we already have a kill. (There are no
+       * multi-slot kill ops).
+       */
+      if (group_has_kill && (*i)->has_alu_flag(alu_update_exec)) {
+         ++i;
+         continue;
+      }
+
      auto dest = (*i)->dest();

      bool can_merge = false;
@ -1038,6 +1051,10 @@ BlockScheduler::schedule_alu_to_group_trans(AluGroup *group,
   bool success = false;
   auto i = readylist.begin();
   auto e = readylist.end();
+
+   bool group_has_kill = group->has_kill_op();
+   bool group_has_update_pred = group->has_update_exec();
+
   while (i != e) {

      if (check_array_reads(**i)) {
@ -1052,6 +1069,12 @@ BlockScheduler::schedule_alu_to_group_trans(AluGroup *group,
         continue;
      }

+      if ((group_has_kill && (*i)->has_alu_flag(alu_update_exec)) ||
+          (group_has_update_pred && (*i)->is_kill())) {
+         ++i;
+         continue;
+      }
+
      if (group->add_trans_instructions(*i)) {
         (*i)->pin_dest_to_chan();
         auto old_i = i;
--- a/src/gallium/drivers/r600/sfn/sfn_split_address_loads.cpp
+++ b/src/gallium/drivers/r600/sfn/sfn_split_address_loads.cpp
@ -88,8 +88,10 @@ class CollectDeps : public ConstRegisterVisitor {
 public:
   void visit(const Register& r) override
   {
-      for (auto p : r.parents())
-         add_dep(p);
+      for (auto p : r.parents()) {
+         if (instr->block_id() == p->block_id() && instr->index() < p->index())
+            add_dep(p);
+      }
   }
   void visit(const LocalArray& value) override {(void)value; UNREACHABLE("Array is not a value");}
   void visit(const LocalArrayValue& r) override
--- a/src/gallium/drivers/r600/sfn/tests/meson.build
+++ b/src/gallium/drivers/r600/sfn/tests/meson.build
@ -22,7 +22,7 @@ r600_test_dep = declare_dependency(

 if with_tests
   foreach t : ['valuefactory', 'value', 'instr', 'instrfromstring', 'liverange',
-                'optimizer', 'shaderfromstring', 'split_address_loads' ]
+                'optimizer', 'regression', 'shaderfromstring', 'split_address_loads' ]
   test(
       t,
       executable('test-@0@-r600-sfn'.format(t),
--- a/src/gallium/drivers/r600/sfn/tests/sfn_regression_test.cpp
+++ b/src/gallium/drivers/r600/sfn/tests/sfn_regression_test.cpp
@ -0,0 +1,65 @@
+#include "sfn_test_shaders.h"
+
+#include "../sfn_optimizer.h"
+#include "../sfn_ra.h"
+#include "../sfn_scheduler.h"
+#include "../sfn_shader.h"
+#include "../sfn_split_address_loads.h"
+
+using namespace r600;
+using std::ostringstream;
+
+TEST_F(TestShaderFromNir, CombineRegisterToTexSrc)
+{
+   const char *shader_input =
+      R"(VS
+CHIPCLASS EVERGREEN
+INPUT LOC:0
+INPUT LOC:1
+OUTPUT LOC:0 VARYING_SLOT:0 MASK:15
+OUTPUT LOC:1 VARYING_SLOT:32 MASK:3
+REGISTERS R1.xyzw R2.xyzw R6.y R7.x R8.y R9.x R10.y
+ARRAYS A3[2].zw
+SHADER
+BLOCK_START
+ALU MOV R10.y@free : R2.x@fully{sb} {W}
+ALU MOV R9.x@free : R2.y@fully{sb} {W}
+ALU MOV R8.y@free : R2.z@fully{sb} {W}
+ALU MOV R7.x@free : R2.w@fully{sb} {W}
+ALU MOV R6.y@free : I[0] {W}
+LOOP_BEGIN
+BLOCK_END
+BLOCK_START
+IF (( ALU PRED_SETGE_INT __.x : R6.y@free KC0[0].x {LEP} PUSH_BEFORE ))
+BLOCK_END
+BLOCK_START
+BREAK
+BLOCK_END
+BLOCK_START
+ENDIF
+BLOCK_END
+BLOCK_START
+ALU INT_TO_FLT CLAMP S22.y@free{s} : R6.y@free {W}
+ALU TRUNC S24.w@free{s} : S22.y@free{s} {W}
+ALU FLT_TO_INT S25.x@free{s} : S24.w@free{s} {W}
+ALU MOV A3[S25.x@free].z : R10.y@free {W}
+ALU MOV A3[S25.x@free].w : R9.x@free {W}
+ALU MUL_IEEE R5.x@free : R9.x@free I[0.5] {W}
+ALU MUL_IEEE R9.x@free : R8.y@free I[0.5] {W}
+ALU MUL_IEEE R8.y@free : R7.x@free I[0.5] {W}
+ALU MUL_IEEE R7.x@free : R10.y@free I[0.5] {W}
+ALU ADD_INT R6.y@free : R6.y@free I[1] {W}
+ALU MOV R10.y@free : R5.x@free {W}
+LOOP_END
+BLOCK_END
+BLOCK_START
+ALU ADD S47.z@group{s} : A3[0].z A3[1].z {W}
+ALU ADD S47.x@group{s} : A3[0].w A3[1].w {W}
+EXPORT_DONE PARAM 0 S47.zx__
+EXPORT_DONE POS 0 R1.xyzw
+BLOCK_END)";
+
+   auto sh = from_string(shader_input);
+   split_address_loads(*sh);
+   schedule(sh);
+}
--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
@ -56,6 +56,19 @@ static bool si_update_shaders(struct si_context *sctx)
   struct si_shader *old_ps = sctx->shader.ps.current;
   int r;

+   if (GFX_VERSION >= GFX9) {
+      /* For merged shaders, mark the next shader as dirty so its previous_stage is updated. */
+      if (is_vs_state_changed) {
+         if (HAS_TESS) {
+            is_tess_state_changed = true;
+         } else if (HAS_GS) {
+            is_gs_state_changed = true;
+         }
+      }
+      if ((sctx->dirty_shaders_mask & BITFIELD_BIT(MESA_SHADER_TESS_EVAL)) && HAS_GS && HAS_TESS)
+         is_gs_state_changed = true;
+   }
+
   /* Update TCS and TES. */
   if (HAS_TESS && is_tess_state_changed) {
      if (!sctx->has_tessellation) {
--- a/src/gallium/drivers/v3d/v3d_blit.c
+++ b/src/gallium/drivers/v3d/v3d_blit.c
@ -690,6 +690,7 @@ v3d_get_sand8_fs(struct pipe_context *pctx, int cpp)
                nir_variable_create(b.shader, nir_var_shader_out,
                                    vec4, "f_color");
        color_out->data.location = FRAG_RESULT_COLOR;
+        b.shader->info.outputs_written |= BITFIELD_BIT(FRAG_RESULT_COLOR);

        nir_variable *pos_in =
                nir_variable_create(b.shader, nir_var_shader_in, vec4, "pos");
@ -998,6 +999,7 @@ v3d_get_sand30_fs(struct pipe_context *pctx)
                                                      nir_var_shader_out,
                                                      glsl_uvec4, "f_color");
        color_out->data.location = FRAG_RESULT_COLOR;
+        b.shader->info.outputs_written |= BITFIELD_BIT(FRAG_RESULT_COLOR);

        nir_variable *pos_in =
                nir_variable_create(b.shader, nir_var_shader_in, vec4, "pos");
--- a/src/gallium/drivers/zink/ci/zink-anv-adl-validation-settings.txt
+++ b/src/gallium/drivers/zink/ci/zink-anv-adl-validation-settings.txt
@ -1,6 +1,6 @@
 # Please include a comment with the log message and a testcase triggering each
 # VUID at the bottom of the file.
-khronos_validation.message_id_filter = VUID-VkPhysicalDeviceProperties2-pNext-pNext,VUID-VkDeviceCreateInfo-pNext-pNext,VUID-RuntimeSpirv-Location-06272,VUID-vkCmdDrawMultiIndexedEXT-format-07753,VUID-RuntimeSpirv-OpEntryPoint-08743,VUID-vkCmdDrawMultiIndexedEXT-None-10909,VUID-vkDestroyDevice-device-05137,VUID-vkCmdDrawMultiEXT-None-08879,VUID-VkShaderCreateInfoEXT-pSetLayouts-parameter,VUID-vkCmdDrawMultiIndexedEXT-None-08879,VUID-VkPhysicalDeviceMeshShaderFeaturesEXT-primitiveFragmentShadingRateMeshShader-07033
+khronos_validation.message_id_filter = VUID-VkPhysicalDeviceProperties2-pNext-pNext,VUID-VkDeviceCreateInfo-pNext-pNext,VUID-RuntimeSpirv-Location-06272,VUID-vkCmdDrawMultiIndexedEXT-format-07753,VUID-RuntimeSpirv-OpEntryPoint-08743,VUID-vkCmdDrawMultiIndexedEXT-None-10909,VUID-vkDestroyDevice-device-05137,VUID-vkCmdDrawMultiEXT-None-08879,VUID-VkShaderCreateInfoEXT-pSetLayouts-parameter,VUID-vkCmdDrawMultiIndexedEXT-None-08879
 khronos_validation.report_flags = error
 khronos_validation.debug_action = VK_DBG_LAYER_ACTION_LOG_MSG,VK_DBG_LAYER_ACTION_BREAK
 VK_LAYER_ENABLES=VK_VALIDATION_FEATURE_ENABLE_DEBUG_PRINTF_EXT
--- a/src/gallium/drivers/zink/ci/zink-anv-cml-validation-settings.txt
+++ b/src/gallium/drivers/zink/ci/zink-anv-cml-validation-settings.txt
@ -1,6 +1,6 @@
 # Please include a comment with the log message and a testcase triggering each
 # VUID at the bottom of the file.
-khronos_validation.message_id_filter = VUID-VkPhysicalDeviceProperties2-pNext-pNext,VUID-VkDeviceCreateInfo-pNext-pNext,VUID-vkCmdDrawMultiIndexedEXT-None-10909,VUID-VkPhysicalDeviceMeshShaderFeaturesEXT-primitiveFragmentShadingRateMeshShader-07033
+khronos_validation.message_id_filter = VUID-VkPhysicalDeviceProperties2-pNext-pNext,VUID-VkDeviceCreateInfo-pNext-pNext,VUID-vkCmdDrawMultiIndexedEXT-None-10909
 khronos_validation.report_flags = error
 khronos_validation.debug_action = VK_DBG_LAYER_ACTION_LOG_MSG,VK_DBG_LAYER_ACTION_BREAK
 VK_LAYER_ENABLES=VK_VALIDATION_FEATURE_ENABLE_DEBUG_PRINTF_EXT
--- a/src/gallium/drivers/zink/ci/zink-anv-tgl-validation-settings.txt
+++ b/src/gallium/drivers/zink/ci/zink-anv-tgl-validation-settings.txt
@ -1,6 +1,6 @@
 # Please include a comment with the log message and a testcase triggering each
 # VUID at the bottom of the file.
-khronos_validation.message_id_filter = VUID-VkPhysicalDeviceProperties2-pNext-pNext,VUID-VkDeviceCreateInfo-pNext-pNext,VUID-RuntimeSpirv-Location-06272,VUID-vkCmdDrawMultiIndexedEXT-format-07753,VUID-RuntimeSpirv-OpEntryPoint-08743,VUID-vkCmdDrawMultiIndexedEXT-None-10909,VUID-vkDestroyDevice-device-05137,VUID-VkPhysicalDeviceMeshShaderFeaturesEXT-primitiveFragmentShadingRateMeshShader-07033
+khronos_validation.message_id_filter = VUID-VkPhysicalDeviceProperties2-pNext-pNext,VUID-VkDeviceCreateInfo-pNext-pNext,VUID-RuntimeSpirv-Location-06272,VUID-vkCmdDrawMultiIndexedEXT-format-07753,VUID-RuntimeSpirv-OpEntryPoint-08743,VUID-vkCmdDrawMultiIndexedEXT-None-10909,VUID-vkDestroyDevice-device-05137
 khronos_validation.report_flags = error
 khronos_validation.debug_action = VK_DBG_LAYER_ACTION_LOG_MSG,VK_DBG_LAYER_ACTION_BREAK
 VK_LAYER_ENABLES=VK_VALIDATION_FEATURE_ENABLE_DEBUG_PRINTF_EXT
--- a/src/gallium/drivers/zink/ci/zink-lavapipe-validation-settings.txt
+++ b/src/gallium/drivers/zink/ci/zink-lavapipe-validation-settings.txt
@ -1,6 +1,6 @@
 # Please include a comment with the log message and a testcase triggering each
 # VUID at the bottom of the file.
-khronos_validation.message_id_filter = VUID-VkPhysicalDeviceProperties2-pNext-pNext,VUID-VkDeviceCreateInfo-pNext-pNext,VUID-RuntimeSpirv-Location-06272,VUID-vkCmdDrawMultiEXT-None-02699,VUID-RuntimeSpirv-OpEntryPoint-08743,VUID-vkCmdPipelineBarrier2-shaderTileImageColorReadAccess-08718,VUID-VkGraphicsPipelineCreateInfo-flags-06482,VUID-vkCmdPipelineBarrier2-None-08719,VUID-vkCmdDrawMultiEXT-rasterizationSamples-07474,VUID-vkDestroyDevice-device-05137,VUID-VkRectLayerKHR-offset-04864,VUID-vkAcquireNextImageKHR-semaphore-01779,VUID-vkQueueSubmit-pSignalSemaphores-00067,VUID-VkImageMemoryBarrier2-srcAccessMask-07454,UNASSIGNED-GeneralParameterError-RequiredHandle,VUID-VkImageMemoryBarrier2-image-parameter,VUID-vkCmdDrawMultiIndexedEXT-None-10909,VUID-VkPhysicalDeviceMeshShaderFeaturesEXT-primitiveFragmentShadingRateMeshShader-07033
+khronos_validation.message_id_filter = VUID-VkPhysicalDeviceProperties2-pNext-pNext,VUID-VkDeviceCreateInfo-pNext-pNext,VUID-RuntimeSpirv-Location-06272,VUID-vkCmdDrawMultiEXT-None-02699,VUID-RuntimeSpirv-OpEntryPoint-08743,VUID-vkCmdPipelineBarrier2-shaderTileImageColorReadAccess-08718,VUID-VkGraphicsPipelineCreateInfo-flags-06482,VUID-vkCmdPipelineBarrier2-None-08719,VUID-vkCmdDrawMultiEXT-rasterizationSamples-07474,VUID-vkDestroyDevice-device-05137,VUID-VkRectLayerKHR-offset-04864,VUID-vkAcquireNextImageKHR-semaphore-01779,VUID-vkQueueSubmit-pSignalSemaphores-00067,VUID-VkImageMemoryBarrier2-srcAccessMask-07454,UNASSIGNED-GeneralParameterError-RequiredHandle,VUID-VkImageMemoryBarrier2-image-parameter,VUID-vkCmdDrawMultiIndexedEXT-None-10909
 khronos_validation.report_flags = error
 khronos_validation.debug_action = VK_DBG_LAYER_ACTION_LOG_MSG,VK_DBG_LAYER_ACTION_BREAK
 VK_LAYER_ENABLES=VK_VALIDATION_FEATURE_ENABLE_DEBUG_PRINTF_EXT
--- a/src/gallium/drivers/zink/ci/zink-radv-cezanne-validation-settings.txt
+++ b/src/gallium/drivers/zink/ci/zink-radv-cezanne-validation-settings.txt
@ -1,7 +1,7 @@
 # Please include a comment with the log message and a testcase triggering each
 # VUID at the bottom of the file.
 #khronos_validation.message_id_filter = VUID-RuntimeSpirv-Location-06272,VUID-StandaloneSpirv-OpEntryPoint-08721,VUID-vkCmdDrawMultiEXT-dynamicPrimitiveTopologyUnrestricted-07500,VUID-vkCmdDrawMultiEXT-None-08879,VUID-vkCmdDrawMultiIndexedEXT-dynamicPrimitiveTopologyUnrestricted-07500,VUID-vkCmdDrawMultiIndexedEXT-None-08879,VUID-vkDestroyDevice-device-05137,VUID-vkQueueSubmit-pCommandBuffers-00065,VUID-VkShaderCreateInfoEXT-pCode-08737
-khronos_validation.message_id_filter = VUID-VkPhysicalDeviceProperties2-pNext-pNext,VUID-VkDeviceCreateInfo-pNext-pNext,VUID-RuntimeSpirv-Location-06272,VUID-RuntimeSpirv-OpEntryPoint-08743,VUID-StandaloneSpirv-OpEntryPoint-08721,VUID-vkCmdDrawMultiEXT-dynamicRenderingUnusedAttachments-08911,VUID-vkCmdDrawMultiIndexedEXT-None-08879,VUID-vkDestroyDevice-device-05137,VUID-vkQueueSubmit-pCommandBuffers-00065,VUID-VkShaderCreateInfoEXT-pCode-08737,VUID-vkCmdDrawMultiEXT-None-08879,VUID-VkShaderCreateInfoEXT-pSetLayouts-parameter,VUID-vkCmdDrawMultiIndexedEXT-None-10909,UNASSIGNED-Draw-primitiveTopologyPatchListRestart,VUID-VkPhysicalDeviceMeshShaderFeaturesEXT-primitiveFragmentShadingRateMeshShader-07033
+khronos_validation.message_id_filter = VUID-VkPhysicalDeviceProperties2-pNext-pNext,VUID-VkDeviceCreateInfo-pNext-pNext,VUID-RuntimeSpirv-Location-06272,VUID-RuntimeSpirv-OpEntryPoint-08743,VUID-StandaloneSpirv-OpEntryPoint-08721,VUID-vkCmdDrawMultiEXT-dynamicRenderingUnusedAttachments-08911,VUID-vkCmdDrawMultiIndexedEXT-None-08879,VUID-vkDestroyDevice-device-05137,VUID-vkQueueSubmit-pCommandBuffers-00065,VUID-VkShaderCreateInfoEXT-pCode-08737,VUID-vkCmdDrawMultiEXT-None-08879,VUID-VkShaderCreateInfoEXT-pSetLayouts-parameter,VUID-vkCmdDrawMultiIndexedEXT-None-10909,UNASSIGNED-Draw-primitiveTopologyPatchListRestart
 khronos_validation.report_flags = error
 khronos_validation.debug_action = VK_DBG_LAYER_ACTION_LOG_MSG,VK_DBG_LAYER_ACTION_BREAK
 VK_LAYER_ENABLES=VK_VALIDATION_FEATURE_ENABLE_DEBUG_PRINTF_EXT
--- a/src/gallium/drivers/zink/ci/zink-radv-navi31-validation-settings.txt
+++ b/src/gallium/drivers/zink/ci/zink-radv-navi31-validation-settings.txt
@ -1,7 +1,7 @@
 # Please include a comment with the log message and a testcase triggering each
 # VUID at the bottom of the file.
 #khronos_validation.message_id_filter = VUID-RuntimeSpirv-Location-06272,VUID-StandaloneSpirv-OpEntryPoint-08721,VUID-vkCmdDrawMultiEXT-dynamicPrimitiveTopologyUnrestricted-07500,VUID-vkCmdDrawMultiEXT-None-08879,VUID-vkCmdDrawMultiIndexedEXT-dynamicPrimitiveTopologyUnrestricted-07500,VUID-vkCmdDrawMultiIndexedEXT-None-08879,VUID-vkDestroyDevice-device-05137,VUID-vkQueueSubmit-pCommandBuffers-00065,VUID-VkShaderCreateInfoEXT-pCode-08737
-khronos_validation.message_id_filter = VUID-VkPhysicalDeviceProperties2-pNext-pNext,VUID-VkDeviceCreateInfo-pNext-pNext,VUID-VkRenderingAttachmentInfo-pNext-pNext,VUID-RuntimeSpirv-Location-06272,VUID-RuntimeSpirv-OpEntryPoint-08743,VUID-StandaloneSpirv-OpEntryPoint-08721,VUID-vkCmdDrawMultiEXT-dynamicRenderingUnusedAttachments-08911,VUID-vkCmdDrawMultiIndexedEXT-None-08879,VUID-vkDestroyDevice-device-05137,VUID-vkQueueSubmit-pCommandBuffers-00065,VUID-VkShaderCreateInfoEXT-pCode-08737,VUID-vkCmdDrawMultiEXT-None-08879,VUID-VkShaderCreateInfoEXT-pSetLayouts-parameter,VUID-vkCmdDrawMultiIndexedEXT-None-10909,UNASSIGNED-Draw-primitiveTopologyPatchListRestart,VUID-VkPhysicalDeviceMeshShaderFeaturesEXT-primitiveFragmentShadingRateMeshShader-07033
+khronos_validation.message_id_filter = VUID-VkPhysicalDeviceProperties2-pNext-pNext,VUID-VkDeviceCreateInfo-pNext-pNext,VUID-VkRenderingAttachmentInfo-pNext-pNext,VUID-RuntimeSpirv-Location-06272,VUID-RuntimeSpirv-OpEntryPoint-08743,VUID-StandaloneSpirv-OpEntryPoint-08721,VUID-vkCmdDrawMultiEXT-dynamicRenderingUnusedAttachments-08911,VUID-vkCmdDrawMultiIndexedEXT-None-08879,VUID-vkDestroyDevice-device-05137,VUID-vkQueueSubmit-pCommandBuffers-00065,VUID-VkShaderCreateInfoEXT-pCode-08737,VUID-vkCmdDrawMultiEXT-None-08879,VUID-VkShaderCreateInfoEXT-pSetLayouts-parameter,VUID-vkCmdDrawMultiIndexedEXT-None-10909,UNASSIGNED-Draw-primitiveTopologyPatchListRestart
 khronos_validation.report_flags = error
 khronos_validation.debug_action = VK_DBG_LAYER_ACTION_LOG_MSG,VK_DBG_LAYER_ACTION_BREAK
 VK_LAYER_ENABLES=VK_VALIDATION_FEATURE_ENABLE_DEBUG_PRINTF_EXT
--- a/src/gallium/drivers/zink/ci/zink-radv-vangogh-validation-settings.txt
+++ b/src/gallium/drivers/zink/ci/zink-radv-vangogh-validation-settings.txt
@ -1,7 +1,7 @@
 # Please include a comment with the log message and a testcase triggering each
 # VUID at the bottom of the file.
 #khronos_validation.message_id_filter = VUID-RuntimeSpirv-Location-06272,VUID-StandaloneSpirv-OpEntryPoint-08721,VUID-vkCmdDrawMultiEXT-dynamicPrimitiveTopologyUnrestricted-07500,VUID-vkCmdDrawMultiEXT-None-08879,VUID-vkCmdDrawMultiIndexedEXT-dynamicPrimitiveTopologyUnrestricted-07500,VUID-vkCmdDrawMultiIndexedEXT-None-08879,VUID-vkDestroyDevice-device-05137,VUID-vkQueueSubmit-pCommandBuffers-00065,VUID-VkShaderCreateInfoEXT-pCode-08737
-khronos_validation.message_id_filter = VUID-VkPhysicalDeviceProperties2-pNext-pNext,VUID-VkDeviceCreateInfo-pNext-pNext,VUID-RuntimeSpirv-Location-06272,VUID-RuntimeSpirv-OpEntryPoint-08743,VUID-StandaloneSpirv-OpEntryPoint-08721,VUID-vkCmdDrawMultiEXT-dynamicRenderingUnusedAttachments-08911,VUID-vkCmdDrawMultiIndexedEXT-None-08879,VUID-vkDestroyDevice-device-05137,VUID-vkQueueSubmit-pCommandBuffers-00065,VUID-VkShaderCreateInfoEXT-pCode-08737,VUID-vkCmdDrawMultiEXT-None-08879,VUID-VkShaderCreateInfoEXT-pSetLayouts-parameter,VUID-vkCmdDrawMultiIndexedEXT-None-10909,UNASSIGNED-Draw-primitiveTopologyPatchListRestart,VUID-VkPhysicalDeviceMeshShaderFeaturesEXT-primitiveFragmentShadingRateMeshShader-07033
+khronos_validation.message_id_filter = VUID-VkPhysicalDeviceProperties2-pNext-pNext,VUID-VkDeviceCreateInfo-pNext-pNext,VUID-RuntimeSpirv-Location-06272,VUID-RuntimeSpirv-OpEntryPoint-08743,VUID-StandaloneSpirv-OpEntryPoint-08721,VUID-vkCmdDrawMultiEXT-dynamicRenderingUnusedAttachments-08911,VUID-vkCmdDrawMultiIndexedEXT-None-08879,VUID-vkDestroyDevice-device-05137,VUID-vkQueueSubmit-pCommandBuffers-00065,VUID-VkShaderCreateInfoEXT-pCode-08737,VUID-vkCmdDrawMultiEXT-None-08879,VUID-VkShaderCreateInfoEXT-pSetLayouts-parameter,VUID-vkCmdDrawMultiIndexedEXT-None-10909,UNASSIGNED-Draw-primitiveTopologyPatchListRestart
 khronos_validation.report_flags = error
 khronos_validation.debug_action = VK_DBG_LAYER_ACTION_LOG_MSG,VK_DBG_LAYER_ACTION_BREAK
 VK_LAYER_ENABLES=VK_VALIDATION_FEATURE_ENABLE_DEBUG_PRINTF_EXT
--- a/src/gallium/drivers/zink/zink_context.c
+++ b/src/gallium/drivers/zink/zink_context.c
@ -3368,9 +3368,8 @@ begin_rendering(struct zink_context *ctx, bool check_msaa_expand)
      VK_TRUE,
      ctx->gfx_pipeline_state.rast_samples + 1,
   };
+   ctx->dynamic_fb.info.pNext = ctx->transient_attachments && !ctx->blitting && has_msrtss ? &msrtss : NULL;

-   if (has_msrtss && !ctx->blitting)
-      ctx->dynamic_fb.info.pNext = ctx->transient_attachments ? &msrtss : NULL;
   VKCTX(CmdBeginRendering)(ctx->bs->cmdbuf, &ctx->dynamic_fb.info);
   ctx->in_rp = true;
   return clear_buffers;
--- a/src/gallium/drivers/zink/zink_descriptors.c
+++ b/src/gallium/drivers/zink/zink_descriptors.c
@ -1725,6 +1725,7 @@ zink_descriptors_deinit(struct zink_context *ctx)
      VKSCR(DestroyDescriptorSetLayout)(screen->dev, ctx->dd.push_dsl[0]->layout, NULL);
   if (ctx->dd.push_dsl[1])
      VKSCR(DestroyDescriptorSetLayout)(screen->dev, ctx->dd.push_dsl[1]->layout, NULL);
+   VKSCR(DestroyDescriptorSetLayout)(screen->dev, ctx->dd.old_push_dsl, NULL);
 }

 /* called on screen creation */
@ -1766,7 +1767,8 @@ zink_descriptor_util_init_fbfetch(struct zink_context *ctx)
      return;

   struct zink_screen *screen = zink_screen(ctx->base.screen);
-   VKSCR(DestroyDescriptorSetLayout)(screen->dev, ctx->dd.push_dsl[0]->layout, NULL);
+   /* save this layout; it may be used by programs, and tracking that is extra complexity */
+   ctx->dd.old_push_dsl = ctx->dd.push_dsl[0]->layout;
   //don't free these now, let ralloc free on teardown to avoid invalid access
   //ralloc_free(ctx->dd.push_dsl[0]);
   //ralloc_free(ctx->dd.push_layout_keys[0]);
--- a/src/gallium/drivers/zink/zink_draw.cpp
+++ b/src/gallium/drivers/zink/zink_draw.cpp
@ -270,8 +270,7 @@ update_gfx_pipeline(struct zink_context *ctx, struct zink_batch_state *bs, enum
         pipeline = zink_get_gfx_pipeline<DYNAMIC_STATE, true, false>(ctx, ctx->curr_program, &ctx->gfx_pipeline_state, mode);
      else
         pipeline = zink_get_gfx_pipeline<DYNAMIC_STATE, false, false>(ctx, ctx->curr_program, &ctx->gfx_pipeline_state, mode);
-   }
-   if (pipeline) {
+      assert(pipeline);
      pipeline_changed = prev_pipeline != pipeline || ctx->shobj_draw;
      if (BATCH_CHANGED || pipeline_changed)
         VKCTX(CmdBindPipeline)(bs->cmdbuf, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline);
@ -986,8 +985,7 @@ update_mesh_pipeline(struct zink_context *ctx, struct zink_batch_state *bs)
         pipeline = zink_get_gfx_pipeline<ZINK_DYNAMIC_STATE3, true, true>(ctx, ctx->mesh_program, &ctx->gfx_pipeline_state, MESA_PRIM_COUNT);
      else
         pipeline = zink_get_gfx_pipeline<ZINK_DYNAMIC_STATE3, false, true>(ctx, ctx->mesh_program, &ctx->gfx_pipeline_state, MESA_PRIM_COUNT);
-   }
-   if (pipeline) {
+      assert(pipeline);
      pipeline_changed = prev_pipeline != pipeline || ctx->shobj_draw;
      if (BATCH_CHANGED || pipeline_changed)
         VKCTX(CmdBindPipeline)(bs->cmdbuf, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline);
--- a/src/gallium/drivers/zink/zink_query.c
+++ b/src/gallium/drivers/zink/zink_query.c
@ -119,7 +119,9 @@ pipeline_statistic_convert(enum pipe_statistics_query_index idx)
      [PIPE_STAT_QUERY_PS_INVOCATIONS] = VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT,
      [PIPE_STAT_QUERY_HS_INVOCATIONS] = VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT,
      [PIPE_STAT_QUERY_DS_INVOCATIONS] = VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT,
-      [PIPE_STAT_QUERY_CS_INVOCATIONS] = VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT
+      [PIPE_STAT_QUERY_CS_INVOCATIONS] = VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT,
+      [PIPE_STAT_QUERY_MS_INVOCATIONS] = VK_QUERY_PIPELINE_STATISTIC_MESH_SHADER_INVOCATIONS_BIT_EXT,
+      [PIPE_STAT_QUERY_TS_INVOCATIONS] = VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT,
   };
   assert(idx < ARRAY_SIZE(map));
   return map[idx];
--- a/src/gallium/drivers/zink/zink_screen.c
+++ b/src/gallium/drivers/zink/zink_screen.c
@ -3133,6 +3133,12 @@ init_driver_workarounds(struct zink_screen *screen)
      screen->info.have_EXT_host_image_copy = false;
 }

+static void
+disable_features(struct zink_screen *screen)
+{
+   screen->info.mesh_feats.primitiveFragmentShadingRateMeshShader = false;
+}
+
 static void
 check_hic_shader_read(struct zink_screen *screen)
 {
@ -3513,6 +3519,7 @@ zink_internal_create_screen(const struct pipe_screen_config *config, int64_t dev
   check_hic_shader_read(screen);

   init_driver_workarounds(screen);
+   disable_features(screen);

   screen->dev = zink_create_logical_device(screen);
   if (!screen->dev)
--- a/src/gallium/drivers/zink/zink_types.h
+++ b/src/gallium/drivers/zink/zink_types.h
@ -437,6 +437,7 @@ struct zink_descriptor_data {
   uint8_t state_changed[ZINK_PIPELINE_MAX]; //gfx, compute, mesh
   struct zink_descriptor_layout_key *push_layout_keys[2]; //gfx, compute
   struct zink_descriptor_layout *push_dsl[2]; //gfx, compute
+   VkDescriptorSetLayout old_push_dsl; //the non-fbfetch layout; this can't be destroyed because it may be in use
   VkDescriptorUpdateTemplate push_template[2]; //gfx, compute

   struct zink_descriptor_layout *dummy_dsl;
--- a/src/gallium/frontends/lavapipe/lvp_acceleration_structure.c
+++ b/src/gallium/frontends/lavapipe/lvp_acceleration_structure.c
@ -574,7 +574,7 @@ lvp_encode_as(struct vk_acceleration_structure *dst, VkDeviceAddress intermediat
   /* The BVH exceeds the maximum depth supported by the traversal stack, 
    * flatten the offending parts of the tree.
    */
-   if (max_node_depth >= 24)
+   if (max_node_depth >= (geometry_type == VK_GEOMETRY_TYPE_INSTANCES_KHR ? LVP_MAX_TLAS_DEPTH : LVP_MAX_BLAS_DEPTH))
      lvp_flatten_as(header, ir_box_nodes, root_offset, node_depth, output);

   free(node_depth);
--- a/src/gallium/frontends/lavapipe/lvp_descriptor_set.c
+++ b/src/gallium/frontends/lavapipe/lvp_descriptor_set.c
@ -547,6 +547,7 @@ VKAPI_ATTR void VKAPI_CALL lvp_UpdateDescriptorSets(
                  desc[didx + p].functions = iview->planes[p].image_handle->functions;
               }
            } else {
+               memset(&desc[didx], 0, sizeof(desc[didx]) * bind_layout->stride);
               for (unsigned k = 0; k < bind_layout->stride; k++)
                  desc[didx + k].functions = device->null_image_handle->functions;
            }
@ -577,6 +578,7 @@ VKAPI_ATTR void VKAPI_CALL lvp_UpdateDescriptorSets(
               lp_jit_image_from_pipe(&desc[j].image, &bview->iv);
               desc[j].functions = bview->image_handle->functions;
            } else {
+               memset(&desc[j].image, 0, sizeof(desc[j].image));
               desc[j].functions = device->null_image_handle->functions;
            }
         }
@ -846,6 +848,7 @@ lvp_descriptor_set_update_with_template(VkDevice _device, VkDescriptorSet descri
                  desc[idx + p].functions = iview->planes[p].image_handle->functions;
               }
            } else {
+               memset(&desc[idx], 0, sizeof(desc[idx]) * bind_layout->stride);
               for (unsigned k = 0; k < bind_layout->stride; k++)
                  desc[idx + k].functions = device->null_image_handle->functions;
            }
@ -872,6 +875,7 @@ lvp_descriptor_set_update_with_template(VkDevice _device, VkDescriptorSet descri
               lp_jit_image_from_pipe(&desc[idx].image, &bview->iv);
               desc[idx].functions = bview->image_handle->functions;
            } else {
+               memset(&desc[idx].image, 0, sizeof(desc[idx].image));
               desc[idx].functions = device->null_image_handle->functions;
            }
            break;
@ -1073,8 +1077,9 @@ VKAPI_ATTR void VKAPI_CALL lvp_GetDescriptorEXT(
            desc[p].functions = iview->planes[p].image_handle->functions;
         }
      } else {
-         unsigned plane_count = size / sizeof(struct lp_descriptor);
+         memset(desc, 0, size);

+         unsigned plane_count = size / sizeof(struct lp_descriptor);
         for (unsigned p = 0; p < plane_count; p++)
            desc[p].functions = device->null_image_handle->functions;
      }
@ -1087,6 +1092,7 @@ VKAPI_ATTR void VKAPI_CALL lvp_GetDescriptorEXT(
         lp_jit_bindless_texture_buffer_from_bda(&desc->texture, (void*)(uintptr_t)bda->address);
         desc->functions = get_texture_handle_bda(device, bda->address, bda->range, pformat).functions;
      } else {
+         memset(desc, 0, size);
         desc->functions = device->null_texture_handle->functions;
         desc->texture.sampler_index = 0;
      }
@ -1099,6 +1105,7 @@ VKAPI_ATTR void VKAPI_CALL lvp_GetDescriptorEXT(
         lp_jit_image_buffer_from_bda(&desc->image, (void *)(uintptr_t)bda->address, bda->range, pformat);
         desc->functions = get_image_handle_bda(device, bda->address, bda->range, pformat).functions;
      } else {
+         memset(desc, 0, size);
         desc->functions = device->null_image_handle->functions;
      }
      break;
--- a/src/gallium/frontends/lavapipe/lvp_device.c
+++ b/src/gallium/frontends/lavapipe/lvp_device.c
@ -1271,8 +1271,8 @@ lvp_get_properties(const struct lvp_physical_device *device, struct vk_propertie

      /* VK_KHR_acceleration_structure */
      .maxGeometryCount = (1 << 24) - 1,
-      .maxInstanceCount = (1 << 24) - 1,
-      .maxPrimitiveCount = (1 << 24) - 1,
+      .maxInstanceCount = (1 << LVP_MAX_TLAS_DEPTH) - 1,
+      .maxPrimitiveCount = (1 << LVP_MAX_BLAS_DEPTH) - 1,
      .maxPerStageDescriptorAccelerationStructures = MAX_DESCRIPTORS,
      .maxPerStageDescriptorUpdateAfterBindAccelerationStructures = MAX_DESCRIPTORS,
      .maxDescriptorSetAccelerationStructures = MAX_DESCRIPTORS,
--- a/src/gallium/frontends/lavapipe/lvp_private.h
+++ b/src/gallium/frontends/lavapipe/lvp_private.h
@ -100,13 +100,16 @@ extern "C" {
 #define MAX_DESCRIPTORS 1000000 /* Required by vkd3d-proton */
 #define MAX_PUSH_CONSTANTS_SIZE 256
 #define MAX_PUSH_DESCRIPTORS 32
-#define MAX_DESCRIPTOR_UNIFORM_BLOCK_SIZE 4096
+#define MAX_DESCRIPTOR_UNIFORM_BLOCK_SIZE MAX_DESCRIPTORS
 #define MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BLOCKS 8
 #define MAX_DGC_STREAMS 16
 #define MAX_DGC_TOKENS 16
 /* Currently lavapipe does not support more than 1 image plane */
 #define LVP_MAX_PLANE_COUNT 1

+#define LVP_MAX_TLAS_DEPTH 24
+#define LVP_MAX_BLAS_DEPTH 29
+
 #ifdef _WIN32
 #define lvp_printflike(a, b)
 #else
--- a/src/gallium/frontends/lavapipe/lvp_ray_tracing_pipeline.c
+++ b/src/gallium/frontends/lavapipe/lvp_ray_tracing_pipeline.c
@ -356,7 +356,7 @@ lvp_ray_traversal_state_init(nir_function_impl *impl, struct lvp_ray_traversal_s
   state->current_node = nir_local_variable_create(impl, glsl_uint_type(), "traversal.current_node");
   state->stack_base = nir_local_variable_create(impl, glsl_uint_type(), "traversal.stack_base");
   state->stack_ptr = nir_local_variable_create(impl, glsl_uint_type(), "traversal.stack_ptr");
-   state->stack = nir_local_variable_create(impl, glsl_array_type(glsl_uint_type(), 24 * 2, 0), "traversal.stack");
+   state->stack = nir_local_variable_create(impl, glsl_array_type(glsl_uint_type(), LVP_MAX_TLAS_DEPTH + LVP_MAX_BLAS_DEPTH, 0), "traversal.stack");
   state->hit = nir_local_variable_create(impl, glsl_bool_type(), "traversal.hit");

   state->instance_addr = nir_local_variable_create(impl, glsl_uint64_t_type(), "traversal.instance_addr");
--- a/src/gallium/frontends/lavapipe/nir/lvp_nir_lower_ray_queries.c
+++ b/src/gallium/frontends/lavapipe/nir/lvp_nir_lower_ray_queries.c
@ -171,7 +171,8 @@ init_ray_query_traversal_vars(void *ctx, nir_shader *shader, unsigned array_leng
   result.stack_base =
      rq_variable_create(ctx, shader, array_length, glsl_uint_type(), VAR_NAME("_stack_base"));
   result.stack_ptr = rq_variable_create(ctx, shader, array_length, glsl_uint_type(), VAR_NAME("_stack_ptr"));
-   result.stack = rq_variable_create(ctx, shader, array_length, glsl_array_type(glsl_uint_type(), 24 * 2, 0), VAR_NAME("_stack"));
+   result.stack = rq_variable_create(ctx, shader, array_length,
+                                     glsl_array_type(glsl_uint_type(), LVP_MAX_TLAS_DEPTH + LVP_MAX_BLAS_DEPTH, 0), VAR_NAME("_stack"));
   return result;
 }

--- a/src/gallium/winsys/radeon/drm/meson.build
+++ b/src/gallium/winsys/radeon/drm/meson.build
@ -1,6 +1,14 @@
 # Copyright © 2017 Dylan Baker
 # SPDX-License-Identifier: MIT

+libradeonwinsys_deps = [idep_mesautil, dep_libdrm]
+libradeonwinsys_c_args = []
+
+if with_gallium_radeonsi
+  libradeonwinsys_deps += [idep_amdgfxregs_h]
+  libradeonwinsys_c_args = ['-DHAVE_GALLIUM_RADEONSI']
+endif
+
 libradeonwinsys = static_library(
  'radeonwinsys',
  files('radeon_drm_bo.c',
@ -14,5 +22,6 @@ libradeonwinsys = static_library(
        'radeon_surface.h'),
  include_directories : [inc_src, inc_include, inc_gallium, inc_gallium_aux],
  gnu_symbol_visibility : 'hidden',
-  dependencies : [idep_mesautil, dep_libdrm],
+  c_args : libradeonwinsys_c_args,
+  dependencies : libradeonwinsys_deps,
 )
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
@ -8,6 +8,10 @@
 #include "radeon_drm_bo.h"
 #include "radeon_drm_cs.h"

+#ifdef HAVE_GALLIUM_RADEONSI
+#include "amdgfxregs.h"
+#endif
+
 #include "util/os_file.h"
 #include "util/simple_mtx.h"
 #include "util/thread_sched.h"
@ -105,6 +109,73 @@ static bool radeon_get_drm_value(int fd, unsigned request,
   return true;
 }

+static void get_hs_info(struct radeon_info *info)
+{
+   /* This is the size of all TCS outputs in memory per workgroup.
+    * Hawaii can't handle num_workgroups > 256 with 8K per workgroup, so use 4K.
+    */
+   unsigned max_hs_out_vram_dwords_per_wg = info->family == CHIP_HAWAII ? 4096 : 8192;
+   unsigned max_workgroups_per_se;
+
+#ifdef HAVE_GALLIUM_RADEONSI /* for gfx6+ register definitions */
+   unsigned max_hs_out_vram_dwords_enum = 0;
+
+   switch (max_hs_out_vram_dwords_per_wg) {
+   case 8192:
+      max_hs_out_vram_dwords_enum = V_03093C_X_8K_DWORDS;
+      break;
+   case 4096:
+      max_hs_out_vram_dwords_enum = V_03093C_X_4K_DWORDS;
+      break;
+   case 2048:
+      max_hs_out_vram_dwords_enum = V_03093C_X_2K_DWORDS;
+      break;
+   case 1024:
+      max_hs_out_vram_dwords_enum = V_03093C_X_1K_DWORDS;
+      break;
+   default:
+      UNREACHABLE("invalid TCS workgroup size");
+   }
+#endif
+
+   /* Gfx7 should limit num_workgroups to 508 (127 per SE)
+    * Gfx6 should limit num_workgroups to 126 (63 per SE)
+    */
+   if (info->gfx_level == GFX7) {
+      max_workgroups_per_se = 127;
+   } else {
+      max_workgroups_per_se = 63;
+   }
+
+   /* Limit to 4 workgroups per CU for TCS, which exhausts LDS if each workgroup occupies 16KB.
+    * Note that the offchip allocation isn't deallocated until the corresponding TES waves finish.
+    */
+   unsigned num_offchip_wg_per_cu = 4;
+   unsigned num_workgroups_per_se = MIN2(num_offchip_wg_per_cu * info->max_good_cu_per_sa *
+                                         info->max_sa_per_se, max_workgroups_per_se);
+   unsigned num_workgroups = num_workgroups_per_se * info->max_se;
+
+#ifdef HAVE_GALLIUM_RADEONSI /* for gfx6+ register definitions */
+   if (info->gfx_level == GFX7) {
+      info->hs_offchip_param = S_03093C_OFFCHIP_BUFFERING_GFX7(num_workgroups) |
+                               S_03093C_OFFCHIP_GRANULARITY_GFX7(max_hs_out_vram_dwords_enum);
+   } else {
+      info->hs_offchip_param = S_0089B0_OFFCHIP_BUFFERING(num_workgroups) |
+                               S_0089B0_OFFCHIP_GRANULARITY(max_hs_out_vram_dwords_enum);
+   }
+#endif
+
+   /* The typical size of tess factors of 1 TCS workgroup if all patches are triangles. */
+   unsigned typical_tess_factor_size_per_wg = (192 / 3) * 16;
+   unsigned num_tess_factor_wg_per_cu = 3;
+
+   info->hs_offchip_workgroup_dw_size = max_hs_out_vram_dwords_per_wg;
+   info->tess_offchip_ring_size = num_workgroups * max_hs_out_vram_dwords_per_wg * 4;
+   info->tess_factor_ring_size = typical_tess_factor_size_per_wg * num_tess_factor_wg_per_cu *
+                                 info->max_good_cu_per_sa * info->max_sa_per_se * info->max_se;
+   info->total_tess_ring_size = info->tess_offchip_ring_size + info->tess_factor_ring_size;
+}
+
 /* Helper function to do the ioctls needed for setup and init. */
 static bool do_winsys_init(struct radeon_drm_winsys *ws)
 {
@ -639,6 +710,9 @@ static bool do_winsys_init(struct radeon_drm_winsys *ws)
   default:;
   }

+   if (ws->gen == DRV_SI)
+      get_hs_info(&ws->info);
+
   ws->check_vm = strstr(debug_get_option("R600_DEBUG", ""), "check_vm") != NULL ||
                                                                            strstr(debug_get_option("AMD_DEBUG", ""), "check_vm") != NULL;
   ws->noop_cs = debug_get_bool_option("RADEON_NOOP", false);
--- a/src/intel/compiler/brw/brw_compile_tcs.cpp
+++ b/src/intel/compiler/brw/brw_compile_tcs.cpp
@ -196,6 +196,9 @@ brw_compile_tcs(const struct brw_compiler *compiler,

   brw_prog_data_init(&prog_data->base.base, &params->base);

+   brw_fill_tess_info_from_shader_info(&prog_data->tess_info,
+                                       &nir->info);
+
   nir->info.outputs_written = key->outputs_written;
   nir->info.patch_outputs_written = key->patch_outputs_written;

@ -221,6 +224,7 @@ brw_compile_tcs(const struct brw_compiler *compiler,
      BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);

   prog_data->input_vertices = key->input_vertices;
+   prog_data->output_vertices = nir->info.tess.tcs_vertices_out;
   prog_data->patch_count_threshold = get_patch_count_threshold(key->input_vertices);

   if (compiler->use_tcs_multi_patch) {
--- a/Show more
+++ b/Show more