VERSION: bump for rc3

Signed-off-by: Dylan Baker <dylan.c.baker@intel.com>
spirv: don't set in_bounds for structs
2025-12-30 12:10:09 +01:00 · 2025-10-30 11:35:06 -07:00 · 2025-10-28 12:53:18 -07:00 · 2025-10-28 12:53:18 -07:00 · 2025-10-28 12:53:17 -07:00 · 2025-10-28 12:53:16 -07:00
87 changed files with 6012 additions and 1615 deletions
--- a/.pick_status.json
+++ b/.pick_status.json
--- a/2
+++ b/2
@ -1 +1 @@
-25.3.0-devel
+25.3.0-rc3
--- a/docs/ci/LAVA.rst
+++ b/docs/ci/LAVA.rst
@ -122,9 +122,8 @@ Enable the site and restart nginx:
   # Second download should be cached.
   wget http://localhost/cache/?uri=https://s3.freedesktop.org/mesa-tracie-public/itoral-gl-terrain-demo/demo-v2.trace

-Now, set ``download-url`` in your ``traces-*.yml`` entry to something like
-``http://caching-proxy/cache/?uri=https://s3.freedesktop.org/mesa-tracie-public``
-and you should have cached downloads for traces.  Add it to
-``FDO_HTTP_CACHE_URI=`` in your ``config.toml`` runner environment lines and you
-can use it for cached artifact downloads instead of going all the way to
-freedesktop.org on each job.
+The trace runner script automatically sets the caching proxy, so there's no
+need to modify anything in the Mesa CI YAML files.
+Add ``LAVA_HTTP_CACHE_URI=http://localhost/cache/?uri=`` to your ``config.toml``
+runner environment lines and you can use it for cached artifact downloads
+instead of going all the way to freedesktop.org on each job.
--- a/src/amd/common/nir/ac_nir_lower_mem_access_bit_sizes.c
+++ b/src/amd/common/nir/ac_nir_lower_mem_access_bit_sizes.c
@ -109,23 +109,37 @@ lower_mem_access_cb(nir_intrinsic_op intrin, uint8_t bytes, uint8_t bit_size, ui
   nir_mem_access_size_align res;

   if (intrin == nir_intrinsic_load_shared || intrin == nir_intrinsic_store_shared) {
-      /* Split unsupported shared access. */
-      res.bit_size = MIN2(bit_size, combined_align * 8ull);
-      res.align = res.bit_size / 8;
      /* Don't use >64-bit LDS loads for performance reasons. */
      unsigned max_bytes = intrin == nir_intrinsic_store_shared && cb_data->gfx_level >= GFX7 ? 16 : 8;
      bytes = MIN3(bytes, combined_align, max_bytes);
      bytes = bytes == 12 ? bytes : round_down_to_power_of_2(bytes);
+
+      /* Split unsupported shared access. */
+      res.bit_size = MIN2(bit_size, bytes * 8ull);
+      res.align = res.bit_size / 8;
      res.num_components = bytes / res.align;
      res.shift = nir_mem_access_shift_method_bytealign_amd;
      return res;
   }

+   const bool is_buffer_load = intrin == nir_intrinsic_load_ubo ||
+                               intrin == nir_intrinsic_load_ssbo ||
+                               intrin == nir_intrinsic_load_constant;
+
   if (is_smem) {
+      const bool supported_subdword = cb_data->gfx_level >= GFX12 &&
+                                      intrin != nir_intrinsic_load_push_constant &&
+                                      (!cb_data->use_llvm || intrin != nir_intrinsic_load_ubo);
+
      /* Round up subdword loads if unsupported. */
-      const bool supported_subdword = cb_data->gfx_level >= GFX12 && intrin != nir_intrinsic_load_push_constant;
-      if (bit_size < 32 && (bytes >= 3 || !supported_subdword))
+      if (bytes <= 2 && combined_align % bytes == 0 && supported_subdword) {
+         bit_size = bytes * 8;
+      } else if (bytes % 4 || combined_align % 4) {
+         if (is_buffer_load)
+            bytes += 4 - MIN2(combined_align, 4);
         bytes = align(bytes, 4);
+         bit_size = 32;
+      }

      /* Generally, require an alignment of 4. */
      res.align = MIN2(4, bytes);
@ -138,9 +152,6 @@ lower_mem_access_cb(nir_intrinsic_op intrin, uint8_t bytes, uint8_t bit_size, ui
      if (!util_is_power_of_two_nonzero(bytes) && (cb_data->gfx_level < GFX12 || bytes != 12)) {
         const uint8_t larger = util_next_power_of_two(bytes);
         const uint8_t smaller = larger / 2;
-         const bool is_buffer_load = intrin == nir_intrinsic_load_ubo ||
-                                     intrin == nir_intrinsic_load_ssbo ||
-                                     intrin == nir_intrinsic_load_constant;
         const bool is_aligned = align_mul % smaller == 0;

         /* Overfetch up to 1 dword if this is a bounds-checked buffer load or the access is aligned. */
@ -185,8 +196,8 @@ lower_mem_access_cb(nir_intrinsic_op intrin, uint8_t bytes, uint8_t bit_size, ui

   const uint32_t max_pad = 4 - MIN2(combined_align, 4);

-   /* Global loads don't have bounds checking, so increasing the size might not be safe. */
-   if (intrin == nir_intrinsic_load_global || intrin == nir_intrinsic_load_global_constant) {
+   /* Global/scratch loads don't have bounds checking, so increasing the size might not be safe. */
+   if (!is_buffer_load) {
      if (align_mul < 4) {
         /* If we split the load, only lower it to 32-bit if this is a SMEM load. */
         const unsigned chunk_bytes = align(bytes, 4) - max_pad;
--- a/src/amd/common/nir/ac_nir_lower_ngg_mesh.c
+++ b/src/amd/common/nir/ac_nir_lower_ngg_mesh.c
@ -508,6 +508,8 @@ lower_ms_intrinsic(nir_builder *b, nir_instr *instr, void *state)
      return update_ms_barrier(b, intrin, s);
   case nir_intrinsic_load_workgroup_index:
      return lower_ms_load_workgroup_index(b, intrin, s);
+   case nir_intrinsic_load_num_subgroups:
+      return nir_imm_int(b, DIV_ROUND_UP(s->api_workgroup_size, s->wave_size));
   case nir_intrinsic_set_vertex_and_primitive_count:
      return lower_ms_set_vertex_and_primitive_count(b, intrin, s);
   default:
@ -529,6 +531,7 @@ filter_ms_intrinsic(const nir_instr *instr,
          intrin->intrinsic == nir_intrinsic_store_per_primitive_output ||
          intrin->intrinsic == nir_intrinsic_barrier ||
          intrin->intrinsic == nir_intrinsic_load_workgroup_index ||
+          intrin->intrinsic == nir_intrinsic_load_num_subgroups ||
          intrin->intrinsic == nir_intrinsic_set_vertex_and_primitive_count;
 }

--- a/src/amd/compiler/instruction_selection/aco_select_rt_prolog.cpp
+++ b/src/amd/compiler/instruction_selection/aco_select_rt_prolog.cpp
@ -214,6 +214,8 @@ select_rt_prolog(Program* program, ac_shader_config* config,
   bld.sop2(Builder::s_cselect, Definition(vcc, bld.lm),
            Operand::c32_or_c64(-1u, program->wave_size == 64),
            Operand::c32_or_c64(0, program->wave_size == 64), Operand(scc, s1));
+   bld.sop2(aco_opcode::s_cselect_b32, Definition(out_launch_size_y, s1),
+            Operand(out_launch_size_y, s1), Operand::c32(1), Operand(scc, s1));
   bld.vop2(aco_opcode::v_cndmask_b32, Definition(out_launch_ids[0], v1),
            Operand(tmp_invocation_idx, v1), Operand(out_launch_ids[0], v1), Operand(vcc, bld.lm));
   bld.vop2(aco_opcode::v_cndmask_b32, Definition(out_launch_ids[1], v1), Operand::zero(),
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@ -8114,6 +8114,8 @@ radv_mark_descriptors_dirty(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPo
   struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);

   descriptors_state->dirty |= descriptors_state->valid;
+   if (descriptors_state->dynamic_offset_count)
+      descriptors_state->dirty_dynamic = true;
 }

 static void
@ -8642,7 +8644,6 @@ radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipeline

      if (cmd_buffer->state.compute_pipeline == compute_pipeline)
         return;
-      radv_mark_descriptors_dirty(cmd_buffer, pipelineBindPoint);

      radv_bind_shader(cmd_buffer, compute_pipeline->base.shaders[MESA_SHADER_COMPUTE], MESA_SHADER_COMPUTE);

@ -8656,7 +8657,6 @@ radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipeline

      if (cmd_buffer->state.rt_pipeline == rt_pipeline)
         return;
-      radv_mark_descriptors_dirty(cmd_buffer, pipelineBindPoint);

      radv_bind_shader(cmd_buffer, rt_pipeline->base.base.shaders[MESA_SHADER_INTERSECTION], MESA_SHADER_INTERSECTION);
      radv_bind_rt_prolog(cmd_buffer, rt_pipeline->prolog);
@ -8690,7 +8690,6 @@ radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipeline

      if (cmd_buffer->state.graphics_pipeline == graphics_pipeline)
         return;
-      radv_mark_descriptors_dirty(cmd_buffer, pipelineBindPoint);

      radv_foreach_stage (
         stage, (cmd_buffer->state.active_stages | graphics_pipeline->active_stages) & RADV_GRAPHICS_STAGE_BITS) {
@ -8744,6 +8743,8 @@ radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipeline
   cmd_buffer->descriptors[vk_to_bind_point(pipelineBindPoint)].dynamic_offset_count = pipeline->dynamic_offset_count;
   cmd_buffer->descriptors[vk_to_bind_point(pipelineBindPoint)].need_indirect_descriptors =
      pipeline->need_indirect_descriptors;
+
+   radv_mark_descriptors_dirty(cmd_buffer, pipelineBindPoint);
 }

 VKAPI_ATTR void VKAPI_CALL
--- a/src/amd/vulkan/radv_sqtt.c
+++ b/src/amd/vulkan/radv_sqtt.c
@ -508,7 +508,9 @@ radv_begin_sqtt(struct radv_queue *queue)
      device->sqtt.start_cs[family] = NULL;
   }

-   cs.b = ws->cs_create(ws, radv_queue_ring(queue), false);
+   radv_init_cmd_stream(&cs, radv_queue_ring(queue));
+
+   cs.b = ws->cs_create(ws, cs.hw_ip, false);
   if (!cs.b)
      return false;

@ -585,7 +587,9 @@ radv_end_sqtt(struct radv_queue *queue)
      device->sqtt.stop_cs[family] = NULL;
   }

-   cs.b = ws->cs_create(ws, radv_queue_ring(queue), false);
+   radv_init_cmd_stream(&cs, radv_queue_ring(queue));
+
+   cs.b = ws->cs_create(ws, cs.hw_ip, false);
   if (!cs.b)
      return false;

--- a/src/amd/vulkan/radv_video.c
+++ b/src/amd/vulkan/radv_video.c
@ -819,6 +819,32 @@ radv_GetPhysicalDeviceVideoCapabilitiesKHR(VkPhysicalDevice physicalDevice, cons
   if (cap && !cap->valid)
      cap = NULL;

+   if (cap) {
+      pCapabilities->maxCodedExtent.width = cap->max_width;
+      pCapabilities->maxCodedExtent.height = cap->max_height;
+   } else {
+      switch (pVideoProfile->videoCodecOperation) {
+      case VK_VIDEO_CODEC_OPERATION_DECODE_H264_BIT_KHR:
+         pCapabilities->maxCodedExtent.width = (pdev->info.family < CHIP_TONGA) ? 2048 : 4096;
+         pCapabilities->maxCodedExtent.height = (pdev->info.family < CHIP_TONGA) ? 1152 : 4096;
+         break;
+      case VK_VIDEO_CODEC_OPERATION_DECODE_H265_BIT_KHR:
+         pCapabilities->maxCodedExtent.width =
+            (pdev->info.family < CHIP_RENOIR) ? ((pdev->info.family < CHIP_TONGA) ? 2048 : 4096) : 8192;
+         pCapabilities->maxCodedExtent.height =
+            (pdev->info.family < CHIP_RENOIR) ? ((pdev->info.family < CHIP_TONGA) ? 1152 : 4096) : 4352;
+         break;
+      case VK_VIDEO_CODEC_OPERATION_DECODE_VP9_BIT_KHR:
+         pCapabilities->maxCodedExtent.width =
+            (pdev->info.family < CHIP_RENOIR) ? ((pdev->info.family < CHIP_TONGA) ? 2048 : 4096) : 8192;
+         pCapabilities->maxCodedExtent.height =
+            (pdev->info.family < CHIP_RENOIR) ? ((pdev->info.family < CHIP_TONGA) ? 1152 : 4096) : 4352;
+         break;
+      default:
+         break;
+      }
+   }
+
   pCapabilities->flags = 0;
   pCapabilities->pictureAccessGranularity.width = VK_VIDEO_H264_MACROBLOCK_WIDTH;
   pCapabilities->pictureAccessGranularity.height = VK_VIDEO_H264_MACROBLOCK_HEIGHT;
@ -1126,32 +1152,6 @@ radv_GetPhysicalDeviceVideoCapabilitiesKHR(VkPhysicalDevice physicalDevice, cons
      break;
   }

-   if (cap) {
-      pCapabilities->maxCodedExtent.width = cap->max_width;
-      pCapabilities->maxCodedExtent.height = cap->max_height;
-   } else {
-      switch (pVideoProfile->videoCodecOperation) {
-      case VK_VIDEO_CODEC_OPERATION_DECODE_H264_BIT_KHR:
-         pCapabilities->maxCodedExtent.width = (pdev->info.family < CHIP_TONGA) ? 2048 : 4096;
-         pCapabilities->maxCodedExtent.height = (pdev->info.family < CHIP_TONGA) ? 1152 : 4096;
-         break;
-      case VK_VIDEO_CODEC_OPERATION_DECODE_H265_BIT_KHR:
-         pCapabilities->maxCodedExtent.width =
-            (pdev->info.family < CHIP_RENOIR) ? ((pdev->info.family < CHIP_TONGA) ? 2048 : 4096) : 8192;
-         pCapabilities->maxCodedExtent.height =
-            (pdev->info.family < CHIP_RENOIR) ? ((pdev->info.family < CHIP_TONGA) ? 1152 : 4096) : 4352;
-         break;
-      case VK_VIDEO_CODEC_OPERATION_DECODE_VP9_BIT_KHR:
-         pCapabilities->maxCodedExtent.width =
-            (pdev->info.family < CHIP_RENOIR) ? ((pdev->info.family < CHIP_TONGA) ? 2048 : 4096) : 8192;
-         pCapabilities->maxCodedExtent.height =
-            (pdev->info.family < CHIP_RENOIR) ? ((pdev->info.family < CHIP_TONGA) ? 1152 : 4096) : 4352;
-         break;
-      default:
-         break;
-      }
-   }
-
   return VK_SUCCESS;
 }

--- a/src/amd/vulkan/radv_video_enc.c
+++ b/src/amd/vulkan/radv_video_enc.c
@ -890,7 +890,6 @@ radv_enc_slice_header(struct radv_cmd_buffer *cmd_buffer, const VkVideoEncodeInf
   uint32_t num_bits[RENCODE_SLICE_HEADER_TEMPLATE_MAX_NUM_INSTRUCTIONS] = {0};
   const struct VkVideoEncodeH264PictureInfoKHR *h264_picture_info =
      vk_find_struct_const(enc_info->pNext, VIDEO_ENCODE_H264_PICTURE_INFO_KHR);
-   int slice_count = h264_picture_info->naluSliceEntryCount;
   const StdVideoEncodeH264PictureInfo *pic = h264_picture_info->pStdPictureInfo;
   const StdVideoH264SequenceParameterSet *sps =
      vk_video_find_h264_enc_std_sps(cmd_buffer->video.params, pic->seq_parameter_set_id);
@ -903,8 +902,6 @@ radv_enc_slice_header(struct radv_cmd_buffer *cmd_buffer, const VkVideoEncodeInf
   unsigned int cdw_filled = 0;
   unsigned int bits_copied = 0;

-   assert(slice_count <= 1);
-
   struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
   const struct radv_physical_device *pdev = radv_device_physical(device);
   struct radv_cmd_stream *cs = cmd_buffer->cs;
@ -3166,6 +3163,36 @@ radv_video_patch_encode_session_parameters(struct radv_device *device, struct vk
      }
      break;
   case VK_VIDEO_CODEC_OPERATION_ENCODE_H265_BIT_KHR: {
+      for (unsigned i = 0; i < params->h265_enc.h265_sps_count; i++) {
+         uint32_t pic_width_in_luma_samples =
+             params->h265_enc.h265_sps[i].base.pic_width_in_luma_samples;
+         uint32_t pic_height_in_luma_samples =
+             params->h265_enc.h265_sps[i].base.pic_height_in_luma_samples;
+         uint32_t aligned_pic_width = align(pic_width_in_luma_samples, 64);
+         uint32_t aligned_pic_height = align(pic_height_in_luma_samples, 16);
+
+         /* Override the unaligned pic_{width,height} and make up for it with conformance window
+          * cropping */
+         params->h265_enc.h265_sps[i].base.pic_width_in_luma_samples = aligned_pic_width;
+         params->h265_enc.h265_sps[i].base.pic_height_in_luma_samples = aligned_pic_height;
+
+         if (aligned_pic_width != pic_width_in_luma_samples ||
+             aligned_pic_height != pic_height_in_luma_samples) {
+            params->h265_enc.h265_sps[i].base.flags.conformance_window_flag = 1;
+            params->h265_enc.h265_sps[i].base.conf_win_right_offset +=
+               (aligned_pic_width - pic_width_in_luma_samples) / 2;
+            params->h265_enc.h265_sps[i].base.conf_win_bottom_offset +=
+               (aligned_pic_height - pic_height_in_luma_samples) / 2;
+         }
+
+         /* VCN supports only the following block sizes (resulting in 64x64 CTBs with any coding
+          * block size) */
+         params->h265_enc.h265_sps[i].base.log2_min_luma_coding_block_size_minus3 = 0;
+         params->h265_enc.h265_sps[i].base.log2_diff_max_min_luma_coding_block_size = 3;
+         params->h265_enc.h265_sps[i].base.log2_min_luma_transform_block_size_minus2 = 0;
+         params->h265_enc.h265_sps[i].base.log2_diff_max_min_luma_transform_block_size = 3;
+      }
+
      for (unsigned i = 0; i < params->h265_enc.h265_pps_count; i++) {
         /* cu_qp_delta needs to be enabled if rate control is enabled. VCN2 and newer can also enable
          * it with rate control disabled. Since we don't know what rate control will be used, we
@ -3268,6 +3295,14 @@ radv_GetEncodedVideoSessionParametersKHR(VkDevice device,
         assert(sps);
         char *data_ptr = pData ? (char *)pData + vps_size : NULL;
         vk_video_encode_h265_sps(sps, size_limit, &sps_size, data_ptr);
+
+         if (pFeedbackInfo) {
+            struct VkVideoEncodeH265SessionParametersFeedbackInfoKHR *h265_feedback_info =
+               vk_find_struct(pFeedbackInfo->pNext, VIDEO_ENCODE_H265_SESSION_PARAMETERS_FEEDBACK_INFO_KHR);
+            pFeedbackInfo->hasOverrides = VK_TRUE;
+            if (h265_feedback_info)
+               h265_feedback_info->hasStdSPSOverrides = VK_TRUE;
+         }
      }
      if (h265_get_info->writeStdPPS) {
         const StdVideoH265PictureParameterSet *pps = vk_video_find_h265_enc_std_pps(templ, h265_get_info->stdPPSId);
--- a/src/asahi/ci/asahi-g13g-fails.txt
+++ b/src/asahi/ci/asahi-g13g-fails.txt
--- a/src/asahi/vulkan/hk_cmd_draw.c
+++ b/src/asahi/vulkan/hk_cmd_draw.c
@ -992,28 +992,34 @@ hk_CmdEndRendering(VkCommandBuffer commandBuffer)
   }
 }

+static void
+hk_init_heap(const void *data) {
+   struct hk_cmd_buffer *cmd = (struct hk_cmd_buffer *) data;
+   struct hk_device *dev = hk_cmd_buffer_device(cmd);
+
+   perf_debug(cmd, "Allocating heap");
+
+   size_t size = 128 * 1024 * 1024;
+   dev->heap = agx_bo_create(&dev->dev, size, 0, 0, "Geometry heap");
+
+   /* The geometry state buffer is initialized here and then is treated by
+    * the CPU as rodata, even though the GPU uses it for scratch internally.
+    */
+   off_t off = dev->rodata.heap - dev->rodata.bo->va->addr;
+   struct agx_heap *map = agx_bo_map(dev->rodata.bo) + off;
+
+   *map = (struct agx_heap){
+      .base = dev->heap->va->addr,
+      .size = size,
+   };
+}
+
 static uint64_t
 hk_heap(struct hk_cmd_buffer *cmd)
 {
   struct hk_device *dev = hk_cmd_buffer_device(cmd);

-   if (unlikely(!dev->heap)) {
-      perf_debug(cmd, "Allocating heap");
-
-      size_t size = 128 * 1024 * 1024;
-      dev->heap = agx_bo_create(&dev->dev, size, 0, 0, "Geometry heap");
-
-      /* The geometry state buffer is initialized here and then is treated by
-       * the CPU as rodata, even though the GPU uses it for scratch internally.
-       */
-      off_t off = dev->rodata.heap - dev->rodata.bo->va->addr;
-      struct agx_heap *map = agx_bo_map(dev->rodata.bo) + off;
-
-      *map = (struct agx_heap){
-         .base = dev->heap->va->addr,
-         .size = size,
-      };
-   }
+   util_call_once_data(&dev->heap_init_once, hk_init_heap, cmd);

   /* We need to free all allocations after each command buffer execution */
   if (!cmd->uses_heap) {
--- a/src/asahi/vulkan/hk_descriptor_set_layout.c
+++ b/src/asahi/vulkan/hk_descriptor_set_layout.c
@ -330,6 +330,7 @@ hk_GetDescriptorSetLayoutSupport(
   uint64_t non_variable_size = 0;
   uint32_t variable_stride = 0;
   uint32_t variable_count = 0;
+   bool variable_is_inline_uniform_block = false;
   uint8_t dynamic_buffer_count = 0;

   for (uint32_t i = 0; i < pCreateInfo->bindingCount; i++) {
@ -362,6 +363,10 @@ hk_GetDescriptorSetLayoutSupport(
             */
            variable_count = MAX2(1, binding->descriptorCount);
            variable_stride = stride;
+
+            variable_is_inline_uniform_block =
+               binding->descriptorType ==
+               VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK;
         } else {
            /* Since we're aligning to the maximum and since this is just a
             * check for whether or not the max buffer size is big enough, we
@ -393,12 +398,21 @@ hk_GetDescriptorSetLayoutSupport(
      switch (ext->sType) {
      case VK_STRUCTURE_TYPE_DESCRIPTOR_SET_VARIABLE_DESCRIPTOR_COUNT_LAYOUT_SUPPORT: {
         VkDescriptorSetVariableDescriptorCountLayoutSupport *vs = (void *)ext;
+         uint32_t max_var_count;
+
         if (variable_stride > 0) {
-            vs->maxVariableDescriptorCount =
+            max_var_count =
               (max_buffer_size - non_variable_size) / variable_stride;
         } else {
-            vs->maxVariableDescriptorCount = 0;
+            max_var_count = 0;
         }
+
+         if (variable_is_inline_uniform_block) {
+            max_var_count =
+               MIN2(max_var_count, HK_MAX_INLINE_UNIFORM_BLOCK_SIZE);
+         }
+
+         vs->maxVariableDescriptorCount = max_var_count;
         break;
      }

--- a/src/asahi/vulkan/hk_device.h
+++ b/src/asahi/vulkan/hk_device.h
@ -92,6 +92,7 @@ struct hk_device {
    * expected to be a legitimate problem. If it is, we can rework later.
    */
   struct agx_bo *heap;
+   util_once_flag heap_init_once;

   struct {
      struct agx_scratch vs, fs, cs;
--- a/src/asahi/vulkan/hk_image.c
+++ b/src/asahi/vulkan/hk_image.c
@ -1424,6 +1424,13 @@ hk_copy_memory_to_image(struct hk_device *device, struct hk_image *dst_image,
   uint32_t src_height = info->memoryImageHeight ?: extent.height;

   uint32_t blocksize_B = util_format_get_blocksize(layout->format);
+
+   /* Align width and height to block */
+   src_width =
+      DIV_ROUND_UP(src_width, util_format_get_blockwidth(layout->format));
+   src_height =
+      DIV_ROUND_UP(src_height, util_format_get_blockheight(layout->format));
+
   uint32_t src_pitch = src_width * blocksize_B;

   unsigned start_layer = (dst_image->vk.image_type == VK_IMAGE_TYPE_3D)
@ -1496,6 +1503,13 @@ hk_copy_image_to_memory(struct hk_device *device, struct hk_image *src_image,
 #endif

   uint32_t blocksize_B = util_format_get_blocksize(layout->format);
+
+   /* Align width and height to block */
+   dst_width =
+      DIV_ROUND_UP(dst_width, util_format_get_blockwidth(layout->format));
+   dst_height =
+      DIV_ROUND_UP(dst_height, util_format_get_blockheight(layout->format));
+
   uint32_t dst_pitch = dst_width * blocksize_B;

   unsigned start_layer = (src_image->vk.image_type == VK_IMAGE_TYPE_3D)
@ -1649,11 +1663,6 @@ hk_copy_image_to_image_cpu(struct hk_device *device, struct hk_image *src_image,
            &device->physical_device->ubwc_config);
 #endif
      } else {
-         /* Work tile-by-tile, holding the unswizzled tile in a temporary
-          * buffer.
-          */
-         char temp_tile[16384];
-
         unsigned src_level = info->srcSubresource.mipLevel;
         unsigned dst_level = info->dstSubresource.mipLevel;
         uint32_t block_width = src_layout->tilesize_el[src_level].width_el;
@ -1667,6 +1676,12 @@ hk_copy_image_to_image_cpu(struct hk_device *device, struct hk_image *src_image,
         }

         uint32_t temp_pitch = block_width * src_block_B;
+         size_t temp_tile_size = temp_pitch * (src_offset.y + extent.height);
+
+         /* Work tile-by-tile, holding the unswizzled tile in a temporary
+          * buffer.
+          */
+         char *temp_tile = malloc(temp_tile_size);

         for (unsigned by = src_offset.y / block_height;
              by * block_height < src_offset.y + extent.height; by++) {
@ -1683,14 +1698,14 @@ hk_copy_image_to_image_cpu(struct hk_device *device, struct hk_image *src_image,
                  MIN2((bx + 1) * block_width, src_offset.x + extent.width) -
                  src_x_start;

-               assert(height * temp_pitch <= ARRAY_SIZE(temp_tile));
-
               ail_detile((void *)src, temp_tile, src_layout, src_level,
                          temp_pitch, src_x_start, src_y_start, width, height);
               ail_tile(dst, temp_tile, dst_layout, dst_level, temp_pitch,
                        dst_x_start, dst_y_start, width, height);
            }
         }
+
+         free(temp_tile);
      }
   }
 }
--- a/src/asahi/vulkan/hk_physical_device.c
+++ b/src/asahi/vulkan/hk_physical_device.c
@ -859,7 +859,7 @@ hk_get_device_properties(const struct agx_device *dev,
      .maxSubgroupSize = 32,
      .maxComputeWorkgroupSubgroups = 1024 / 32,
      .requiredSubgroupSizeStages = 0,
-      .maxInlineUniformBlockSize = 1 << 16,
+      .maxInlineUniformBlockSize = HK_MAX_INLINE_UNIFORM_BLOCK_SIZE,
      .maxPerStageDescriptorInlineUniformBlocks = 32,
      .maxPerStageDescriptorUpdateAfterBindInlineUniformBlocks = 32,
      .maxDescriptorSetInlineUniformBlocks = 6 * 32,
@ -953,7 +953,7 @@ hk_get_device_properties(const struct agx_device *dev,
      .robustUniformBufferAccessSizeAlignment = HK_MIN_UBO_ALIGNMENT,

      /* VK_EXT_sample_locations */
-      .sampleLocationSampleCounts = sample_counts,
+      .sampleLocationSampleCounts = sample_counts & ~VK_SAMPLE_COUNT_1_BIT,
      .maxSampleLocationGridSize = (VkExtent2D){1, 1},
      .sampleLocationCoordinateRange[0] = 0.0f,
      .sampleLocationCoordinateRange[1] = 0.9375f,
--- a/src/asahi/vulkan/hk_private.h
+++ b/src/asahi/vulkan/hk_private.h
@ -12,18 +12,19 @@
 #include "vk_log.h"
 #include "vk_util.h"

-#define HK_MAX_SETS                   8
-#define HK_MAX_PUSH_SIZE              256
-#define HK_MAX_DYNAMIC_BUFFERS        64
-#define HK_MAX_RTS                    8
-#define HK_MIN_SSBO_ALIGNMENT         16
-#define HK_MIN_TEXEL_BUFFER_ALIGNMENT 16
-#define HK_MIN_UBO_ALIGNMENT          64
-#define HK_MAX_VIEWPORTS              16
-#define HK_MAX_DESCRIPTOR_SIZE        64
-#define HK_MAX_PUSH_DESCRIPTORS       32
-#define HK_MAX_DESCRIPTOR_SET_SIZE    (1u << 30)
-#define HK_MAX_DESCRIPTORS            (1 << 20)
+#define HK_MAX_SETS                      8
+#define HK_MAX_PUSH_SIZE                 256
+#define HK_MAX_DYNAMIC_BUFFERS           64
+#define HK_MAX_RTS                       8
+#define HK_MIN_SSBO_ALIGNMENT            16
+#define HK_MIN_TEXEL_BUFFER_ALIGNMENT    16
+#define HK_MIN_UBO_ALIGNMENT             64
+#define HK_MAX_VIEWPORTS                 16
+#define HK_MAX_DESCRIPTOR_SIZE           64
+#define HK_MAX_PUSH_DESCRIPTORS          32
+#define HK_MAX_DESCRIPTOR_SET_SIZE       (1u << 30)
+#define HK_MAX_INLINE_UNIFORM_BLOCK_SIZE (1u << 16)
+#define HK_MAX_DESCRIPTORS               (1 << 20)
 #define HK_PUSH_DESCRIPTOR_SET_SIZE                                            \
   (HK_MAX_PUSH_DESCRIPTORS * HK_MAX_DESCRIPTOR_SIZE)
 #define HK_SSBO_BOUNDS_CHECK_ALIGNMENT 4
--- a/src/asahi/vulkan/hk_queue.c
+++ b/src/asahi/vulkan/hk_queue.c
@ -812,11 +812,6 @@ queue_submit(struct hk_device *dev, struct hk_queue *queue,
   /* Now setup the command structs */
   struct util_dynarray payload;
   util_dynarray_init(&payload, NULL);
-   union drm_asahi_cmd *cmds = malloc(sizeof(*cmds) * command_count);
-   if (cmds == NULL) {
-      free(cmds);
-      return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
-   }

   unsigned nr_vdm = 0, nr_cdm = 0;

--- a/src/compiler/nir/nir_divergence_analysis.c
+++ b/src/compiler/nir/nir_divergence_analysis.c
@ -319,14 +319,10 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
   case nir_intrinsic_load_base_global_invocation_id:
   case nir_intrinsic_load_base_workgroup_id:
   case nir_intrinsic_load_alpha_reference_amd:
-   case nir_intrinsic_load_ubo_uniform_block_intel:
-   case nir_intrinsic_load_ssbo_uniform_block_intel:
-   case nir_intrinsic_load_shared_uniform_block_intel:
   case nir_intrinsic_load_barycentric_optimize_amd:
   case nir_intrinsic_load_poly_line_smooth_enabled:
   case nir_intrinsic_load_rasterization_primitive_amd:
   case nir_intrinsic_unit_test_uniform_amd:
-   case nir_intrinsic_load_global_constant_uniform_block_intel:
   case nir_intrinsic_load_debug_log_desc_amd:
   case nir_intrinsic_load_xfb_state_address_gfx12_amd:
   case nir_intrinsic_cmat_length:
@ -364,6 +360,24 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
      is_divergent = false;
      break;

+   case nir_intrinsic_load_ubo_uniform_block_intel:
+   case nir_intrinsic_load_ssbo_uniform_block_intel:
+   case nir_intrinsic_load_shared_uniform_block_intel:
+   case nir_intrinsic_load_global_constant_uniform_block_intel:
+      if (options & (nir_divergence_across_subgroups |
+                     nir_divergence_multiple_workgroup_per_compute_subgroup)) {
+         unsigned num_srcs = nir_intrinsic_infos[instr->intrinsic].num_srcs;
+         for (unsigned i = 0; i < num_srcs; i++) {
+            if (src_divergent(instr->src[i], state)) {
+               is_divergent = true;
+               break;
+            }
+         }
+      } else {
+         is_divergent = false;
+      }
+      break;
+
   /* This is divergent because it specifically loads sequential values into
    * successive SIMD lanes.
    */
--- a/src/compiler/nir/nir_lower_io.c
+++ b/src/compiler/nir/nir_lower_io.c
@ -1069,6 +1069,7 @@ nir_get_io_index_src_number(const nir_intrinsic_instr *instr)
   IMG_CASE(atomic):
   IMG_CASE(atomic_swap):
   IMG_CASE(size):
+   IMG_CASE(levels):
   IMG_CASE(samples):
   IMG_CASE(texel_address):
   IMG_CASE(samples_identical):
--- a/src/compiler/nir/nir_lower_shader_calls.c
+++ b/src/compiler/nir/nir_lower_shader_calls.c
@ -1228,8 +1228,16 @@ wrap_instr(nir_builder *b, nir_instr *instr, void *data)
 static bool
 wrap_instrs(nir_shader *shader, wrap_instr_callback callback)
 {
-   return nir_shader_instructions_pass(shader, wrap_instr,
-                                       nir_metadata_none, callback);
+   bool progress = nir_shader_instructions_pass(shader, wrap_instr,
+                                                nir_metadata_none, callback);
+   /* Wrapping jump instructions that are located inside ifs can break SSA
+    * invariants because the else block no longer dominates the merge block.
+    * Repair the SSA to make the validator happy again.
+    */
+   if (progress)
+      nir_repair_ssa(shader);
+
+   return progress;
 }

 static bool
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@ -4096,9 +4096,9 @@ distribute_src_mods = [
   (('fneg', ('fmul(is_used_once)', a, b)), ('fmul', ('fneg', a), b)),
   (('fabs', ('fmul(is_used_once)', a, b)), ('fmul', ('fabs', a), ('fabs', b))),

-   (('fneg', ('ffma(is_used_once)', a, b, c)), ('ffma', ('fneg', a), b, ('fneg', c))),
+   (('fneg', ('ffma(is_used_once,nsz)', a, b, c)), ('ffma', ('fneg', a), b, ('fneg', c))),
   (('fneg', ('flrp(is_used_once)', a, b, c)), ('flrp', ('fneg', a), ('fneg', b), c)),
-   (('fneg', ('~fadd(is_used_once)', a, b)), ('fadd', ('fneg', a), ('fneg', b))),
+   (('fneg', ('fadd(is_used_once,nsz)', a, b)), ('fadd', ('fneg', a), ('fneg', b))),

   # Note that fmin <-> fmax.  I don't think there is a way to distribute
   # fabs() into fmin or fmax.
--- a/src/compiler/nir/nir_opt_shrink_stores.c
+++ b/src/compiler/nir/nir_opt_shrink_stores.c
@ -82,7 +82,9 @@ opt_shrink_store_instr(nir_builder *b, nir_intrinsic_instr *instr, bool shrink_i

   /* Trim the num_components stored according to the write mask. */
   unsigned write_mask = nir_intrinsic_write_mask(instr);
-   unsigned last_bit = util_last_bit(write_mask);
+   /* Don't trim down to an invalid number of components, though. */
+   unsigned last_bit = nir_round_up_components(util_last_bit(write_mask));
+
   if (last_bit < instr->num_components) {
      nir_def *def = nir_trim_vector(b, instr->src[0].ssa, last_bit);
      nir_src_rewrite(&instr->src[0], def);
--- a/src/compiler/nir/nir_precompiled.h
+++ b/src/compiler/nir/nir_precompiled.h
@ -652,6 +652,7 @@ nir_precompiled_build_variant(const nir_function *libfunc,

   assert(libfunc->workgroup_size[0] != 0 && "must set workgroup size");

+   b.shader->info.workgroup_size_variable = false;
   b.shader->info.workgroup_size[0] = libfunc->workgroup_size[0];
   b.shader->info.workgroup_size[1] = libfunc->workgroup_size[1];
   b.shader->info.workgroup_size[2] = libfunc->workgroup_size[2];
--- a/src/compiler/spirv/vtn_variables.c
+++ b/src/compiler/spirv/vtn_variables.c
@ -506,8 +506,8 @@ vtn_pointer_dereference(struct vtn_builder *b,
            type = type->array_element;
         }
         tail = nir_build_deref_array(&b->nb, tail, arr_index);
+         tail->arr.in_bounds = deref_chain->in_bounds;
      }
-      tail->arr.in_bounds = deref_chain->in_bounds;

      access |= type->access;
   }
--- a/src/freedreno/ir3/ir3_ra.c
+++ b/src/freedreno/ir3/ir3_ra.c
@ -2300,6 +2300,17 @@ insert_live_out_moves(struct ra_ctx *ctx)
   insert_file_live_out_moves(ctx, &ctx->shared);
 }

+static bool
+has_merge_set_preferred_reg(struct ir3_register *reg)
+{
+   assert(reg->merge_set);
+   assert(reg->num != INVALID_REG);
+
+   return reg->merge_set->preferred_reg != (physreg_t)~0 &&
+          ra_reg_get_physreg(reg) ==
+             reg->merge_set->preferred_reg + reg->merge_set_offset;
+}
+
 static void
 handle_block(struct ra_ctx *ctx, struct ir3_block *block)
 {
@ -2338,17 +2349,15 @@ handle_block(struct ra_ctx *ctx, struct ir3_block *block)
         struct ir3_register *dst = input->dsts[0];
         assert(dst->num != INVALID_REG);

-         physreg_t dst_start = ra_reg_get_physreg(dst);
         physreg_t dst_end;

-         if (dst->merge_set) {
+         if (dst->merge_set && has_merge_set_preferred_reg(dst)) {
            /* Take the whole merge set into account to prevent its range being
             * allocated for defs not part of the merge set.
             */
-            assert(dst_start >= dst->merge_set_offset);
-            dst_end = dst_start - dst->merge_set_offset + dst->merge_set->size;
+            dst_end = dst->merge_set->preferred_reg + dst->merge_set->size;
         } else {
-            dst_end = dst_start + reg_size(dst);
+            dst_end = ra_reg_get_physreg(dst) + reg_size(dst);
         }

         struct ra_file *file = ra_get_file(ctx, dst);
--- a/src/freedreno/vulkan/tu_clear_blit.cc
+++ b/src/freedreno/vulkan/tu_clear_blit.cc
@ -1461,6 +1461,15 @@ r3d_dst_gmem(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
      gmem_offset = tu_attachment_gmem_offset(cmd, att, layer);
   }

+   /* On a7xx we must always use FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8. See
+    * blit_base_format().
+    */
+   if (CHIP >= A7XX && att->format == VK_FORMAT_D24_UNORM_S8_UINT) {
+      RB_MRT_BUF_INFO = pkt_field_set(A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT,
+                                      RB_MRT_BUF_INFO,
+                                      FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8);
+   }
+
   tu_cs_emit_regs(cs,
                   RB_MRT_BUF_INFO(CHIP, 0, .dword = RB_MRT_BUF_INFO),
                   A6XX_RB_MRT_PITCH(0, 0),
@ -1533,7 +1542,8 @@ r3d_setup(struct tu_cmd_buffer *cmd,
      tu_cs_emit_call(cs, cmd->device->dbg_renderpass_stomp_cs);
   }

-   enum a6xx_format fmt = blit_base_format<CHIP>(dst_format, ubwc, false);
+   enum a6xx_format fmt = blit_base_format<CHIP>(dst_format, ubwc, 
+                                                 blit_param & R3D_DST_GMEM);
   fixup_dst_format(src_format, &dst_format, &fmt);

   if (!cmd->state.pass) {
--- a/src/freedreno/vulkan/tu_descriptor_set.cc
+++ b/src/freedreno/vulkan/tu_descriptor_set.cc
@ -208,8 +208,8 @@ tu_CreateDescriptorSetLayout(
      if (binding->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK)
         set_layout->has_inline_uniforms = true;

-      if (variable_flags && binding->binding < variable_flags->bindingCount &&
-          (variable_flags->pBindingFlags[binding->binding] &
+      if (variable_flags && j < variable_flags->bindingCount &&
+          (variable_flags->pBindingFlags[j] &
           VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT)) {
         assert(!binding->pImmutableSamplers); /* Terribly ill defined  how
                                                  many samplers are valid */
@ -377,7 +377,7 @@ tu_GetDescriptorSetLayoutSupport(
      uint64_t max_count = MAX_SET_SIZE;
      unsigned descriptor_count = binding->descriptorCount;
      if (binding->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK) {
-         max_count = MAX_SET_SIZE - size;
+         max_count = MAX_INLINE_UBO_RANGE - size;
         descriptor_count = descriptor_sz;
         descriptor_sz = 1;
      } else if (descriptor_sz) {
@ -388,9 +388,9 @@ tu_GetDescriptorSetLayoutSupport(
         supported = false;
      }

-      if (variable_flags && binding->binding < variable_flags->bindingCount &&
+      if (variable_flags && i < variable_flags->bindingCount &&
          variable_count &&
-          (variable_flags->pBindingFlags[binding->binding] &
+          (variable_flags->pBindingFlags[i] &
           VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT)) {
         variable_count->maxVariableDescriptorCount =
            MIN2(UINT32_MAX, max_count);
--- a/src/freedreno/vulkan/tu_pass.cc
+++ b/src/freedreno/vulkan/tu_pass.cc
@ -417,7 +417,8 @@ tu_render_pass_patch_input_gmem(struct tu_render_pass *pass)
         uint32_t a = subpass->input_attachments[j].attachment;
         if (a == VK_ATTACHMENT_UNUSED)
            continue;
-         subpass->input_attachments[j].patch_input_gmem = written[a];
+         subpass->input_attachments[j].patch_input_gmem =
+            written[a] && pass->attachments[a].gmem;
      }

      for (unsigned j = 0; j < subpass->color_count; j++) {
@ -1126,6 +1127,7 @@ tu_CreateRenderPass2(VkDevice _device,
      if (!att->gmem) {
         att->clear_mask = 0;
         att->load = false;
+         att->load_stencil = false;
      }
   }

--- a/src/freedreno/vulkan/tu_pipeline.cc
+++ b/src/freedreno/vulkan/tu_pipeline.cc
@ -3157,8 +3157,6 @@ tu6_emit_blend(struct tu_cs *cs,

   bool dual_src_blend = tu_blend_state_is_dual_src(cb);

-   tu_cs_emit_regs(cs, A6XX_SP_PS_MRT_CNTL(.mrt = num_rts));
-   tu_cs_emit_regs(cs, A6XX_RB_PS_MRT_CNTL(.mrt = num_rts));
   tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL(.enable_blend = blend_enable_mask,
                                          .unk8 = true,
                                          .dual_color_in_enable =
@ -3180,10 +3178,12 @@ tu6_emit_blend(struct tu_cs *cs,
                                          .alpha_to_one = alpha_to_one_enable,
                                          .sample_mask = sample_mask));

+   unsigned num_remapped_rts = 0;
   for (unsigned i = 0; i < num_rts; i++) {
      if (cal->color_map[i] == MESA_VK_ATTACHMENT_UNUSED)
         continue;
      unsigned remapped_idx = cal->color_map[i];
+      num_remapped_rts = MAX2(num_remapped_rts, remapped_idx + 1);
      const struct vk_color_blend_attachment_state *att = &cb->attachments[i];
      if ((cb->color_write_enables & (1u << i)) && i < cb->attachment_count) {
         const enum a3xx_rb_blend_opcode color_op = tu6_blend_op(att->color_blend_op);
@ -3227,6 +3227,8 @@ tu6_emit_blend(struct tu_cs *cs,
                            A6XX_RB_MRT_BLEND_CONTROL(remapped_idx,));
      }
   }
+   tu_cs_emit_regs(cs, A6XX_SP_PS_MRT_CNTL(.mrt = num_remapped_rts));
+   tu_cs_emit_regs(cs, A6XX_RB_PS_MRT_CNTL(.mrt = num_remapped_rts));
 }

 static const enum mesa_vk_dynamic_graphics_state tu_blend_constants_state[] = {
--- a/src/gallium/drivers/iris/iris_program.c
+++ b/src/gallium/drivers/iris/iris_program.c
@ -223,9 +223,9 @@ iris_apply_brw_tes_prog_data(struct iris_compiled_shader *shader,

   iris_apply_brw_vue_prog_data(&brw->base, &iris->base);

-   iris->partitioning         = brw->partitioning;
-   iris->output_topology      = brw->output_topology;
-   iris->domain               = brw->domain;
+   iris->partitioning         = brw_tess_info_partitioning(brw->tess_info);
+   iris->output_topology      = brw_tess_info_output_topology(brw->tess_info);
+   iris->domain               = brw_tess_info_domain(brw->tess_info);
   iris->include_primitive_id = brw->include_primitive_id;
 }

--- a/src/gallium/drivers/r300/ci/r300-rs740-fails.txt
+++ b/src/gallium/drivers/r300/ci/r300-rs740-fails.txt
@ -1236,44 +1236,6 @@ spec@nv_texture_env_combine4@nv_texture_env_combine4-combine,Fail

 spec@oes_texture_float@oes_texture_float half,Fail

-# Remaining fallout from 9d359c6d10adb1cd2978a0e13714a3f98544aae8
-spec@arb_texture_compression@fbo-generatemipmap-formats,Fail
-spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_RGB,Fail
-spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_RGB NPOT,Fail
-spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA,Fail
-spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA NPOT,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_rgb_s3tc_dxt1_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_rgba_s3tc_dxt1_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_rgba_s3tc_dxt3_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_rgba_s3tc_dxt5_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_srgb_alpha_s3tc_dxt1_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_srgb_alpha_s3tc_dxt3_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_srgb_alpha_s3tc_dxt5_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_srgb_s3tc_dxt1_ext,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT1_EXT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT1_EXT NPOT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT3_EXT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT3_EXT NPOT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT5_EXT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT5_EXT NPOT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGB_S3TC_DXT1_EXT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGB_S3TC_DXT1_EXT NPOT,Fail
-spec@ext_texture_compression_s3tc@gen-compressed-teximage,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB NPOT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA NPOT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT NPOT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT NPOT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT NPOT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_S3TC_DXT1_EXT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_S3TC_DXT1_EXT NPOT,Fail
-
 # uprev Piglit in Mesa
 spec@!opengl 1.1@teximage-scale-bias,Fail
 spec@glsl-1.10@execution@glsl-fs-texture2d-mipmap-const-bias-01,Fail
--- a/src/gallium/drivers/r300/ci/r300-rv410-fails.txt
+++ b/src/gallium/drivers/r300/ci/r300-rv410-fails.txt
@ -1280,44 +1280,6 @@ spec@nv_texture_env_combine4@nv_texture_env_combine4-combine,Fail

 spec@oes_texture_float@oes_texture_float half,Fail

-# Remaining fallout from 9d359c6d10adb1cd2978a0e13714a3f98544aae8
-spec@arb_texture_compression@fbo-generatemipmap-formats,Fail
-spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_RGB,Fail
-spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_RGB NPOT,Fail
-spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA,Fail
-spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA NPOT,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_rgb_s3tc_dxt1_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_rgba_s3tc_dxt1_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_rgba_s3tc_dxt3_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_rgba_s3tc_dxt5_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_srgb_alpha_s3tc_dxt1_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_srgb_alpha_s3tc_dxt3_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_srgb_alpha_s3tc_dxt5_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_srgb_s3tc_dxt1_ext,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT1_EXT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT1_EXT NPOT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT3_EXT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT3_EXT NPOT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT5_EXT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT5_EXT NPOT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGB_S3TC_DXT1_EXT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGB_S3TC_DXT1_EXT NPOT,Fail
-spec@ext_texture_compression_s3tc@gen-compressed-teximage,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB NPOT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA NPOT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT NPOT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT NPOT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT NPOT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_S3TC_DXT1_EXT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_S3TC_DXT1_EXT NPOT,Fail
-
 # uprev Piglit in Mesa
 spec@!opengl 1.1@teximage-scale-bias,Fail
 spec@glsl-1.10@execution@glsl-fs-texture2d-mipmap-const-bias-01,Fail
--- a/src/gallium/drivers/r300/ci/r300-rv530-nohiz-fails.txt
+++ b/src/gallium/drivers/r300/ci/r300-rv530-nohiz-fails.txt
@ -778,78 +778,6 @@ dEQP-GLES2.functional.texture.mipmap.2d.projected.linear_nearest_repeat,Fail
 dEQP-GLES2.functional.texture.mipmap.2d.projected.linear_nearest_mirror,Fail
 dEQP-GLES2.functional.texture.mipmap.2d.projected.linear_nearest_clamp,Fail

-# Remaining fallout from 9d359c6d10adb1cd2978a0e13714a3f98544aae8
-spec@arb_texture_compression@fbo-generatemipmap-formats,Fail
-spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_LUMINANCE,Fail
-spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_LUMINANCE NPOT,Fail
-spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_LUMINANCE_ALPHA,Fail
-spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_LUMINANCE_ALPHA NPOT,Fail
-spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_RGB,Fail
-spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_RGB NPOT,Fail
-spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA,Fail
-spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA NPOT,Fail
-spec@ati_texture_compression_3dc@fbo-generatemipmap-formats,Fail
-spec@ati_texture_compression_3dc@fbo-generatemipmap-formats@GL_COMPRESSED_LUMINANCE_ALPHA_3DC_ATI,Fail
-spec@ati_texture_compression_3dc@fbo-generatemipmap-formats@GL_COMPRESSED_LUMINANCE_ALPHA_3DC_ATI NPOT,Fail
-spec@ext_texture_compression_latc@fbo-generatemipmap-formats,Fail
-spec@ext_texture_compression_latc@fbo-generatemipmap-formats-signed,Fail
-spec@ext_texture_compression_latc@fbo-generatemipmap-formats-signed@GL_COMPRESSED_SIGNED_LUMINANCE_ALPHA_LATC2_EXT,Fail
-spec@ext_texture_compression_latc@fbo-generatemipmap-formats-signed@GL_COMPRESSED_SIGNED_LUMINANCE_ALPHA_LATC2_EXT NPOT,Fail
-spec@ext_texture_compression_latc@fbo-generatemipmap-formats-signed@GL_COMPRESSED_SIGNED_LUMINANCE_LATC1_EXT,Fail
-spec@ext_texture_compression_latc@fbo-generatemipmap-formats-signed@GL_COMPRESSED_SIGNED_LUMINANCE_LATC1_EXT NPOT,Fail
-spec@ext_texture_compression_latc@fbo-generatemipmap-formats@GL_COMPRESSED_LUMINANCE_ALPHA_LATC2_EXT,Fail
-spec@ext_texture_compression_latc@fbo-generatemipmap-formats@GL_COMPRESSED_LUMINANCE_ALPHA_LATC2_EXT NPOT,Fail
-spec@ext_texture_compression_latc@fbo-generatemipmap-formats@GL_COMPRESSED_LUMINANCE_LATC1_EXT,Fail
-spec@ext_texture_compression_latc@fbo-generatemipmap-formats@GL_COMPRESSED_LUMINANCE_LATC1_EXT NPOT,Fail
-spec@ext_texture_compression_rgtc@compressedteximage gl_compressed_red_green_rgtc2_ext,Fail
-spec@ext_texture_compression_rgtc@compressedteximage gl_compressed_signed_red_green_rgtc2_ext,Fail
-spec@ext_texture_compression_rgtc@compressedteximage gl_compressed_signed_red_rgtc1_ext,Fail
-spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats,Fail
-spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats-signed,Fail
-spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats-signed@GL_COMPRESSED_SIGNED_RED_RGTC1,Fail
-spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats-signed@GL_COMPRESSED_SIGNED_RED_RGTC1 NPOT,Fail
-spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats-signed@GL_COMPRESSED_SIGNED_RG_RGTC2,Fail
-spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats-signed@GL_COMPRESSED_SIGNED_RG_RGTC2 NPOT,Fail
-spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats@GL_COMPRESSED_RED,Fail
-spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats@GL_COMPRESSED_RED NPOT,Fail
-spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats@GL_COMPRESSED_RED_RGTC1,Fail
-spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats@GL_COMPRESSED_RED_RGTC1 NPOT,Fail
-spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats@GL_COMPRESSED_RG,Fail
-spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats@GL_COMPRESSED_RG_RGTC2,Fail
-spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats@GL_COMPRESSED_RG_RGTC2 NPOT,Fail
-spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats@GL_COMPRESSED_RG NPOT,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_rgb_s3tc_dxt1_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_rgba_s3tc_dxt1_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_rgba_s3tc_dxt3_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_rgba_s3tc_dxt5_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_srgb_alpha_s3tc_dxt1_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_srgb_alpha_s3tc_dxt3_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_srgb_alpha_s3tc_dxt5_ext,Fail
-spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_srgb_s3tc_dxt1_ext,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT1_EXT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT1_EXT NPOT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT3_EXT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT3_EXT NPOT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT5_EXT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT5_EXT NPOT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGB_S3TC_DXT1_EXT,Fail
-spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGB_S3TC_DXT1_EXT NPOT,Fail
-spec@ext_texture_compression_s3tc@gen-compressed-teximage,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB NPOT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA NPOT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT NPOT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT NPOT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT NPOT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_S3TC_DXT1_EXT,Fail
-spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_S3TC_DXT1_EXT NPOT,Fail
-
 # uprev Piglit in Mesa
 spec@!opengl 1.1@teximage-scale-bias,Fail
 spec@ext_framebuffer_multisample@accuracy all_samples color depthstencil linear,Fail
--- a/src/gallium/drivers/r300/r300_state.c
+++ b/src/gallium/drivers/r300/r300_state.c
@ -947,6 +947,32 @@ r300_set_framebuffer_state(struct pipe_context* pipe,
    util_framebuffer_init(pipe, state, r300->fb_cbufs, &r300->fb_zsbuf);
    util_copy_framebuffer_state(r300->fb_state.state, state);

+    /* DXTC blits require that blocks are 2x1 or 4x1 pixels, but
+     * pipe_surface_width sets the framebuffer width as if blocks were 1x1
+     * pixels. Override the width to correct that.
+     */
+    if (state->nr_cbufs == 1 && state->cbufs[0].texture &&
+        state->cbufs[0].format == PIPE_FORMAT_R8G8B8A8_UNORM &&
+        util_format_is_compressed(state->cbufs[0].texture->format)) {
+        struct pipe_framebuffer_state *fb =
+            (struct pipe_framebuffer_state*)r300->fb_state.state;
+        const struct util_format_description *desc =
+            util_format_description(state->cbufs[0].texture->format);
+        unsigned width = u_minify(state->cbufs[0].texture->width0,
+                                  state->cbufs[0].level);
+
+        assert(desc->block.width == 4 && desc->block.height == 4);
+
+        /* Each 64-bit DXT block is 2x1 pixels, and each 128-bit DXT
+         * block is 4x1 pixels when blitting.
+         */
+        width = align(width, 4); /* align to the DXT block width. */
+        if (desc->block.bits == 64)
+           width = DIV_ROUND_UP(width, 2);
+
+        fb->width = width;
+    }
+
    /* Remove trailing NULL colorbuffers. */
    while (current_state->nr_cbufs && !current_state->cbufs[current_state->nr_cbufs-1].texture)
        current_state->nr_cbufs--;
--- a/src/gallium/drivers/r600/r600_pipe_common.c
+++ b/src/gallium/drivers/r600/r600_pipe_common.c
@ -201,6 +201,7 @@ void r600_draw_rectangle(struct blitter_context *blitter,
 	rctx->b.set_vertex_buffers(&rctx->b, 1, &vbuffer);
 	util_draw_arrays_instanced(&rctx->b, R600_PRIM_RECTANGLE_LIST, 0, 3,
 				   0, num_instances);
+	pipe_resource_reference(&buf, NULL);
 }

 static void r600_dma_emit_wait_idle(struct r600_common_context *rctx)
--- a/src/gallium/drivers/r600/r600_texture.c
+++ b/src/gallium/drivers/r600/r600_texture.c
@ -14,6 +14,7 @@
 #include "util/u_memory.h"
 #include "util/u_pack_color.h"
 #include "util/u_surface.h"
+#include "util/u_resource.h"
 #include "util/os_time.h"
 #include "frontend/winsys_handle.h"
 #include <errno.h>
@ -442,7 +443,7 @@ static bool r600_texture_get_param(struct pipe_screen *screen,

 	switch (param) {
 	case PIPE_RESOURCE_PARAM_NPLANES:
-		*value = 1;
+		*value = util_resource_num(resource);
 		return true;

 	case PIPE_RESOURCE_PARAM_STRIDE:
--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
@ -56,6 +56,19 @@ static bool si_update_shaders(struct si_context *sctx)
   struct si_shader *old_ps = sctx->shader.ps.current;
   int r;

+   if (GFX_VERSION >= GFX9) {
+      /* For merged shaders, mark the next shader as dirty so its previous_stage is updated. */
+      if (is_vs_state_changed) {
+         if (HAS_TESS) {
+            is_tess_state_changed = true;
+         } else if (HAS_GS) {
+            is_gs_state_changed = true;
+         }
+      }
+      if ((sctx->dirty_shaders_mask & BITFIELD_BIT(MESA_SHADER_TESS_EVAL)) && HAS_GS && HAS_TESS)
+         is_gs_state_changed = true;
+   }
+
   /* Update TCS and TES. */
   if (HAS_TESS && is_tess_state_changed) {
      if (!sctx->has_tessellation) {
--- a/src/gallium/drivers/v3d/v3d_blit.c
+++ b/src/gallium/drivers/v3d/v3d_blit.c
@ -690,6 +690,7 @@ v3d_get_sand8_fs(struct pipe_context *pctx, int cpp)
                nir_variable_create(b.shader, nir_var_shader_out,
                                    vec4, "f_color");
        color_out->data.location = FRAG_RESULT_COLOR;
+        b.shader->info.outputs_written |= BITFIELD_BIT(FRAG_RESULT_COLOR);

        nir_variable *pos_in =
                nir_variable_create(b.shader, nir_var_shader_in, vec4, "pos");
@ -998,6 +999,7 @@ v3d_get_sand30_fs(struct pipe_context *pctx)
                                                      nir_var_shader_out,
                                                      glsl_uvec4, "f_color");
        color_out->data.location = FRAG_RESULT_COLOR;
+        b.shader->info.outputs_written |= BITFIELD_BIT(FRAG_RESULT_COLOR);

        nir_variable *pos_in =
                nir_variable_create(b.shader, nir_var_shader_in, vec4, "pos");
--- a/src/gallium/drivers/zink/ci/zink-anv-adl-validation-settings.txt
+++ b/src/gallium/drivers/zink/ci/zink-anv-adl-validation-settings.txt
@ -1,6 +1,6 @@
 # Please include a comment with the log message and a testcase triggering each
 # VUID at the bottom of the file.
-khronos_validation.message_id_filter = VUID-VkPhysicalDeviceProperties2-pNext-pNext,VUID-VkDeviceCreateInfo-pNext-pNext,VUID-RuntimeSpirv-Location-06272,VUID-vkCmdDrawMultiIndexedEXT-format-07753,VUID-RuntimeSpirv-OpEntryPoint-08743,VUID-vkCmdDrawMultiIndexedEXT-None-10909,VUID-vkDestroyDevice-device-05137,VUID-vkCmdDrawMultiEXT-None-08879,VUID-VkShaderCreateInfoEXT-pSetLayouts-parameter,VUID-vkCmdDrawMultiIndexedEXT-None-08879,VUID-VkPhysicalDeviceMeshShaderFeaturesEXT-primitiveFragmentShadingRateMeshShader-07033
+khronos_validation.message_id_filter = VUID-VkPhysicalDeviceProperties2-pNext-pNext,VUID-VkDeviceCreateInfo-pNext-pNext,VUID-RuntimeSpirv-Location-06272,VUID-vkCmdDrawMultiIndexedEXT-format-07753,VUID-RuntimeSpirv-OpEntryPoint-08743,VUID-vkCmdDrawMultiIndexedEXT-None-10909,VUID-vkDestroyDevice-device-05137,VUID-vkCmdDrawMultiEXT-None-08879,VUID-VkShaderCreateInfoEXT-pSetLayouts-parameter,VUID-vkCmdDrawMultiIndexedEXT-None-08879
 khronos_validation.report_flags = error
 khronos_validation.debug_action = VK_DBG_LAYER_ACTION_LOG_MSG,VK_DBG_LAYER_ACTION_BREAK
 VK_LAYER_ENABLES=VK_VALIDATION_FEATURE_ENABLE_DEBUG_PRINTF_EXT
--- a/src/gallium/drivers/zink/ci/zink-anv-cml-validation-settings.txt
+++ b/src/gallium/drivers/zink/ci/zink-anv-cml-validation-settings.txt
@ -1,6 +1,6 @@
 # Please include a comment with the log message and a testcase triggering each
 # VUID at the bottom of the file.
-khronos_validation.message_id_filter = VUID-VkPhysicalDeviceProperties2-pNext-pNext,VUID-VkDeviceCreateInfo-pNext-pNext,VUID-vkCmdDrawMultiIndexedEXT-None-10909,VUID-VkPhysicalDeviceMeshShaderFeaturesEXT-primitiveFragmentShadingRateMeshShader-07033
+khronos_validation.message_id_filter = VUID-VkPhysicalDeviceProperties2-pNext-pNext,VUID-VkDeviceCreateInfo-pNext-pNext,VUID-vkCmdDrawMultiIndexedEXT-None-10909
 khronos_validation.report_flags = error
 khronos_validation.debug_action = VK_DBG_LAYER_ACTION_LOG_MSG,VK_DBG_LAYER_ACTION_BREAK
 VK_LAYER_ENABLES=VK_VALIDATION_FEATURE_ENABLE_DEBUG_PRINTF_EXT
--- a/src/gallium/drivers/zink/ci/zink-anv-tgl-validation-settings.txt
+++ b/src/gallium/drivers/zink/ci/zink-anv-tgl-validation-settings.txt
@ -1,6 +1,6 @@
 # Please include a comment with the log message and a testcase triggering each
 # VUID at the bottom of the file.
-khronos_validation.message_id_filter = VUID-VkPhysicalDeviceProperties2-pNext-pNext,VUID-VkDeviceCreateInfo-pNext-pNext,VUID-RuntimeSpirv-Location-06272,VUID-vkCmdDrawMultiIndexedEXT-format-07753,VUID-RuntimeSpirv-OpEntryPoint-08743,VUID-vkCmdDrawMultiIndexedEXT-None-10909,VUID-vkDestroyDevice-device-05137,VUID-VkPhysicalDeviceMeshShaderFeaturesEXT-primitiveFragmentShadingRateMeshShader-07033
+khronos_validation.message_id_filter = VUID-VkPhysicalDeviceProperties2-pNext-pNext,VUID-VkDeviceCreateInfo-pNext-pNext,VUID-RuntimeSpirv-Location-06272,VUID-vkCmdDrawMultiIndexedEXT-format-07753,VUID-RuntimeSpirv-OpEntryPoint-08743,VUID-vkCmdDrawMultiIndexedEXT-None-10909,VUID-vkDestroyDevice-device-05137
 khronos_validation.report_flags = error
 khronos_validation.debug_action = VK_DBG_LAYER_ACTION_LOG_MSG,VK_DBG_LAYER_ACTION_BREAK
 VK_LAYER_ENABLES=VK_VALIDATION_FEATURE_ENABLE_DEBUG_PRINTF_EXT
--- a/src/gallium/drivers/zink/ci/zink-lavapipe-validation-settings.txt
+++ b/src/gallium/drivers/zink/ci/zink-lavapipe-validation-settings.txt
@ -1,6 +1,6 @@
 # Please include a comment with the log message and a testcase triggering each
 # VUID at the bottom of the file.
-khronos_validation.message_id_filter = VUID-VkPhysicalDeviceProperties2-pNext-pNext,VUID-VkDeviceCreateInfo-pNext-pNext,VUID-RuntimeSpirv-Location-06272,VUID-vkCmdDrawMultiEXT-None-02699,VUID-RuntimeSpirv-OpEntryPoint-08743,VUID-vkCmdPipelineBarrier2-shaderTileImageColorReadAccess-08718,VUID-VkGraphicsPipelineCreateInfo-flags-06482,VUID-vkCmdPipelineBarrier2-None-08719,VUID-vkCmdDrawMultiEXT-rasterizationSamples-07474,VUID-vkDestroyDevice-device-05137,VUID-VkRectLayerKHR-offset-04864,VUID-vkAcquireNextImageKHR-semaphore-01779,VUID-vkQueueSubmit-pSignalSemaphores-00067,VUID-VkImageMemoryBarrier2-srcAccessMask-07454,UNASSIGNED-GeneralParameterError-RequiredHandle,VUID-VkImageMemoryBarrier2-image-parameter,VUID-vkCmdDrawMultiIndexedEXT-None-10909,VUID-VkPhysicalDeviceMeshShaderFeaturesEXT-primitiveFragmentShadingRateMeshShader-07033
+khronos_validation.message_id_filter = VUID-VkPhysicalDeviceProperties2-pNext-pNext,VUID-VkDeviceCreateInfo-pNext-pNext,VUID-RuntimeSpirv-Location-06272,VUID-vkCmdDrawMultiEXT-None-02699,VUID-RuntimeSpirv-OpEntryPoint-08743,VUID-vkCmdPipelineBarrier2-shaderTileImageColorReadAccess-08718,VUID-VkGraphicsPipelineCreateInfo-flags-06482,VUID-vkCmdPipelineBarrier2-None-08719,VUID-vkCmdDrawMultiEXT-rasterizationSamples-07474,VUID-vkDestroyDevice-device-05137,VUID-VkRectLayerKHR-offset-04864,VUID-vkAcquireNextImageKHR-semaphore-01779,VUID-vkQueueSubmit-pSignalSemaphores-00067,VUID-VkImageMemoryBarrier2-srcAccessMask-07454,UNASSIGNED-GeneralParameterError-RequiredHandle,VUID-VkImageMemoryBarrier2-image-parameter,VUID-vkCmdDrawMultiIndexedEXT-None-10909
 khronos_validation.report_flags = error
 khronos_validation.debug_action = VK_DBG_LAYER_ACTION_LOG_MSG,VK_DBG_LAYER_ACTION_BREAK
 VK_LAYER_ENABLES=VK_VALIDATION_FEATURE_ENABLE_DEBUG_PRINTF_EXT
--- a/src/gallium/drivers/zink/ci/zink-radv-cezanne-validation-settings.txt
+++ b/src/gallium/drivers/zink/ci/zink-radv-cezanne-validation-settings.txt
@ -1,7 +1,7 @@
 # Please include a comment with the log message and a testcase triggering each
 # VUID at the bottom of the file.
 #khronos_validation.message_id_filter = VUID-RuntimeSpirv-Location-06272,VUID-StandaloneSpirv-OpEntryPoint-08721,VUID-vkCmdDrawMultiEXT-dynamicPrimitiveTopologyUnrestricted-07500,VUID-vkCmdDrawMultiEXT-None-08879,VUID-vkCmdDrawMultiIndexedEXT-dynamicPrimitiveTopologyUnrestricted-07500,VUID-vkCmdDrawMultiIndexedEXT-None-08879,VUID-vkDestroyDevice-device-05137,VUID-vkQueueSubmit-pCommandBuffers-00065,VUID-VkShaderCreateInfoEXT-pCode-08737
-khronos_validation.message_id_filter = VUID-VkPhysicalDeviceProperties2-pNext-pNext,VUID-VkDeviceCreateInfo-pNext-pNext,VUID-RuntimeSpirv-Location-06272,VUID-RuntimeSpirv-OpEntryPoint-08743,VUID-StandaloneSpirv-OpEntryPoint-08721,VUID-vkCmdDrawMultiEXT-dynamicRenderingUnusedAttachments-08911,VUID-vkCmdDrawMultiIndexedEXT-None-08879,VUID-vkDestroyDevice-device-05137,VUID-vkQueueSubmit-pCommandBuffers-00065,VUID-VkShaderCreateInfoEXT-pCode-08737,VUID-vkCmdDrawMultiEXT-None-08879,VUID-VkShaderCreateInfoEXT-pSetLayouts-parameter,VUID-vkCmdDrawMultiIndexedEXT-None-10909,UNASSIGNED-Draw-primitiveTopologyPatchListRestart,VUID-VkPhysicalDeviceMeshShaderFeaturesEXT-primitiveFragmentShadingRateMeshShader-07033
+khronos_validation.message_id_filter = VUID-VkPhysicalDeviceProperties2-pNext-pNext,VUID-VkDeviceCreateInfo-pNext-pNext,VUID-RuntimeSpirv-Location-06272,VUID-RuntimeSpirv-OpEntryPoint-08743,VUID-StandaloneSpirv-OpEntryPoint-08721,VUID-vkCmdDrawMultiEXT-dynamicRenderingUnusedAttachments-08911,VUID-vkCmdDrawMultiIndexedEXT-None-08879,VUID-vkDestroyDevice-device-05137,VUID-vkQueueSubmit-pCommandBuffers-00065,VUID-VkShaderCreateInfoEXT-pCode-08737,VUID-vkCmdDrawMultiEXT-None-08879,VUID-VkShaderCreateInfoEXT-pSetLayouts-parameter,VUID-vkCmdDrawMultiIndexedEXT-None-10909,UNASSIGNED-Draw-primitiveTopologyPatchListRestart
 khronos_validation.report_flags = error
 khronos_validation.debug_action = VK_DBG_LAYER_ACTION_LOG_MSG,VK_DBG_LAYER_ACTION_BREAK
 VK_LAYER_ENABLES=VK_VALIDATION_FEATURE_ENABLE_DEBUG_PRINTF_EXT
--- a/src/gallium/drivers/zink/ci/zink-radv-navi31-validation-settings.txt
+++ b/src/gallium/drivers/zink/ci/zink-radv-navi31-validation-settings.txt
@ -1,7 +1,7 @@
 # Please include a comment with the log message and a testcase triggering each
 # VUID at the bottom of the file.
 #khronos_validation.message_id_filter = VUID-RuntimeSpirv-Location-06272,VUID-StandaloneSpirv-OpEntryPoint-08721,VUID-vkCmdDrawMultiEXT-dynamicPrimitiveTopologyUnrestricted-07500,VUID-vkCmdDrawMultiEXT-None-08879,VUID-vkCmdDrawMultiIndexedEXT-dynamicPrimitiveTopologyUnrestricted-07500,VUID-vkCmdDrawMultiIndexedEXT-None-08879,VUID-vkDestroyDevice-device-05137,VUID-vkQueueSubmit-pCommandBuffers-00065,VUID-VkShaderCreateInfoEXT-pCode-08737
-khronos_validation.message_id_filter = VUID-VkPhysicalDeviceProperties2-pNext-pNext,VUID-VkDeviceCreateInfo-pNext-pNext,VUID-VkRenderingAttachmentInfo-pNext-pNext,VUID-RuntimeSpirv-Location-06272,VUID-RuntimeSpirv-OpEntryPoint-08743,VUID-StandaloneSpirv-OpEntryPoint-08721,VUID-vkCmdDrawMultiEXT-dynamicRenderingUnusedAttachments-08911,VUID-vkCmdDrawMultiIndexedEXT-None-08879,VUID-vkDestroyDevice-device-05137,VUID-vkQueueSubmit-pCommandBuffers-00065,VUID-VkShaderCreateInfoEXT-pCode-08737,VUID-vkCmdDrawMultiEXT-None-08879,VUID-VkShaderCreateInfoEXT-pSetLayouts-parameter,VUID-vkCmdDrawMultiIndexedEXT-None-10909,UNASSIGNED-Draw-primitiveTopologyPatchListRestart,VUID-VkPhysicalDeviceMeshShaderFeaturesEXT-primitiveFragmentShadingRateMeshShader-07033
+khronos_validation.message_id_filter = VUID-VkPhysicalDeviceProperties2-pNext-pNext,VUID-VkDeviceCreateInfo-pNext-pNext,VUID-VkRenderingAttachmentInfo-pNext-pNext,VUID-RuntimeSpirv-Location-06272,VUID-RuntimeSpirv-OpEntryPoint-08743,VUID-StandaloneSpirv-OpEntryPoint-08721,VUID-vkCmdDrawMultiEXT-dynamicRenderingUnusedAttachments-08911,VUID-vkCmdDrawMultiIndexedEXT-None-08879,VUID-vkDestroyDevice-device-05137,VUID-vkQueueSubmit-pCommandBuffers-00065,VUID-VkShaderCreateInfoEXT-pCode-08737,VUID-vkCmdDrawMultiEXT-None-08879,VUID-VkShaderCreateInfoEXT-pSetLayouts-parameter,VUID-vkCmdDrawMultiIndexedEXT-None-10909,UNASSIGNED-Draw-primitiveTopologyPatchListRestart
 khronos_validation.report_flags = error
 khronos_validation.debug_action = VK_DBG_LAYER_ACTION_LOG_MSG,VK_DBG_LAYER_ACTION_BREAK
 VK_LAYER_ENABLES=VK_VALIDATION_FEATURE_ENABLE_DEBUG_PRINTF_EXT
--- a/src/gallium/drivers/zink/ci/zink-radv-vangogh-validation-settings.txt
+++ b/src/gallium/drivers/zink/ci/zink-radv-vangogh-validation-settings.txt
@ -1,7 +1,7 @@
 # Please include a comment with the log message and a testcase triggering each
 # VUID at the bottom of the file.
 #khronos_validation.message_id_filter = VUID-RuntimeSpirv-Location-06272,VUID-StandaloneSpirv-OpEntryPoint-08721,VUID-vkCmdDrawMultiEXT-dynamicPrimitiveTopologyUnrestricted-07500,VUID-vkCmdDrawMultiEXT-None-08879,VUID-vkCmdDrawMultiIndexedEXT-dynamicPrimitiveTopologyUnrestricted-07500,VUID-vkCmdDrawMultiIndexedEXT-None-08879,VUID-vkDestroyDevice-device-05137,VUID-vkQueueSubmit-pCommandBuffers-00065,VUID-VkShaderCreateInfoEXT-pCode-08737
-khronos_validation.message_id_filter = VUID-VkPhysicalDeviceProperties2-pNext-pNext,VUID-VkDeviceCreateInfo-pNext-pNext,VUID-RuntimeSpirv-Location-06272,VUID-RuntimeSpirv-OpEntryPoint-08743,VUID-StandaloneSpirv-OpEntryPoint-08721,VUID-vkCmdDrawMultiEXT-dynamicRenderingUnusedAttachments-08911,VUID-vkCmdDrawMultiIndexedEXT-None-08879,VUID-vkDestroyDevice-device-05137,VUID-vkQueueSubmit-pCommandBuffers-00065,VUID-VkShaderCreateInfoEXT-pCode-08737,VUID-vkCmdDrawMultiEXT-None-08879,VUID-VkShaderCreateInfoEXT-pSetLayouts-parameter,VUID-vkCmdDrawMultiIndexedEXT-None-10909,UNASSIGNED-Draw-primitiveTopologyPatchListRestart,VUID-VkPhysicalDeviceMeshShaderFeaturesEXT-primitiveFragmentShadingRateMeshShader-07033
+khronos_validation.message_id_filter = VUID-VkPhysicalDeviceProperties2-pNext-pNext,VUID-VkDeviceCreateInfo-pNext-pNext,VUID-RuntimeSpirv-Location-06272,VUID-RuntimeSpirv-OpEntryPoint-08743,VUID-StandaloneSpirv-OpEntryPoint-08721,VUID-vkCmdDrawMultiEXT-dynamicRenderingUnusedAttachments-08911,VUID-vkCmdDrawMultiIndexedEXT-None-08879,VUID-vkDestroyDevice-device-05137,VUID-vkQueueSubmit-pCommandBuffers-00065,VUID-VkShaderCreateInfoEXT-pCode-08737,VUID-vkCmdDrawMultiEXT-None-08879,VUID-VkShaderCreateInfoEXT-pSetLayouts-parameter,VUID-vkCmdDrawMultiIndexedEXT-None-10909,UNASSIGNED-Draw-primitiveTopologyPatchListRestart
 khronos_validation.report_flags = error
 khronos_validation.debug_action = VK_DBG_LAYER_ACTION_LOG_MSG,VK_DBG_LAYER_ACTION_BREAK
 VK_LAYER_ENABLES=VK_VALIDATION_FEATURE_ENABLE_DEBUG_PRINTF_EXT
--- a/src/gallium/drivers/zink/zink_context.c
+++ b/src/gallium/drivers/zink/zink_context.c
@ -3368,9 +3368,8 @@ begin_rendering(struct zink_context *ctx, bool check_msaa_expand)
      VK_TRUE,
      ctx->gfx_pipeline_state.rast_samples + 1,
   };
+   ctx->dynamic_fb.info.pNext = ctx->transient_attachments && !ctx->blitting && has_msrtss ? &msrtss : NULL;

-   if (has_msrtss && !ctx->blitting)
-      ctx->dynamic_fb.info.pNext = ctx->transient_attachments ? &msrtss : NULL;
   VKCTX(CmdBeginRendering)(ctx->bs->cmdbuf, &ctx->dynamic_fb.info);
   ctx->in_rp = true;
   return clear_buffers;
--- a/src/gallium/drivers/zink/zink_query.c
+++ b/src/gallium/drivers/zink/zink_query.c
@ -119,7 +119,9 @@ pipeline_statistic_convert(enum pipe_statistics_query_index idx)
      [PIPE_STAT_QUERY_PS_INVOCATIONS] = VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT,
      [PIPE_STAT_QUERY_HS_INVOCATIONS] = VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT,
      [PIPE_STAT_QUERY_DS_INVOCATIONS] = VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT,
-      [PIPE_STAT_QUERY_CS_INVOCATIONS] = VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT
+      [PIPE_STAT_QUERY_CS_INVOCATIONS] = VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT,
+      [PIPE_STAT_QUERY_MS_INVOCATIONS] = VK_QUERY_PIPELINE_STATISTIC_MESH_SHADER_INVOCATIONS_BIT_EXT,
+      [PIPE_STAT_QUERY_TS_INVOCATIONS] = VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT,
   };
   assert(idx < ARRAY_SIZE(map));
   return map[idx];
--- a/src/gallium/drivers/zink/zink_screen.c
+++ b/src/gallium/drivers/zink/zink_screen.c
@ -3133,6 +3133,12 @@ init_driver_workarounds(struct zink_screen *screen)
      screen->info.have_EXT_host_image_copy = false;
 }

+static void
+disable_features(struct zink_screen *screen)
+{
+   screen->info.mesh_feats.primitiveFragmentShadingRateMeshShader = false;
+}
+
 static void
 check_hic_shader_read(struct zink_screen *screen)
 {
@ -3513,6 +3519,7 @@ zink_internal_create_screen(const struct pipe_screen_config *config, int64_t dev
   check_hic_shader_read(screen);

   init_driver_workarounds(screen);
+   disable_features(screen);

   screen->dev = zink_create_logical_device(screen);
   if (!screen->dev)
--- a/src/gallium/winsys/radeon/drm/meson.build
+++ b/src/gallium/winsys/radeon/drm/meson.build
@ -1,6 +1,14 @@
 # Copyright © 2017 Dylan Baker
 # SPDX-License-Identifier: MIT

+libradeonwinsys_deps = [idep_mesautil, dep_libdrm]
+libradeonwinsys_c_args = []
+
+if with_gallium_radeonsi
+  libradeonwinsys_deps += [idep_amdgfxregs_h]
+  libradeonwinsys_c_args = ['-DHAVE_GALLIUM_RADEONSI']
+endif
+
 libradeonwinsys = static_library(
  'radeonwinsys',
  files('radeon_drm_bo.c',
@ -14,5 +22,6 @@ libradeonwinsys = static_library(
        'radeon_surface.h'),
  include_directories : [inc_src, inc_include, inc_gallium, inc_gallium_aux],
  gnu_symbol_visibility : 'hidden',
-  dependencies : [idep_mesautil, dep_libdrm],
+  c_args : libradeonwinsys_c_args,
+  dependencies : libradeonwinsys_deps,
 )
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
@ -8,6 +8,10 @@
 #include "radeon_drm_bo.h"
 #include "radeon_drm_cs.h"

+#ifdef HAVE_GALLIUM_RADEONSI
+#include "amdgfxregs.h"
+#endif
+
 #include "util/os_file.h"
 #include "util/simple_mtx.h"
 #include "util/thread_sched.h"
@ -105,6 +109,73 @@ static bool radeon_get_drm_value(int fd, unsigned request,
   return true;
 }

+static void get_hs_info(struct radeon_info *info)
+{
+   /* This is the size of all TCS outputs in memory per workgroup.
+    * Hawaii can't handle num_workgroups > 256 with 8K per workgroup, so use 4K.
+    */
+   unsigned max_hs_out_vram_dwords_per_wg = info->family == CHIP_HAWAII ? 4096 : 8192;
+   unsigned max_workgroups_per_se;
+
+#ifdef HAVE_GALLIUM_RADEONSI /* for gfx6+ register definitions */
+   unsigned max_hs_out_vram_dwords_enum = 0;
+
+   switch (max_hs_out_vram_dwords_per_wg) {
+   case 8192:
+      max_hs_out_vram_dwords_enum = V_03093C_X_8K_DWORDS;
+      break;
+   case 4096:
+      max_hs_out_vram_dwords_enum = V_03093C_X_4K_DWORDS;
+      break;
+   case 2048:
+      max_hs_out_vram_dwords_enum = V_03093C_X_2K_DWORDS;
+      break;
+   case 1024:
+      max_hs_out_vram_dwords_enum = V_03093C_X_1K_DWORDS;
+      break;
+   default:
+      UNREACHABLE("invalid TCS workgroup size");
+   }
+#endif
+
+   /* Gfx7 should limit num_workgroups to 508 (127 per SE)
+    * Gfx6 should limit num_workgroups to 126 (63 per SE)
+    */
+   if (info->gfx_level == GFX7) {
+      max_workgroups_per_se = 127;
+   } else {
+      max_workgroups_per_se = 63;
+   }
+
+   /* Limit to 4 workgroups per CU for TCS, which exhausts LDS if each workgroup occupies 16KB.
+    * Note that the offchip allocation isn't deallocated until the corresponding TES waves finish.
+    */
+   unsigned num_offchip_wg_per_cu = 4;
+   unsigned num_workgroups_per_se = MIN2(num_offchip_wg_per_cu * info->max_good_cu_per_sa *
+                                         info->max_sa_per_se, max_workgroups_per_se);
+   unsigned num_workgroups = num_workgroups_per_se * info->max_se;
+
+#ifdef HAVE_GALLIUM_RADEONSI /* for gfx6+ register definitions */
+   if (info->gfx_level == GFX7) {
+      info->hs_offchip_param = S_03093C_OFFCHIP_BUFFERING_GFX7(num_workgroups) |
+                               S_03093C_OFFCHIP_GRANULARITY_GFX7(max_hs_out_vram_dwords_enum);
+   } else {
+      info->hs_offchip_param = S_0089B0_OFFCHIP_BUFFERING(num_workgroups) |
+                               S_0089B0_OFFCHIP_GRANULARITY(max_hs_out_vram_dwords_enum);
+   }
+#endif
+
+   /* The typical size of tess factors of 1 TCS workgroup if all patches are triangles. */
+   unsigned typical_tess_factor_size_per_wg = (192 / 3) * 16;
+   unsigned num_tess_factor_wg_per_cu = 3;
+
+   info->hs_offchip_workgroup_dw_size = max_hs_out_vram_dwords_per_wg;
+   info->tess_offchip_ring_size = num_workgroups * max_hs_out_vram_dwords_per_wg * 4;
+   info->tess_factor_ring_size = typical_tess_factor_size_per_wg * num_tess_factor_wg_per_cu *
+                                 info->max_good_cu_per_sa * info->max_sa_per_se * info->max_se;
+   info->total_tess_ring_size = info->tess_offchip_ring_size + info->tess_factor_ring_size;
+}
+
 /* Helper function to do the ioctls needed for setup and init. */
 static bool do_winsys_init(struct radeon_drm_winsys *ws)
 {
@ -639,6 +710,9 @@ static bool do_winsys_init(struct radeon_drm_winsys *ws)
   default:;
   }

+   if (ws->gen == DRV_SI)
+      get_hs_info(&ws->info);
+
   ws->check_vm = strstr(debug_get_option("R600_DEBUG", ""), "check_vm") != NULL ||
                                                                            strstr(debug_get_option("AMD_DEBUG", ""), "check_vm") != NULL;
   ws->noop_cs = debug_get_bool_option("RADEON_NOOP", false);
--- a/src/intel/compiler/brw/brw_compile_tcs.cpp
+++ b/src/intel/compiler/brw/brw_compile_tcs.cpp
@ -196,6 +196,9 @@ brw_compile_tcs(const struct brw_compiler *compiler,

   brw_prog_data_init(&prog_data->base.base, &params->base);

+   brw_fill_tess_info_from_shader_info(&prog_data->tess_info,
+                                       &nir->info);
+
   nir->info.outputs_written = key->outputs_written;
   nir->info.patch_outputs_written = key->patch_outputs_written;

@ -221,6 +224,7 @@ brw_compile_tcs(const struct brw_compiler *compiler,
      BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);

   prog_data->input_vertices = key->input_vertices;
+   prog_data->output_vertices = nir->info.tess.tcs_vertices_out;
   prog_data->patch_count_threshold = get_patch_count_threshold(key->input_vertices);

   if (compiler->use_tcs_multi_patch) {
--- a/src/intel/compiler/brw/brw_compile_tes.cpp
+++ b/src/intel/compiler/brw/brw_compile_tes.cpp
@ -59,6 +59,22 @@ run_tes(brw_shader &s)
   return !s.failed;
 }

+extern "C" void
+brw_fill_tess_info_from_shader_info(struct brw_tess_info *brw_info,
+                                    const shader_info *shader_info)
+{
+   STATIC_ASSERT(INTEL_TESS_PARTITIONING_INTEGER == TESS_SPACING_EQUAL - 1);
+   STATIC_ASSERT(INTEL_TESS_PARTITIONING_ODD_FRACTIONAL ==
+                 TESS_SPACING_FRACTIONAL_ODD - 1);
+   STATIC_ASSERT(INTEL_TESS_PARTITIONING_EVEN_FRACTIONAL ==
+                 TESS_SPACING_FRACTIONAL_EVEN - 1);
+
+   brw_info->primitive_mode = shader_info->tess._primitive_mode;
+   brw_info->spacing = shader_info->tess.spacing;
+   brw_info->ccw = shader_info->tess.ccw;
+   brw_info->point_mode = shader_info->tess.point_mode;
+}
+
 const unsigned *
 brw_compile_tes(const struct brw_compiler *compiler,
                brw_compile_tes_params *params)
@ -121,39 +137,8 @@ brw_compile_tes(const struct brw_compiler *compiler,

   prog_data->base.urb_read_length = 0;

-   STATIC_ASSERT(INTEL_TESS_PARTITIONING_INTEGER == TESS_SPACING_EQUAL - 1);
-   STATIC_ASSERT(INTEL_TESS_PARTITIONING_ODD_FRACTIONAL ==
-                 TESS_SPACING_FRACTIONAL_ODD - 1);
-   STATIC_ASSERT(INTEL_TESS_PARTITIONING_EVEN_FRACTIONAL ==
-                 TESS_SPACING_FRACTIONAL_EVEN - 1);
-
-   prog_data->partitioning =
-      (enum intel_tess_partitioning) (nir->info.tess.spacing - 1);
-
-   switch (nir->info.tess._primitive_mode) {
-   case TESS_PRIMITIVE_QUADS:
-      prog_data->domain = INTEL_TESS_DOMAIN_QUAD;
-      break;
-   case TESS_PRIMITIVE_TRIANGLES:
-      prog_data->domain = INTEL_TESS_DOMAIN_TRI;
-      break;
-   case TESS_PRIMITIVE_ISOLINES:
-      prog_data->domain = INTEL_TESS_DOMAIN_ISOLINE;
-      break;
-   default:
-      UNREACHABLE("invalid domain shader primitive mode");
-   }
-
-   if (nir->info.tess.point_mode) {
-      prog_data->output_topology = INTEL_TESS_OUTPUT_TOPOLOGY_POINT;
-   } else if (nir->info.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES) {
-      prog_data->output_topology = INTEL_TESS_OUTPUT_TOPOLOGY_LINE;
-   } else {
-      /* Hardware winding order is backwards from OpenGL */
-      prog_data->output_topology =
-         nir->info.tess.ccw ? INTEL_TESS_OUTPUT_TOPOLOGY_TRI_CW
-                             : INTEL_TESS_OUTPUT_TOPOLOGY_TRI_CCW;
-   }
+   brw_fill_tess_info_from_shader_info(&prog_data->tess_info,
+                                       &nir->info);

   if (unlikely(debug_enabled)) {
      fprintf(stderr, "TES Input ");
--- a/src/intel/compiler/brw/brw_compiler.h
+++ b/src/intel/compiler/brw/brw_compiler.h
@ -1158,13 +1158,26 @@ struct brw_vs_prog_data {
   uint32_t vf_component_packing[4];
 };

+struct brw_tess_info {
+   enum tess_primitive_mode primitive_mode:8;
+   uint8_t                         spacing:2;
+   bool                                ccw:1;
+   bool                         point_mode:1;
+   uint32_t                            pad:20;
+};
+
 struct brw_tcs_prog_data
 {
   struct brw_vue_prog_data base;

+   struct brw_tess_info tess_info;
+
   /** Number of input vertices, 0 means dynamic */
   unsigned input_vertices;

+   /** Number of output vertices */
+   unsigned output_vertices;
+
   /** Should the non-SINGLE_PATCH payload provide primitive ID? */
   bool include_primitive_id;

@ -1187,14 +1200,12 @@ struct brw_tcs_prog_data
   unsigned tess_config_param;
 };

-
 struct brw_tes_prog_data
 {
   struct brw_vue_prog_data base;

-   enum intel_tess_partitioning partitioning;
-   enum intel_tess_output_topology output_topology;
-   enum intel_tess_domain domain;
+   struct brw_tess_info tess_info;
+
   bool include_primitive_id;

   /**
@ -1367,6 +1378,64 @@ DEFINE_PROG_DATA_DOWNCAST(mesh, prog_data->stage == MESA_SHADER_MESH)

 #undef DEFINE_PROG_DATA_DOWNCAST

+static inline struct brw_tess_info
+brw_merge_tess_info(struct brw_tess_info tcs_info,
+                    struct brw_tess_info tes_info)
+{
+   /* Just merge by OR'ing the raw bits */
+   uint32_t x, y;
+
+   assert(sizeof(x) == sizeof(tcs_info));
+
+   memcpy(&x, &tcs_info, sizeof(x));
+   memcpy(&y, &tes_info, sizeof(y));
+
+   x |= y;
+
+   struct brw_tess_info out;
+   memcpy(&out, &x, sizeof(out));
+   return out;
+}
+
+static inline enum intel_tess_partitioning
+brw_tess_info_partitioning(struct brw_tess_info info)
+{
+   return (enum intel_tess_partitioning)(info.spacing - 1);
+}
+
+static inline enum intel_tess_domain
+brw_tess_info_domain(struct brw_tess_info info)
+{
+   switch (info.primitive_mode) {
+   case TESS_PRIMITIVE_QUADS:
+      return INTEL_TESS_DOMAIN_QUAD;
+      break;
+   case TESS_PRIMITIVE_TRIANGLES:
+      return INTEL_TESS_DOMAIN_TRI;
+      break;
+   case TESS_PRIMITIVE_ISOLINES:
+      return INTEL_TESS_DOMAIN_ISOLINE;
+      break;
+   default:
+      UNREACHABLE("invalid primitive mode");
+   }
+}
+
+static inline enum intel_tess_output_topology
+brw_tess_info_output_topology(struct brw_tess_info info)
+{
+   if (info.point_mode) {
+      return INTEL_TESS_OUTPUT_TOPOLOGY_POINT;
+   } else if (info.primitive_mode == TESS_PRIMITIVE_ISOLINES) {
+      return INTEL_TESS_OUTPUT_TOPOLOGY_LINE;
+   } else {
+      /* Hardware winding order is backwards from OpenGL */
+      return info.ccw ?
+         INTEL_TESS_OUTPUT_TOPOLOGY_TRI_CW :
+         INTEL_TESS_OUTPUT_TOPOLOGY_TRI_CCW;
+   }
+}
+
 /** @} */

 struct brw_compiler *
--- a/src/intel/compiler/brw/brw_nir.c
+++ b/src/intel/compiler/brw/brw_nir.c
@ -2126,6 +2126,7 @@ flag_fused_eu_disable_instr(nir_builder *b, nir_instr *instr, void *data)
      case nir_intrinsic_bindless_image_atomic:
      case nir_intrinsic_bindless_image_atomic_swap: {
         int src_idx = nir_get_io_index_src_number(intrin);
+         assert(src_idx >= 0);
         if (nir_src_is_divergent(&intrin->src[src_idx])) {
            nir_intrinsic_set_access(intrin,
                                     nir_intrinsic_access(intrin) |
@ -2181,7 +2182,11 @@ brw_postprocess_nir_opts(nir_shader *nir, const struct brw_compiler *compiler,
   if (OPT(nir_lower_tex, &tex_options))
      OPT(nir_lower_tex, &tex_options);

-   OPT(brw_nir_lower_mcs_fetch, devinfo);
+   /* MCS lowering can introduce u2u16 conversions. We need to lower those to
+    * make constant offsets detectable by brw_nir_texture_backend_opcode().
+    */
+   if (OPT(brw_nir_lower_mcs_fetch, devinfo))
+      OPT(nir_opt_constant_folding);

   const struct brw_nir_lower_texture_opts brw_tex_options = {
      .combined_lod_and_array_index = compiler->devinfo->ver >= 20,
--- a/src/intel/compiler/brw/brw_nir.h
+++ b/src/intel/compiler/brw/brw_nir.h
@ -36,6 +36,10 @@ extern "C" {

 extern const struct nir_shader_compiler_options brw_scalar_nir_options;

+void
+brw_fill_tess_info_from_shader_info(struct brw_tess_info *brw_info,
+                                    const shader_info *shader_info);
+
 int type_size_vec4(const struct glsl_type *type, bool bindless);
 int type_size_dvec4(const struct glsl_type *type, bool bindless);

--- a/src/intel/compiler/brw/brw_nir_lower_ray_queries.c
+++ b/src/intel/compiler/brw/brw_nir_lower_ray_queries.c
@ -235,6 +235,7 @@ lower_ray_query_intrinsic(nir_builder *b,
      brw_nir_rt_sync_stack_addr(b, state->globals.base_mem_addr,
                                 state->globals.num_dss_rt_stacks);
   nir_def *stack_addr = shadow_stack_addr ? shadow_stack_addr : hw_stack_addr;
+   mesa_shader_stage stage = b->shader->info.stage;

   switch (intrin->intrinsic) {
   case nir_intrinsic_rq_initialize: {
@ -447,11 +448,27 @@ lower_ray_query_intrinsic(nir_builder *b,
         break;

      case nir_ray_query_value_intersection_object_ray_direction:
-         sysval = world_ray_in.dir;
+         if (stage == MESA_SHADER_CLOSEST_HIT) {
+            struct brw_nir_rt_bvh_instance_leaf_defs leaf;
+            brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr,
+                                              state->devinfo);
+            sysval = brw_nir_build_vec3_mat_mult_col_major(
+               b, world_ray_in.dir, leaf.world_to_object, false);
+         } else {
+            sysval = object_ray_in.dir;
+         }
         break;

      case nir_ray_query_value_intersection_object_ray_origin:
-         sysval = world_ray_in.orig;
+         if (stage == MESA_SHADER_CLOSEST_HIT) {
+            struct brw_nir_rt_bvh_instance_leaf_defs leaf;
+            brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr,
+                                              state->devinfo);
+            sysval = brw_nir_build_vec3_mat_mult_col_major(
+               b, world_ray_in.orig, leaf.world_to_object, true);
+         } else {
+            sysval = object_ray_in.orig;
+         }
         break;

      case nir_ray_query_value_intersection_object_to_world: {
--- a/src/intel/compiler/brw/brw_nir_lower_rt_intrinsics.c
+++ b/src/intel/compiler/brw/brw_nir_lower_rt_intrinsics.c
@ -24,24 +24,6 @@
 #include "brw_nir_rt.h"
 #include "brw_nir_rt_builder.h"

-static nir_def *
-nir_build_vec3_mat_mult_col_major(nir_builder *b, nir_def *vec,
-                                  nir_def *matrix[], bool translation)
-{
-   nir_def *result_components[3] = {
-      nir_channel(b, matrix[3], 0),
-      nir_channel(b, matrix[3], 1),
-      nir_channel(b, matrix[3], 2),
-   };
-   for (unsigned i = 0; i < 3; ++i) {
-      for (unsigned j = 0; j < 3; ++j) {
-         nir_def *v = nir_fmul(b, nir_channels(b, vec, 1 << j), nir_channels(b, matrix[j], 1 << i));
-         result_components[i] = (translation || j) ? nir_fadd(b, result_components[i], v) : v;
-      }
-   }
-   return nir_vec(b, result_components, 3);
-}
-
 static nir_def *
 build_leaf_is_procedural(nir_builder *b, struct brw_nir_rt_mem_hit_defs *hit)
 {
@ -193,7 +175,7 @@ lower_rt_intrinsics_impl(nir_function_impl *impl,
               brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr,
                                                 devinfo);

-               sysval = nir_build_vec3_mat_mult_col_major(
+               sysval = brw_nir_build_vec3_mat_mult_col_major(
                  b, world_ray_in.orig, leaf.world_to_object, true);
            } else {
               sysval = object_ray_in.orig;
@ -206,7 +188,7 @@ lower_rt_intrinsics_impl(nir_function_impl *impl,
               brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr,
                                                 devinfo);

-               sysval = nir_build_vec3_mat_mult_col_major(
+               sysval = brw_nir_build_vec3_mat_mult_col_major(
                  b, world_ray_in.dir, leaf.world_to_object, false);
            } else {
               sysval = object_ray_in.dir;
--- a/src/intel/compiler/brw/brw_nir_rt.h
+++ b/src/intel/compiler/brw/brw_nir_rt.h
@ -84,6 +84,24 @@ nir_shader *
 brw_nir_create_null_ahs_shader(const struct brw_compiler *compiler,
                               void *mem_ctx);

+static inline nir_def *
+brw_nir_build_vec3_mat_mult_col_major(nir_builder *b, nir_def *vec,
+                                      nir_def *matrix[], bool translation)
+{
+   nir_def *result_components[3] = {
+      nir_channel(b, matrix[3], 0),
+      nir_channel(b, matrix[3], 1),
+      nir_channel(b, matrix[3], 2),
+   };
+   for (unsigned i = 0; i < 3; ++i) {
+      for (unsigned j = 0; j < 3; ++j) {
+         nir_def *v = nir_fmul(b, nir_channels(b, vec, 1 << j), nir_channels(b, matrix[j], 1 << i));
+         result_components[i] = (translation || j) ? nir_fadd(b, result_components[i], v) : v;
+      }
+   }
+   return nir_vec(b, result_components, 3);
+}
+
 #ifdef __cplusplus
 }
 #endif
--- a/src/intel/compiler/brw/brw_schedule_instructions.cpp
+++ b/src/intel/compiler/brw/brw_schedule_instructions.cpp
@ -1128,8 +1128,26 @@ has_cross_lane_access(const brw_inst *inst)

   for (unsigned s = 0; s < inst->sources; s++) {
      if (inst->src[s].file == VGRF) {
-         if (inst->src[s].stride == 0)
+         /* The instruction reads a particular lane (only relevant with non
+          * scalar values, otherwise this is just the way we read uniform
+          * values produced in reduced SIMD size).
+          */
+         if (!inst->src[s].is_scalar && inst->src[s].stride == 0)
            return true;
+      } else if (inst->src[s].file == ARF &&
+                 inst->src[s].nr >= BRW_ARF_FLAG &&
+                 inst->src[s].nr < BRW_ARF_MASK) {
+         /* The instruction reads the flag register which represents states
+          * from all the lanes.
+          *
+          * Note that although this prevents moving instructions reading the
+          * flag registers past a HALT kind of instruction, this doesn't
+          * prevent the instructions that generated the flag value from moving
+          * on either side of the HALT instruction. So it's possible for
+          * ballot instructions to produce incorrect values when used in a
+          * shader with HALT.
+          */
+         return true;
      }
   }

--- a/src/intel/vulkan/anv_descriptor_set.c
+++ b/src/intel/vulkan/anv_descriptor_set.c
@ -1387,6 +1387,10 @@ VkResult anv_CreateDescriptorPool(
   return VK_SUCCESS;
 }

+static void anv_descriptor_set_destroy(struct anv_device *device,
+                                       struct anv_descriptor_pool *pool,
+                                       struct anv_descriptor_set *set);
+
 void anv_DestroyDescriptorPool(
    VkDevice                                    _device,
    VkDescriptorPool                            _pool,
@ -1400,9 +1404,16 @@ void anv_DestroyDescriptorPool(

   ANV_RMV(resource_destroy, device, pool);

+   /* From the Vulkan spec, vkDestroyDescriptorPool:
+    *
+    *    "When a pool is destroyed, all descriptor sets allocated from the
+    *     pool are implicitly freed and become invalid. Descriptor sets
+    *     allocated from a given pool do not need to be freed before destroying
+    *     that descriptor pool."
+    */
   list_for_each_entry_safe(struct anv_descriptor_set, set,
                            &pool->desc_sets, pool_link) {
-      vk_descriptor_set_layout_unref(&device->vk, &set->layout->vk);
+      anv_descriptor_set_destroy(device, pool, set);
   }

   util_vma_heap_finish(&pool->host_heap);
--- a/src/intel/vulkan/anv_image_host_copy.c
+++ b/src/intel/vulkan/anv_image_host_copy.c
@ -480,6 +480,9 @@ anv_CopyImageToMemory(
   return VK_SUCCESS;
 }

+/* This functions copies from one image to another through an intermediate
+ * linear buffer.
+ */
 static void
 copy_image_to_image(struct anv_device *device,
                    struct anv_image *src_image,
@ -505,14 +508,11 @@ copy_image_to_image(struct anv_device *device,
   isl_surf_get_tile_info(src_surf, &src_tile);
   isl_surf_get_tile_info(dst_surf, &dst_tile);

-   uint32_t tile_width_B;
   uint32_t tile_width_el, tile_height_el;
   if (src_tile.phys_extent_B.w > dst_tile.phys_extent_B.w) {
-      tile_width_B   = src_tile.phys_extent_B.w;
      tile_width_el  = src_tile.logical_extent_el.w;
      tile_height_el = src_tile.logical_extent_el.h;
   } else {
-      tile_width_B   = dst_tile.phys_extent_B.w;
      tile_width_el  = dst_tile.logical_extent_el.w;
      tile_height_el = dst_tile.logical_extent_el.h;
   }
@ -527,14 +527,18 @@ copy_image_to_image(struct anv_device *device,
   VkExtent3D extent_el =
      vk_extent3d_to_el(src_surf->format, region->extent);

+   uint32_t linear_stride_B;
   /* linear-to-linear case */
   if (tile_width_el == 1 && tile_height_el == 1) {
      tile_width_el = MIN2(4096 / (src_tile.format_bpb / 8),
                           extent_el.width);
      tile_height_el = 4096 / (tile_width_el * (src_tile.format_bpb / 8));
-      tile_width_B = tile_width_el * src_tile.format_bpb / 8;
+      linear_stride_B = tile_width_el * src_tile.format_bpb / 8;
+   } else {
+      linear_stride_B = src_tile.logical_extent_el.w * src_tile.format_bpb / 8;
   }

+
   uint32_t layer_count =
      vk_image_subresource_layer_count(&src_image->vk, &region->srcSubresource);
   for (uint32_t a = 0; a < layer_count; a++) {
@ -559,7 +563,7 @@ copy_image_to_image(struct anv_device *device,
                                     src_binding,
                                     src_anv_surf->memory_range.offset,
                                     tmp_map,
-                                     tile_width_B,
+                                     linear_stride_B,
                                     &src_offset, &extent,
                                     region->srcSubresource.mipLevel,
                                     region->srcSubresource.baseArrayLayer,
@ -570,7 +574,7 @@ copy_image_to_image(struct anv_device *device,
                                     dst_binding,
                                     dst_anv_surf->memory_range.offset,
                                     tmp_map,
-                                     tile_width_B,
+                                     linear_stride_B,
                                     &dst_offset, &extent,
                                     region->dstSubresource.mipLevel,
                                     region->dstSubresource.baseArrayLayer,
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@ -2161,8 +2161,15 @@ struct anv_gfx_dynamic_state {
      uint32_t SampleMask;
   } sm;

+   /* 3DSTATE_DS */
+   struct {
+      bool ComputeWCoordinateEnable;
+   } ds;
+
   /* 3DSTATE_TE */
   struct {
+      uint32_t TEDomain;
+      uint32_t Partitioning;
      uint32_t OutputTopology;
      uint32_t TessellationDistributionMode;
   } te;
@ -2251,7 +2258,7 @@ struct anv_gfx_dynamic_state {
      uint32_t BackfaceStencilPassDepthPassOp;
      uint32_t BackfaceStencilPassDepthFailOp;
      uint32_t BackfaceStencilTestFunction;
-   } ds;
+   } wm_ds;

   /* 3DSTATE_TBIMR_TILE_PASS_INFO */
   struct {
--- a/src/intel/vulkan/anv_shader_compile.c
+++ b/src/intel/vulkan/anv_shader_compile.c
@ -1054,6 +1054,7 @@ anv_shader_compile_bs(struct anv_device *device,
         .should_remat_callback = should_remat_cb,
      };

+      NIR_PASS(_, nir, brw_nir_lower_rt_intrinsics_pre_trace);
      NIR_PASS(_, nir, nir_lower_shader_calls, &opts,
               &resume_shaders, &num_resume_shaders, mem_ctx);
      NIR_PASS(_, nir, brw_nir_lower_shader_calls, &lowering_state);
--- a/src/intel/vulkan/genX_gfx_state.c
+++ b/src/intel/vulkan/genX_gfx_state.c
@ -490,7 +490,12 @@ anv_raster_polygon_mode(const struct anv_cmd_graphics_state *gfx,
      }
      UNREACHABLE("Unsupported GS output topology");
   } else if (gfx->shaders[MESA_SHADER_TESS_EVAL] != NULL) {
-      switch (get_gfx_tes_prog_data(gfx)->output_topology) {
+      struct brw_tess_info tess_info =
+         brw_merge_tess_info(
+            get_gfx_tcs_prog_data(gfx)->tess_info,
+            get_gfx_tes_prog_data(gfx)->tess_info);
+
+      switch (brw_tess_info_output_topology(tess_info)) {
      case INTEL_TESS_OUTPUT_TOPOLOGY_POINT:
         return VK_POLYGON_MODE_POINT;

@ -500,8 +505,10 @@ anv_raster_polygon_mode(const struct anv_cmd_graphics_state *gfx,
      case INTEL_TESS_OUTPUT_TOPOLOGY_TRI_CW:
      case INTEL_TESS_OUTPUT_TOPOLOGY_TRI_CCW:
         return polygon_mode;
+
+      default:
+         UNREACHABLE("Unsupported TCS output topology");
      }
-      UNREACHABLE("Unsupported TCS output topology");
   } else {
      switch (primitive_topology) {
      case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
@ -1317,6 +1324,22 @@ update_cps(struct anv_gfx_dynamic_state *hw_state,
 }
 #endif

+ALWAYS_INLINE static void
+update_ds(struct anv_gfx_dynamic_state *hw_state,
+          const struct anv_cmd_graphics_state *gfx)
+{
+   const struct brw_tes_prog_data *tes_prog_data = get_gfx_tes_prog_data(gfx);
+
+   if (tes_prog_data) {
+      struct brw_tess_info tess_info =
+         brw_merge_tess_info(get_gfx_tcs_prog_data(gfx)->tess_info,
+                             tes_prog_data->tess_info);
+
+      SET(DS, ds.ComputeWCoordinateEnable,
+              brw_tess_info_domain(tess_info) == INTEL_TESS_DOMAIN_TRI);
+   }
+}
+
 ALWAYS_INLINE static void
 update_te(struct anv_gfx_dynamic_state *hw_state,
          const struct anv_device *device,
@ -1326,16 +1349,28 @@ update_te(struct anv_gfx_dynamic_state *hw_state,
   const struct brw_tes_prog_data *tes_prog_data = get_gfx_tes_prog_data(gfx);

   if (tes_prog_data) {
+      struct brw_tess_info tess_info =
+         brw_merge_tess_info(get_gfx_tcs_prog_data(gfx)->tess_info,
+                             tes_prog_data->tess_info);
+
+      SET(TE, te.TEDomain, brw_tess_info_domain(tess_info));
+      SET(TE, te.Partitioning, brw_tess_info_partitioning(tess_info));
      if (dyn->ts.domain_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT) {
-         SET(TE, te.OutputTopology, tes_prog_data->output_topology);
+         SET(TE, te.OutputTopology, brw_tess_info_output_topology(tess_info));
      } else {
-            /* When the origin is upper-left, we have to flip the winding order */
-         if (tes_prog_data->output_topology == OUTPUT_TRI_CCW) {
+         /* When the origin is upper-left, we have to flip the winding order */
+         enum intel_tess_output_topology output_topology =
+            brw_tess_info_output_topology(tess_info);
+         switch (output_topology) {
+         case  OUTPUT_TRI_CCW:
            SET(TE, te.OutputTopology, OUTPUT_TRI_CW);
-         } else if (tes_prog_data->output_topology == OUTPUT_TRI_CW) {
+            break;
+         case OUTPUT_TRI_CW:
            SET(TE, te.OutputTopology, OUTPUT_TRI_CCW);
-         } else {
-            SET(TE, te.OutputTopology, tes_prog_data->output_topology);
+            break;
+         default:
+            SET(TE, te.OutputTopology, output_topology);
+            break;
         }
      }

@ -1595,48 +1630,48 @@ update_wm_depth_stencil(struct anv_gfx_dynamic_state *hw_state,
   struct vk_depth_stencil_state opt_ds = dyn->ds;
   vk_optimize_depth_stencil_state(&opt_ds, ds_aspects, true);

-   SET(WM_DEPTH_STENCIL, ds.DoubleSidedStencilEnable, true);
+   SET(WM_DEPTH_STENCIL, wm_ds.DoubleSidedStencilEnable, true);

-   SET(WM_DEPTH_STENCIL, ds.StencilTestMask,
+   SET(WM_DEPTH_STENCIL, wm_ds.StencilTestMask,
       opt_ds.stencil.front.compare_mask & 0xff);
-   SET(WM_DEPTH_STENCIL, ds.StencilWriteMask,
+   SET(WM_DEPTH_STENCIL, wm_ds.StencilWriteMask,
       opt_ds.stencil.front.write_mask & 0xff);

-   SET(WM_DEPTH_STENCIL, ds.BackfaceStencilTestMask, opt_ds.stencil.back.compare_mask & 0xff);
-   SET(WM_DEPTH_STENCIL, ds.BackfaceStencilWriteMask, opt_ds.stencil.back.write_mask & 0xff);
+   SET(WM_DEPTH_STENCIL, wm_ds.BackfaceStencilTestMask, opt_ds.stencil.back.compare_mask & 0xff);
+   SET(WM_DEPTH_STENCIL, wm_ds.BackfaceStencilWriteMask, opt_ds.stencil.back.write_mask & 0xff);

-   SET(WM_DEPTH_STENCIL, ds.StencilReferenceValue,
+   SET(WM_DEPTH_STENCIL, wm_ds.StencilReferenceValue,
       opt_ds.stencil.front.reference & 0xff);
-   SET(WM_DEPTH_STENCIL, ds.BackfaceStencilReferenceValue,
+   SET(WM_DEPTH_STENCIL, wm_ds.BackfaceStencilReferenceValue,
       opt_ds.stencil.back.reference & 0xff);

-   SET(WM_DEPTH_STENCIL, ds.DepthTestEnable, opt_ds.depth.test_enable);
-   SET(WM_DEPTH_STENCIL, ds.DepthBufferWriteEnable, opt_ds.depth.write_enable);
-   SET(WM_DEPTH_STENCIL, ds.DepthTestFunction,
+   SET(WM_DEPTH_STENCIL, wm_ds.DepthTestEnable, opt_ds.depth.test_enable);
+   SET(WM_DEPTH_STENCIL, wm_ds.DepthBufferWriteEnable, opt_ds.depth.write_enable);
+   SET(WM_DEPTH_STENCIL, wm_ds.DepthTestFunction,
                         vk_to_intel_compare_op[opt_ds.depth.compare_op]);
-   SET(WM_DEPTH_STENCIL, ds.StencilTestEnable, opt_ds.stencil.test_enable);
-   SET(WM_DEPTH_STENCIL, ds.StencilBufferWriteEnable,
+   SET(WM_DEPTH_STENCIL, wm_ds.StencilTestEnable, opt_ds.stencil.test_enable);
+   SET(WM_DEPTH_STENCIL, wm_ds.StencilBufferWriteEnable,
                         opt_ds.stencil.write_enable);
-   SET(WM_DEPTH_STENCIL, ds.StencilFailOp,
+   SET(WM_DEPTH_STENCIL, wm_ds.StencilFailOp,
                         vk_to_intel_stencil_op[opt_ds.stencil.front.op.fail]);
-   SET(WM_DEPTH_STENCIL, ds.StencilPassDepthPassOp,
+   SET(WM_DEPTH_STENCIL, wm_ds.StencilPassDepthPassOp,
                         vk_to_intel_stencil_op[opt_ds.stencil.front.op.pass]);
-   SET(WM_DEPTH_STENCIL, ds.StencilPassDepthFailOp,
+   SET(WM_DEPTH_STENCIL, wm_ds.StencilPassDepthFailOp,
                         vk_to_intel_stencil_op[
                            opt_ds.stencil.front.op.depth_fail]);
-   SET(WM_DEPTH_STENCIL, ds.StencilTestFunction,
+   SET(WM_DEPTH_STENCIL, wm_ds.StencilTestFunction,
                         vk_to_intel_compare_op[
                            opt_ds.stencil.front.op.compare]);
-   SET(WM_DEPTH_STENCIL, ds.BackfaceStencilFailOp,
+   SET(WM_DEPTH_STENCIL, wm_ds.BackfaceStencilFailOp,
                         vk_to_intel_stencil_op[
                            opt_ds.stencil.back.op.fail]);
-   SET(WM_DEPTH_STENCIL, ds.BackfaceStencilPassDepthPassOp,
+   SET(WM_DEPTH_STENCIL, wm_ds.BackfaceStencilPassDepthPassOp,
                         vk_to_intel_stencil_op[
                            opt_ds.stencil.back.op.pass]);
-   SET(WM_DEPTH_STENCIL, ds.BackfaceStencilPassDepthFailOp,
+   SET(WM_DEPTH_STENCIL, wm_ds.BackfaceStencilPassDepthFailOp,
                         vk_to_intel_stencil_op[
                            opt_ds.stencil.back.op.depth_fail]);
-   SET(WM_DEPTH_STENCIL, ds.BackfaceStencilTestFunction,
+   SET(WM_DEPTH_STENCIL, wm_ds.BackfaceStencilTestFunction,
                         vk_to_intel_compare_op[
                            opt_ds.stencil.back.op.compare]);

@ -2335,11 +2370,14 @@ cmd_buffer_flush_gfx_runtime_state(struct anv_gfx_dynamic_state *hw_state,
      update_cps(hw_state, device, dyn);
 #endif /* GFX_VER >= 11 */

+   if (gfx->dirty & (ANV_CMD_DIRTY_HS | ANV_CMD_DIRTY_DS))
+      update_ds(hw_state, gfx);
+
   if (
 #if GFX_VERx10 >= 125
      (gfx->dirty & ANV_CMD_DIRTY_PRERASTER_SHADERS) ||
 #else
-      (gfx->dirty & ANV_CMD_DIRTY_DS) ||
+      (gfx->dirty & (ANV_CMD_DIRTY_HS | ANV_CMD_DIRTY_DS)) ||
 #endif
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_TS_DOMAIN_ORIGIN))
      update_te(hw_state, device, dyn, gfx);
@ -2506,10 +2544,14 @@ cmd_buffer_flush_gfx_runtime_state(struct anv_gfx_dynamic_state *hw_state,
       ((gfx->dirty & (ANV_CMD_DIRTY_HS | ANV_CMD_DIRTY_DS)) ||
        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS))) {
      assert(tcs_prog_data != NULL && tes_prog_data != NULL);
+      struct brw_tess_info tess_info =
+         brw_merge_tess_info(tcs_prog_data->tess_info,
+                             tes_prog_data->tess_info);
+
      SET(TESS_CONFIG, tess_config,
          intel_tess_config(dyn->ts.patch_control_points,
-                            tcs_prog_data->instances,
-                            tes_prog_data->domain,
+                            tcs_prog_data->output_vertices,
+                            brw_tess_info_domain(tess_info),
                            tcs_prog_data->base.vue_map.num_per_patch_slots,
                            tcs_prog_data->base.vue_map.num_per_vertex_slots,
                            tcs_prog_data->base.vue_map.builtins_slot_offset));
@ -2975,6 +3017,8 @@ cmd_buffer_repack_gfx_state(struct anv_gfx_dynamic_state *hw_state,
      if (anv_gfx_has_stage(gfx, MESA_SHADER_TESS_EVAL)) {
         anv_gfx_pack_merge(te, GENX(3DSTATE_TE),
                            MESA_SHADER_TESS_EVAL, ds.te, te) {
+            SET(te, te, TEDomain);
+            SET(te, te, Partitioning);
            SET(te, te, OutputTopology);
 #if GFX_VERx10 >= 125
            SET(te, te, TessellationDistributionMode);
@ -2986,27 +3030,27 @@ cmd_buffer_repack_gfx_state(struct anv_gfx_dynamic_state *hw_state,
   }

   if (IS_DIRTY(WM_DEPTH_STENCIL)) {
-      anv_gfx_pack(wm_ds, GENX(3DSTATE_WM_DEPTH_STENCIL), ds) {
-         SET(ds, ds, DoubleSidedStencilEnable);
-         SET(ds, ds, StencilTestMask);
-         SET(ds, ds, StencilWriteMask);
-         SET(ds, ds, BackfaceStencilTestMask);
-         SET(ds, ds, BackfaceStencilWriteMask);
-         SET(ds, ds, StencilReferenceValue);
-         SET(ds, ds, BackfaceStencilReferenceValue);
-         SET(ds, ds, DepthTestEnable);
-         SET(ds, ds, DepthBufferWriteEnable);
-         SET(ds, ds, DepthTestFunction);
-         SET(ds, ds, StencilTestEnable);
-         SET(ds, ds, StencilBufferWriteEnable);
-         SET(ds, ds, StencilFailOp);
-         SET(ds, ds, StencilPassDepthPassOp);
-         SET(ds, ds, StencilPassDepthFailOp);
-         SET(ds, ds, StencilTestFunction);
-         SET(ds, ds, BackfaceStencilFailOp);
-         SET(ds, ds, BackfaceStencilPassDepthPassOp);
-         SET(ds, ds, BackfaceStencilPassDepthFailOp);
-         SET(ds, ds, BackfaceStencilTestFunction);
+      anv_gfx_pack(wm_ds, GENX(3DSTATE_WM_DEPTH_STENCIL), wm_ds) {
+         SET(wm_ds, wm_ds, DoubleSidedStencilEnable);
+         SET(wm_ds, wm_ds, StencilTestMask);
+         SET(wm_ds, wm_ds, StencilWriteMask);
+         SET(wm_ds, wm_ds, BackfaceStencilTestMask);
+         SET(wm_ds, wm_ds, BackfaceStencilWriteMask);
+         SET(wm_ds, wm_ds, StencilReferenceValue);
+         SET(wm_ds, wm_ds, BackfaceStencilReferenceValue);
+         SET(wm_ds, wm_ds, DepthTestEnable);
+         SET(wm_ds, wm_ds, DepthBufferWriteEnable);
+         SET(wm_ds, wm_ds, DepthTestFunction);
+         SET(wm_ds, wm_ds, StencilTestEnable);
+         SET(wm_ds, wm_ds, StencilBufferWriteEnable);
+         SET(wm_ds, wm_ds, StencilFailOp);
+         SET(wm_ds, wm_ds, StencilPassDepthPassOp);
+         SET(wm_ds, wm_ds, StencilPassDepthFailOp);
+         SET(wm_ds, wm_ds, StencilTestFunction);
+         SET(wm_ds, wm_ds, BackfaceStencilFailOp);
+         SET(wm_ds, wm_ds, BackfaceStencilPassDepthPassOp);
+         SET(wm_ds, wm_ds, BackfaceStencilPassDepthFailOp);
+         SET(wm_ds, wm_ds, BackfaceStencilTestFunction);
      }
   }

@ -3230,8 +3274,12 @@ cmd_buffer_repack_gfx_state(struct anv_gfx_dynamic_state *hw_state,
   if (IS_DIRTY(HS))
      anv_gfx_copy_protected(hs, GENX(3DSTATE_HS), MESA_SHADER_TESS_CTRL, hs.hs);

-   if (IS_DIRTY(DS))
-      anv_gfx_copy_protected(ds, GENX(3DSTATE_DS), MESA_SHADER_TESS_EVAL, ds.ds);
+   if (IS_DIRTY(DS)) {
+      anv_gfx_pack_merge_protected(ds, GENX(3DSTATE_DS),
+                                   MESA_SHADER_TESS_EVAL, ds.ds, ds) {
+         SET(ds, ds, ComputeWCoordinateEnable);
+      }
+   }

   if (IS_DIRTY(GS)) {
      anv_gfx_pack_merge_protected(gs, GENX(3DSTATE_GS),
--- a/src/intel/vulkan/genX_shader.c
+++ b/src/intel/vulkan/genX_shader.c
@ -695,8 +695,6 @@ emit_ds_shader(struct anv_batch *batch,

   anv_shader_emit(batch, shader, ds.te, GENX(3DSTATE_TE), te) {
      te.TEEnable = true;
-      te.Partitioning = tes_prog_data->partitioning;
-      te.TEDomain = tes_prog_data->domain;
      te.MaximumTessellationFactorOdd = 63.0;
      te.MaximumTessellationFactorNotOdd = 64.0;
 #if GFX_VERx10 >= 125
@ -731,9 +729,6 @@ emit_ds_shader(struct anv_batch *batch,
      ds.BindingTableEntryCount = shader->bind_map.surface_count;
      ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;

-      ds.ComputeWCoordinateEnable =
-         tes_prog_data->domain == INTEL_TESS_DOMAIN_TRI;
-
      ds.PatchURBEntryReadLength = tes_prog_data->base.urb_read_length;
      ds.PatchURBEntryReadOffset = 0;
      ds.DispatchGRFStartRegisterForURBData =
--- a/src/nouveau/compiler/nak/sm80_instr_latencies.rs
+++ b/src/nouveau/compiler/nak/sm80_instr_latencies.rs
@ -588,8 +588,9 @@ impl RegLatencySM80 {
                | FP16 | FP16_Alu | FP16_F32 => 1,
                HFMA2_MMA | RedirectedFP64 => pred(has_pred, 3, 3),
                Clmad => pred(has_pred, 5, 3),
-                IMMA_88 | MMA_1x_collect => pred(has_pred, 8, 1),
-                MMA_2x_collect => pred(has_pred, 12, 1),
+                IMMA_88 => pred(has_pred, 8, 1),
+                MMA_1x_collect => pred(has_pred, 11, 1),
+                MMA_2x_collect => pred(has_pred, 19, 1),
                DMMA => pred(has_pred, 20, 1),
                Cbu => 1,
                Decoupled => 1,
@ -603,8 +604,9 @@ impl RegLatencySM80 {
                | IMADWideWriteDH | FP16 | FP16_Alu | FP16_F32 => 1,
                HFMA2_MMA | RedirectedFP64 => pred(has_pred, 3, 1),
                Clmad => pred(has_pred, 5, 1),
-                IMMA_88 | MMA_1x_collect => 8,
-                MMA_2x_collect => 12,
+                IMMA_88 => 8,
+                MMA_1x_collect => 11,
+                MMA_2x_collect => 19,
                DMMA => 20,
                Cbu => 1,
                Decoupled => 1,
@ -620,8 +622,9 @@ impl RegLatencySM80 {
                IMADWideWriteDH => pred(has_pred, 1, 1),
                HFMA2_MMA | RedirectedFP64 => pred(has_pred, 3, 3),
                Clmad => pred(has_pred, 5, 3),
-                IMMA_88 | MMA_1x_collect => pred(has_pred, 8, 1),
-                MMA_2x_collect => pred(has_pred, 12, 1),
+                IMMA_88 => pred(has_pred, 8, 1),
+                MMA_1x_collect => pred(has_pred, 11, 1),
+                MMA_2x_collect => pred(has_pred, 19, 1),
                DMMA => pred(has_pred, 20, 1),
                Cbu => 1,
                Decoupled => 1,
@ -639,8 +642,9 @@ impl RegLatencySM80 {
                FP16 | FP16_Alu | FP16_F32 => pred(has_pred, 1, 2),
                HFMA2_MMA | RedirectedFP64 => pred(has_pred, 5, 3),
                Clmad => pred(has_pred, 5, 5),
-                IMMA_88 | MMA_1x_collect => pred(has_pred, 8, 3),
-                MMA_2x_collect => pred(has_pred, 12, 3),
+                IMMA_88 => pred(has_pred, 8, 3),
+                MMA_1x_collect => pred(has_pred, 11, 3),
+                MMA_2x_collect => pred(has_pred, 19, 3),
                DMMA => pred(has_pred, 20, 3),
                Cbu => 1,
                Decoupled => 1,
@ -657,8 +661,9 @@ impl RegLatencySM80 {
                | FP16_F32 => 1,
                HFMA2_MMA | RedirectedFP64 => pred(has_pred, 5, 1),
                Clmad => pred(has_pred, 5, 3),
-                IMMA_88 | MMA_1x_collect => pred(has_pred, 8, 1),
-                MMA_2x_collect => pred(has_pred, 12, 1),
+                IMMA_88 => pred(has_pred, 8, 1),
+                MMA_1x_collect => pred(has_pred, 11, 1),
+                MMA_2x_collect => pred(has_pred, 19, 1),
                DMMA => pred(has_pred, 20, 1),
                Cbu => 1,
                Decoupled => 1,
@ -675,8 +680,9 @@ impl RegLatencySM80 {
                | FP16_F32 => 1,
                HFMA2_MMA | RedirectedFP64 => pred(has_pred, 3, 3),
                Clmad => pred(has_pred, 5, 3),
-                IMMA_88 | MMA_1x_collect => pred(has_pred, 8, 1),
-                MMA_2x_collect => pred(has_pred, 12, 1),
+                IMMA_88 => pred(has_pred, 8, 1),
+                MMA_1x_collect => pred(has_pred, 11, 1),
+                MMA_2x_collect => pred(has_pred, 19, 1),
                DMMA => pred(has_pred, 20, 1),
                Cbu => 1,
                Decoupled => 1,
@ -690,8 +696,9 @@ impl RegLatencySM80 {
                | IMADWideWriteDH | FP16 | FP16_Alu | FP16_F32 => 1,
                HFMA2_MMA | RedirectedFP64 => pred(has_pred, 3, 2),
                Clmad => pred(has_pred, 5, 2),
-                IMMA_88 | MMA_1x_collect => 8,
-                MMA_2x_collect => 12,
+                IMMA_88 => 8,
+                MMA_1x_collect => 11,
+                MMA_2x_collect => 19,
                DMMA => 20,
                Cbu => 1,
                Decoupled => 1,
@ -706,8 +713,9 @@ impl RegLatencySM80 {
                HFMA2_MMA => 2,
                RedirectedFP64 => 3,
                Clmad => pred(has_pred, 5, 1),
-                IMMA_88 | MMA_1x_collect => 8,
-                MMA_2x_collect => 12,
+                IMMA_88 => 8,
+                MMA_1x_collect => 11,
+                MMA_2x_collect => 19,
                DMMA => 20,
                Cbu => 1,
                Decoupled => 1,
@ -722,8 +730,9 @@ impl RegLatencySM80 {
                HFMA2_MMA => 2,
                RedirectedFP64 => 2,
                Clmad => pred(has_pred, 4, 2),
-                IMMA_88 | MMA_1x_collect => 7,
-                MMA_2x_collect => 11,
+                IMMA_88 => 7,
+                MMA_1x_collect => 10,
+                MMA_2x_collect => 18,
                DMMA => 19,
                Cbu => 1,
                Decoupled => 1,
@ -736,8 +745,9 @@ impl RegLatencySM80 {
                CoupledAlu | CoupledDisp64 | CoupledFMA | IMADWideWriteDL
                | IMADWideWriteDH | FP16 | FP16_Alu | FP16_F32 | HFMA2_MMA
                | RedirectedFP64 | Clmad => 2,
-                IMMA_88 | MMA_1x_collect => 7,
-                MMA_2x_collect => 11,
+                IMMA_88 => 7,
+                MMA_1x_collect => 10,
+                MMA_2x_collect => 18,
                DMMA => 19,
                Cbu => 1,
                Decoupled => 1,
@ -750,8 +760,9 @@ impl RegLatencySM80 {
                CoupledAlu | CoupledDisp64 | CoupledFMA | IMADWideWriteDL
                | IMADWideWriteDH | FP16 | FP16_Alu | FP16_F32 | HFMA2_MMA
                | RedirectedFP64 | Clmad => 2,
-                IMMA_88 | MMA_1x_collect => 4,
-                MMA_2x_collect => 8,
+                IMMA_88 => 4,
+                MMA_1x_collect => 8,
+                MMA_2x_collect => 16,
                DMMA => 17,
                Cbu => 1,
                Decoupled => 1,
@ -764,8 +775,9 @@ impl RegLatencySM80 {
                CoupledAlu | CoupledDisp64 | CoupledFMA | IMADWideWriteDL
                | IMADWideWriteDH | FP16 | FP16_Alu | FP16_F32 | HFMA2_MMA
                | RedirectedFP64 | Clmad => 2,
-                IMMA_88 | MMA_1x_collect => 4,
-                MMA_2x_collect => 8,
+                IMMA_88 => 4,
+                MMA_1x_collect => 8,
+                MMA_2x_collect => 16,
                DMMA => 16,
                Cbu => 1,
                Decoupled => 1,
@ -781,8 +793,9 @@ impl RegLatencySM80 {
                }
                HFMA2_MMA | RedirectedFP64 => pred(has_pred, 1, 9),
                Clmad => pred(has_pred, 1, 11),
-                IMMA_88 | MMA_1x_collect => pred(has_pred, 7, 6),
-                MMA_2x_collect => pred(has_pred, 11, 6),
+                IMMA_88 => pred(has_pred, 7, 6),
+                MMA_1x_collect => pred(has_pred, 10, 5),
+                MMA_2x_collect => pred(has_pred, 18, 5),
                DMMA => pred(has_pred, 19, 6),
                Cbu => 1,
                Decoupled => 1,
@ -801,15 +814,25 @@ impl RegLatencySM80 {
        use RegLatencySM80::*;
        match writer {
            CoupledAlu | CoupledDisp64 | CoupledFMA | IMADWideWriteDL
-            | IMADWideWriteDH | FP16 | FP16_Alu | FP16_F32 | HFMA2_MMA
-            | RedirectedFP64 => match reader {
-                MMA_2x_collect => 7,
-                _ => 1,
-            },
-            Clmad | IMMA_88 | MMA_1x_collect | MMA_2x_collect | DMMA | Cbu
+            | IMADWideWriteDH | FP16 | FP16_Alu | FP16_F32 | HFMA2_MMA => {
+                match reader {
+                    MMA_2x_collect => 7,
+                    _ => 1,
+                }
+            }
+            RedirectedFP64 => 1,
+            Clmad | IMMA_88 | MMA_1x_collect | MMA_2x_collect | DMMA
            | Decoupled | DecoupledAgu => match reader {
                CoupledAlu | CoupledDisp64 | CoupledFMA | IMADWideReadAB
-                | IMADWideReadCL | IMADWideReadCH => 2,
+                | IMADWideReadCL | IMADWideReadCH | FP16 | FP16_Alu
+                | FP16_F32 | HFMA2_MMA => 2,
+                _ => 1,
+            },
+            Cbu => match reader {
+                CoupledAlu | CoupledDisp64 | CoupledFMA | IMADWideReadAB
+                | IMADWideReadCL | IMADWideReadCH | FP16 | FP16_Alu
+                | FP16_F32 | HFMA2_MMA => 2,
+                MMA_2x_collect => 7,
                _ => 1,
            },
            _ => {
--- a/src/nouveau/compiler/nak_nir_lower_cmat.c
+++ b/src/nouveau/compiler/nak_nir_lower_cmat.c
@ -671,6 +671,43 @@ try_lower_cmat_load_to_ldsm(nir_builder *b, nir_intrinsic_instr *intr)
                                     .matrix_layout = layout);
 }

+static nir_deref_instr*
+get_cmat_component_deref(nir_builder *b, nir_intrinsic_instr *intr,
+                         nir_def *lane_id, unsigned idx)
+{
+   unsigned deref_src = intr->intrinsic == nir_intrinsic_cmat_store ? 0 : 1;
+   unsigned cmat_src = intr->intrinsic == nir_intrinsic_cmat_store ? 1 : 0;
+
+   const struct glsl_cmat_description desc = cmat_src_desc(intr->src[cmat_src]);
+   nir_deref_instr *deref = nir_def_as_deref(intr->src[deref_src].ssa);
+
+   const enum glsl_matrix_layout layout = nir_intrinsic_matrix_layout(intr);
+   nir_def *stride = intr->src[2].ssa;
+
+   nir_def *col_offset;
+   nir_def *row_offset;
+   compute_matrix_offsets(b, desc, layout, lane_id, idx,
+                              &col_offset, &row_offset);
+
+   row_offset = nir_imul(b, row_offset, stride);
+   col_offset = nir_u2uN(b, col_offset, deref->def.bit_size);
+   row_offset = nir_u2uN(b, row_offset, deref->def.bit_size);
+
+   /* We have to ignore the incoming stride, but have to choose the type of
+    * the pointer as the declared stride is in multiple of the pointer type */
+   deref = nir_build_deref_cast(
+      b, &deref->def, deref->modes,
+      deref->type,
+      glsl_get_vector_elements(deref->type) * glsl_get_bit_size(deref->type) / 8
+   );
+   deref = nir_build_deref_ptr_as_array(b, deref, row_offset);
+   deref = nir_build_deref_cast(
+      b, &deref->def, deref->modes,
+      glsl_scalar_type(desc.element_type),
+      glsl_base_type_bit_size(desc.element_type) / 8);
+   return nir_build_deref_ptr_as_array(b, deref, col_offset);
+}
+
 static void
 lower_cmat_load(nir_builder *b, nir_intrinsic_instr *intr)
 {
@ -682,10 +719,6 @@ lower_cmat_load(nir_builder *b, nir_intrinsic_instr *intr)

   const struct glsl_cmat_description desc = cmat_src_desc(intr->src[0]);
   const unsigned length = get_cmat_length(desc);
-   const enum glsl_matrix_layout layout = nir_intrinsic_matrix_layout(intr);
-
-   nir_deref_instr *deref = nir_def_as_deref(intr->src[1].ssa);
-   nir_def *stride = intr->src[2].ssa;

   nir_def *vars[NIR_MAX_VEC_COMPONENTS];
   for (unsigned i = 0; i < length; ++i)
@ -694,26 +727,8 @@ lower_cmat_load(nir_builder *b, nir_intrinsic_instr *intr)
   nir_def *lane_id = nir_load_subgroup_invocation(b);

   for (unsigned idx = 0; idx < length; idx++) {
-      nir_def *col_offset;
-      nir_def *row_offset;
-
-      compute_matrix_offsets(b, desc, layout, lane_id, idx,
-                              &col_offset, &row_offset);
-
-      row_offset = nir_imul(b, row_offset, stride);
-
-      col_offset = nir_u2uN(b, col_offset, deref->def.bit_size);
-      row_offset = nir_u2uN(b, row_offset, deref->def.bit_size);
-
      nir_deref_instr *iter_deref =
-         nir_build_deref_ptr_as_array(b, deref, row_offset);
-      iter_deref = nir_build_deref_cast(
-         b, &iter_deref->def, deref->modes,
-         glsl_scalar_type(desc.element_type),
-         glsl_base_type_bit_size(desc.element_type) / 8);
-      iter_deref =
-         nir_build_deref_ptr_as_array(b, iter_deref, col_offset);
-
+         get_cmat_component_deref(b, intr, lane_id, idx);
      vars[idx] = nir_load_deref(b, iter_deref);
   }

@ -764,11 +779,6 @@ lower_cmat_instr(nir_builder *b,
   }

   case nir_intrinsic_cmat_store: {
-      enum glsl_matrix_layout layout = nir_intrinsic_matrix_layout(intr);
-
-      nir_deref_instr *deref = nir_src_as_deref(intr->src[0]);
-      nir_def *stride = intr->src[2].ssa;
-
      const struct glsl_cmat_description desc = cmat_src_desc(intr->src[1]);
      const unsigned length = get_cmat_length(desc);
      nir_def *src = load_cmat_src(b, intr->src[1]);
@ -780,26 +790,8 @@ lower_cmat_instr(nir_builder *b,
      nir_def *lane_id = nir_load_subgroup_invocation(b);

      for (unsigned idx = 0; idx < length; idx++) {
-         nir_def *col_offset;
-         nir_def *row_offset;
-
-         compute_matrix_offsets(b, desc, layout, lane_id, idx,
-                                &col_offset, &row_offset);
-
-         row_offset = nir_imul(b, row_offset, stride);
-
-         col_offset = nir_u2uN(b, col_offset, deref->def.bit_size);
-         row_offset = nir_u2uN(b, row_offset, deref->def.bit_size);
-
         nir_deref_instr *iter_deref =
-            nir_build_deref_ptr_as_array(b, deref, row_offset);
-         iter_deref = nir_build_deref_cast(
-            b, &iter_deref->def, deref->modes,
-            glsl_scalar_type(desc.element_type),
-            glsl_base_type_bit_size(desc.element_type) / 8);
-         iter_deref =
-            nir_build_deref_ptr_as_array(b, iter_deref, col_offset);
-
+            get_cmat_component_deref(b, intr, lane_id, idx);
         nir_store_deref(b, iter_deref, vars[idx], 1);
      }

--- a/src/nouveau/vulkan/nvk_buffer.c
+++ b/src/nouveau/vulkan/nvk_buffer.c
@ -11,6 +11,10 @@
 #include "nvk_queue.h"
 #include "nvkmd/nvkmd.h"

+#define NVK_BUFFER_CREATE_CAPTURE_REPLAY_BITS \
+   (VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT | \
+    VK_BUFFER_CREATE_DESCRIPTOR_BUFFER_CAPTURE_REPLAY_BIT_EXT)
+
 static uint32_t
 nvk_get_buffer_alignment(const struct nvk_physical_device *pdev,
                         VkBufferUsageFlags2KHR usage_flags,
@ -32,7 +36,7 @@ nvk_get_buffer_alignment(const struct nvk_physical_device *pdev,
      alignment = MAX2(alignment, NVK_DGC_ALIGN);

   if (create_flags & (VK_BUFFER_CREATE_SPARSE_BINDING_BIT |
-                       VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT))
+                       NVK_BUFFER_CREATE_CAPTURE_REPLAY_BITS))
      alignment = MAX2(alignment, pdev->nvkmd->bind_align_B);

   return alignment;
@ -70,6 +74,22 @@ nvk_get_bda_replay_addr(const VkBufferCreateInfo *pCreateInfo)
         break;
      }

+      case VK_STRUCTURE_TYPE_OPAQUE_CAPTURE_DESCRIPTOR_DATA_CREATE_INFO_EXT: {
+         const VkOpaqueCaptureDescriptorDataCreateInfoEXT *dd = (void *)ext;
+         if (dd->opaqueCaptureDescriptorData != NULL) {
+            uint64_t dd_addr = 0;
+            memcpy(&dd_addr, dd->opaqueCaptureDescriptorData, sizeof(dd_addr));
+
+#ifdef NDEBUG
+            return dd_addr;
+#else
+            assert(addr == 0 || dd_addr == addr);
+            addr = dd_addr;
+#endif
+         }
+         break;
+      }
+
      default:
         break;
      }
@ -98,7 +118,7 @@ nvk_CreateBuffer(VkDevice device,

   if (buffer->vk.size > 0 &&
       (buffer->vk.create_flags & (VK_BUFFER_CREATE_SPARSE_BINDING_BIT |
-                                   VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT))) {
+                                   NVK_BUFFER_CREATE_CAPTURE_REPLAY_BITS))) {
      const uint32_t alignment =
         nvk_get_buffer_alignment(nvk_device_physical(dev),
                                  buffer->vk.usage,
@ -111,7 +131,7 @@ nvk_CreateBuffer(VkDevice device,
         va_flags |= NVKMD_VA_SPARSE;

      uint64_t fixed_addr = 0;
-      if (buffer->vk.create_flags & VK_BUFFER_CREATE_DEVICE_ADDRESS_CAPTURE_REPLAY_BIT) {
+      if (buffer->vk.create_flags & NVK_BUFFER_CREATE_CAPTURE_REPLAY_BITS) {
         va_flags |= NVKMD_VA_REPLAY;

         fixed_addr = nvk_get_bda_replay_addr(pCreateInfo);
@ -327,5 +347,10 @@ nvk_GetBufferOpaqueCaptureDescriptorDataEXT(
    const VkBufferCaptureDescriptorDataInfoEXT *pInfo,
    void *pData)
 {
+   VK_FROM_HANDLE(nvk_buffer, buffer, pInfo->buffer);
+   const uint64_t addr = vk_buffer_address(&buffer->vk, 0);
+
+   memcpy(pData, &addr, sizeof(addr));
+
   return VK_SUCCESS;
 }
--- a/src/nouveau/vulkan/nvk_cmd_buffer.c
+++ b/src/nouveau/vulkan/nvk_cmd_buffer.c
@ -531,6 +531,21 @@ nvk_cmd_flush_wait_dep(struct nvk_cmd_buffer *cmd,
 {
   enum nvk_barrier barriers = 0;

+   /* For asymmetric, we don't know what the access flags will be yet.
+    * Handle this by setting access to everything.
+    */
+   if (dep->dependencyFlags & VK_DEPENDENCY_ASYMMETRIC_EVENT_BIT_KHR) {
+      /* VUID-vkCmdSetEvent2-dependencyFlags-10785, 10786, 10787 */
+      assert(dep->memoryBarrierCount == 1 &&
+             dep->bufferMemoryBarrierCount == 0 &&
+             dep->imageMemoryBarrierCount == 0);
+
+      const VkMemoryBarrier2 *bar = &dep->pMemoryBarriers[0];
+      barriers |= nvk_barrier_flushes_waits(bar->srcStageMask,
+                                            VK_ACCESS_2_MEMORY_READ_BIT |
+                                            VK_ACCESS_2_MEMORY_WRITE_BIT);
+   }
+
   for (uint32_t i = 0; i < dep->memoryBarrierCount; i++) {
      const VkMemoryBarrier2 *bar = &dep->pMemoryBarriers[i];
      barriers |= nvk_barrier_flushes_waits(bar->srcStageMask,
--- a/src/nouveau/vulkan/nvk_cmd_draw.c
+++ b/src/nouveau/vulkan/nvk_cmd_draw.c
@ -1822,17 +1822,28 @@ nvk_flush_vi_state(struct nvk_cmd_buffer *cmd)

   if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI) ||
       BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDINGS_VALID)) {
-      u_foreach_bit(a, dyn->vi->attributes_valid) {
-         const struct nvk_va_format *fmt =
-            nvk_get_va_format(pdev, dyn->vi->attributes[a].format);
+      P_MTHD(p, NV9097, SET_VERTEX_ATTRIBUTE_A(0));
+      for (uint32_t a = 0; a < 32; a++) {
+         if (dyn->vi->attributes_valid & BITFIELD_BIT(a)) {
+            const struct nvk_va_format *fmt =
+               nvk_get_va_format(pdev, dyn->vi->attributes[a].format);

-         P_IMMD(p, NV9097, SET_VERTEX_ATTRIBUTE_A(a), {
-            .stream                 = dyn->vi->attributes[a].binding,
-            .offset                 = dyn->vi->attributes[a].offset,
-            .component_bit_widths   = fmt->bit_widths,
-            .numerical_type         = fmt->type,
-            .swap_r_and_b           = fmt->swap_rb,
-         });
+            P_NV9097_SET_VERTEX_ATTRIBUTE_A(p, a, {
+               .stream                 = dyn->vi->attributes[a].binding,
+               .source                 = SOURCE_ACTIVE,
+               .offset                 = dyn->vi->attributes[a].offset,
+               .component_bit_widths   = fmt->bit_widths,
+               .numerical_type         = fmt->type,
+               .swap_r_and_b           = fmt->swap_rb,
+            });
+         } else {
+            P_NV9097_SET_VERTEX_ATTRIBUTE_A(p, a, {
+               .source                 = SOURCE_INACTIVE,
+               /* Using RGBA32 gives us (0, 0, 0, 0) for inactive attributes. */
+               .component_bit_widths   = COMPONENT_BIT_WIDTHS_R32_G32_B32_A32,
+               .numerical_type         = NUMERICAL_TYPE_NUM_FLOAT,
+            });
+         }
      }

      u_foreach_bit(b, dyn->vi->bindings_valid) {
--- a/src/nouveau/vulkan/nvk_descriptor_set.c
+++ b/src/nouveau/vulkan/nvk_descriptor_set.c
@ -1193,11 +1193,11 @@ nvk_GetDescriptorEXT(VkDevice _device,

   case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: {
      struct nvk_addr_range addr_range = { };
-      if (pDescriptorInfo->data.pUniformBuffer != NULL &&
-          pDescriptorInfo->data.pUniformBuffer->address != 0) {
+      if (pDescriptorInfo->data.pStorageBuffer != NULL &&
+          pDescriptorInfo->data.pStorageBuffer->address != 0) {
         addr_range = (const struct nvk_addr_range) {
-            .addr = pDescriptorInfo->data.pUniformBuffer->address,
-            .range = pDescriptorInfo->data.pUniformBuffer->range,
+            .addr = pDescriptorInfo->data.pStorageBuffer->address,
+            .range = pDescriptorInfo->data.pStorageBuffer->range,
         };
      }
      union nvk_buffer_descriptor desc = ssbo_desc(addr_range);
--- a/src/nouveau/vulkan/nvk_descriptor_set_layout.c
+++ b/src/nouveau/vulkan/nvk_descriptor_set_layout.c
@ -474,9 +474,6 @@ nvk_GetDescriptorSetLayoutSupport(VkDevice device,
         assert(stride <= UINT8_MAX);
         assert(util_is_power_of_two_nonzero(alignment));

-         variable_is_inline_uniform_block =
-            binding->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK;
-
         if (flags & VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT) {
            /* From the Vulkan 1.3.256 spec:
             *
@ -486,6 +483,9 @@ nvk_GetDescriptorSetLayoutSupport(VkDevice device,
             */
            variable_count = MAX2(1, binding->descriptorCount);
            variable_stride = stride;
+
+            variable_is_inline_uniform_block =
+               binding->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK;
         } else {
            /* Since we're aligning to the maximum and since this is just a
             * check for whether or not the max buffer size is big enough, we
@ -507,8 +507,6 @@ nvk_GetDescriptorSetLayoutSupport(VkDevice device,
   if (pCreateInfo->flags &
       VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR)
      max_buffer_size = NVK_MAX_PUSH_DESCRIPTORS * nvk_max_descriptor_size(&pdev->info);
-   else if (variable_is_inline_uniform_block)
-      max_buffer_size = NVK_MAX_INLINE_UNIFORM_BLOCK_SIZE;
   else
      max_buffer_size = NVK_MAX_DESCRIPTOR_SET_SIZE;

@ -519,12 +517,21 @@ nvk_GetDescriptorSetLayoutSupport(VkDevice device,
      switch (ext->sType) {
      case VK_STRUCTURE_TYPE_DESCRIPTOR_SET_VARIABLE_DESCRIPTOR_COUNT_LAYOUT_SUPPORT: {
         VkDescriptorSetVariableDescriptorCountLayoutSupport *vs = (void *)ext;
+         uint32_t max_var_count;
+
         if (variable_stride > 0) {
-            vs->maxVariableDescriptorCount =
+            max_var_count =
               (max_buffer_size - non_variable_size) / variable_stride;
         } else {
-            vs->maxVariableDescriptorCount = 0;
+            max_var_count = 0;
         }
+
+         if (variable_is_inline_uniform_block) {
+            max_var_count =
+               MIN2(max_var_count, NVK_MAX_INLINE_UNIFORM_BLOCK_SIZE);
+         }
+
+         vs->maxVariableDescriptorCount = max_var_count;
         break;
      }

--- a/src/nouveau/vulkan/nvk_physical_device.c
+++ b/src/nouveau/vulkan/nvk_physical_device.c
@ -1045,7 +1045,7 @@ nvk_get_device_properties(const struct nvk_instance *instance,
      .maxSamplerDescriptorBufferBindings = 32,
      .maxEmbeddedImmutableSamplerBindings = 32,
      .maxEmbeddedImmutableSamplers = 4000,
-      .bufferCaptureReplayDescriptorDataSize = 0,
+      .bufferCaptureReplayDescriptorDataSize = sizeof(uint64_t),
      .imageCaptureReplayDescriptorDataSize = 0,
      .imageViewCaptureReplayDescriptorDataSize =
         sizeof(struct nvk_image_view_capture),
@ -1141,8 +1141,13 @@ nvk_get_device_properties(const struct nvk_instance *instance,
      .robustStorageBufferAccessSizeAlignment = NVK_SSBO_BOUNDS_CHECK_ALIGNMENT,
      .robustUniformBufferAccessSizeAlignment = nvk_min_cbuf_alignment(info),

-      /* VK_EXT_sample_locations */
-      .sampleLocationSampleCounts = sample_counts,
+      /* VK_EXT_sample_locations
+       *
+       * There's a weird HW issue with per-sample interpolation for 1x.  It
+       * always interpolates at (0.5, 0.5) so we just disable custom sample
+       * locations for 1x.
+       */
+      .sampleLocationSampleCounts = sample_counts & ~VK_SAMPLE_COUNT_1_BIT,
      .maxSampleLocationGridSize = (VkExtent2D){ 1, 1 },
      .sampleLocationCoordinateRange[0] = 0.0f,
      .sampleLocationCoordinateRange[1] = 0.9375f,
@ -1267,6 +1272,9 @@ nvk_physical_device_init_pipeline_cache(struct nvk_physical_device *pdev)
   _mesa_sha1_update(&sha_ctx, instance->driver_build_sha,
                     sizeof(instance->driver_build_sha));

+   _mesa_sha1_update(&sha_ctx, &pdev->info.chipset,
+                     sizeof(pdev->info.chipset));
+
   const uint64_t compiler_flags = nvk_physical_device_compiler_flags(pdev);
   _mesa_sha1_update(&sha_ctx, &compiler_flags, sizeof(compiler_flags));

--- a/src/panfrost/clc/pan_compile.c
+++ b/src/panfrost/clc/pan_compile.c
@ -100,6 +100,12 @@ compile(void *memctx, const uint32_t *spirv, size_t spirv_size, unsigned arch)
   nir_shader *nir =
      spirv_to_nir(spirv, spirv_size / 4, NULL, 0, MESA_SHADER_KERNEL,
                   "library", &spirv_options, nir_options);
+   /* Workgroup size may be different between different entrypoints, so we
+    * mark it as variable to prevent it from being lowered to a constant while
+    * we are still processing all entrypoints together. This is tempoary,
+    * nir_precompiled_build_variant will set the fixed workgroup size for each
+    * entrypoint and set workgroup_size_variable back to false. */
+   nir->info.workgroup_size_variable = true;
   nir_validate_shader(nir, "after spirv_to_nir");
   nir_validate_ssa_dominance(nir, "after spirv_to_nir");
   ralloc_steal(memctx, nir);
--- a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c
+++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c
@ -2557,7 +2557,8 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
   uint32_t patch_attribs =
      cmdbuf->state.gfx.vi.attribs_changing_on_base_instance;
   uint32_t vs_res_table_size =
-      panvk_shader_res_table_count(&cmdbuf->state.gfx.vs.desc);
+      panvk_shader_res_table_count(&cmdbuf->state.gfx.vs.desc) *
+      pan_size(RESOURCE);
   bool patch_faus = shader_uses_sysval(vs, graphics, vs.first_vertex) ||
                     shader_uses_sysval(vs, graphics, vs.base_instance);
   struct cs_index draw_params_addr = cs_scratch_reg64(b, 0);
@ -2583,6 +2584,9 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
   if (patch_faus)
      cs_move64_to(b, vs_fau_addr, cmdbuf->state.gfx.vs.push_uniforms);

+   if (patch_attribs != 0)
+      cs_move64_to(b, vs_drv_set, vs_desc_state->driver_set.dev_addr);
+
   cs_move64_to(b, draw_params_addr, draw->indirect.buffer_dev_addr);
   cs_move32_to(b, draw_id, 0);

@ -2610,8 +2614,6 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
      }

      if (patch_attribs != 0) {
-         cs_move64_to(b, vs_drv_set, vs_desc_state->driver_set.dev_addr);
-
         /* If firstInstance=0, skip the offset adjustment. */
         cs_if(b, MALI_CS_CONDITION_NEQUAL,
               cs_sr_reg32(b, IDVS, INSTANCE_OFFSET)) {
--- a/src/panfrost/vulkan/panvk_vX_physical_device.c
+++ b/src/panfrost/vulkan/panvk_vX_physical_device.c
@ -922,12 +922,11 @@ panvk_per_arch(get_physical_device_properties)(
         MAX_INLINE_UNIFORM_BLOCK_DESCRIPTORS,
      .maxInlineUniformTotalSize =
         MAX_INLINE_UNIFORM_BLOCK_DESCRIPTORS * MAX_INLINE_UNIFORM_BLOCK_SIZE,
-      .integerDotProduct8BitUnsignedAccelerated = true,
-      .integerDotProduct8BitSignedAccelerated = true,
+      .integerDotProduct8BitUnsignedAccelerated = false,
+      .integerDotProduct8BitSignedAccelerated = false,
      .integerDotProduct8BitMixedSignednessAccelerated = false,
-      .integerDotProduct4x8BitPackedUnsignedAccelerated = true,
-      .integerDotProduct4x8BitPackedSignedAccelerated = true,
-      .integerDotProduct4x8BitPackedSignedAccelerated = false,
+      .integerDotProduct4x8BitPackedUnsignedAccelerated = PAN_ARCH >= 9,
+      .integerDotProduct4x8BitPackedSignedAccelerated = PAN_ARCH >= 9,
      .integerDotProduct16BitUnsignedAccelerated = false,
      .integerDotProduct16BitSignedAccelerated = false,
      .integerDotProduct16BitMixedSignednessAccelerated = false,
@ -940,8 +939,8 @@ panvk_per_arch(get_physical_device_properties)(
      .integerDotProductAccumulatingSaturating8BitUnsignedAccelerated = false,
      .integerDotProductAccumulatingSaturating8BitSignedAccelerated = false,
      .integerDotProductAccumulatingSaturating8BitMixedSignednessAccelerated = false,
-      .integerDotProductAccumulatingSaturating4x8BitPackedUnsignedAccelerated = false,
-      .integerDotProductAccumulatingSaturating4x8BitPackedSignedAccelerated = false,
+      .integerDotProductAccumulatingSaturating4x8BitPackedUnsignedAccelerated = PAN_ARCH >= 9,
+      .integerDotProductAccumulatingSaturating4x8BitPackedSignedAccelerated = PAN_ARCH >= 9,
      .integerDotProductAccumulatingSaturating4x8BitPackedMixedSignednessAccelerated = false,
      .integerDotProductAccumulatingSaturating16BitUnsignedAccelerated = false,
      .integerDotProductAccumulatingSaturating16BitSignedAccelerated = false,
--- a/src/panfrost/vulkan/panvk_vX_shader.c
+++ b/src/panfrost/vulkan/panvk_vX_shader.c
@ -1551,10 +1551,12 @@ shader_desc_info_deserialize(struct panvk_device *dev,
 #if PAN_ARCH < 9
   shader->desc_info.dyn_ubos.count = blob_read_uint32(blob);
   blob_copy_bytes(blob, shader->desc_info.dyn_ubos.map,
-                   shader->desc_info.dyn_ubos.count);
+                   sizeof(*shader->desc_info.dyn_ubos.map) *
+                      shader->desc_info.dyn_ubos.count);
   shader->desc_info.dyn_ssbos.count = blob_read_uint32(blob);
   blob_copy_bytes(blob, shader->desc_info.dyn_ssbos.map,
-                   shader->desc_info.dyn_ssbos.count);
+                   sizeof(*shader->desc_info.dyn_ssbos.map) *
+                      shader->desc_info.dyn_ssbos.count);

   uint32_t others_count = 0;
   for (unsigned i = 0; i < ARRAY_SIZE(shader->desc_info.others.count); i++) {
@ -1582,6 +1584,7 @@ shader_desc_info_deserialize(struct panvk_device *dev,
   blob_copy_bytes(blob, shader->desc_info.dyn_bufs.map,
                   sizeof(*shader->desc_info.dyn_bufs.map) *
                      shader->desc_info.dyn_bufs.count);
+   shader->desc_info.max_varying_loads = blob_read_uint32(blob);
 #endif

   return VK_SUCCESS;
@ -1714,6 +1717,7 @@ shader_desc_info_serialize(struct blob *blob,
   blob_write_bytes(blob, shader->desc_info.dyn_bufs.map,
                    sizeof(*shader->desc_info.dyn_bufs.map) *
                       shader->desc_info.dyn_bufs.count);
+   blob_write_uint32(blob, shader->desc_info.max_varying_loads);
 #endif
 }

--- a/src/util/00-mesa-defaults.conf
+++ b/src/util/00-mesa-defaults.conf
@ -1087,6 +1087,9 @@ TODO: document the other workarounds.
        <application name="Jusant" executable="ASC-Win64-Shipping.exe">
            <option name="force_vk_vendor" value="-1"/>
        </application>
+        <application name="Wuthering Waves" executable="Client-Win64-Shipping.exe">
+            <option name="force_vk_vendor" value="-1"/>
+        </application>
        <application name="DIRT 5" executable="DIRT5.exe">
            <option name="fp64_workaround_enabled" value="true" />
        </application>
--- a/src/util/cache_ops.h
+++ b/src/util/cache_ops.h
@ -28,6 +28,7 @@
 #include <stddef.h>

 #include "detect_arch.h"
+#include "u_cpu_detect.h"

 #ifdef __cplusplus
 extern "C" {
@ -44,7 +45,13 @@ util_has_cache_ops(void)
   return false;
 #endif

-   return DETECT_ARCH_X86 || DETECT_ARCH_X86_64 || DETECT_ARCH_AARCH64;
+#if DETECT_ARCH_X86
+   return util_get_cpu_caps()->has_sse2;
+#elif DETECT_ARCH_X86_64 || DETECT_ARCH_AARCH64
+   return true;
+#else
+   return false;
+#endif
 }

 /** Returns the cache granularity
--- a/src/util/meson.build
+++ b/src/util/meson.build
@ -188,7 +188,14 @@ libmesa_util_links = []
 if host_machine.cpu_family() == 'aarch64' and cc.get_id() != 'msvc'
  files_mesa_util += files('cache_ops_aarch64.c')
 elif host_machine.cpu_family() in ['x86', 'x86_64'] and cc.get_id() != 'msvc'
-  files_mesa_util += files('cache_ops_x86.c')
+  libmesa_util_clflush = static_library(
+    'mesa_util_clflush',
+    ['cache_ops_x86.c'],
+    include_directories : [inc_util],
+    c_args : [no_override_init_args, sse2_args],
+    gnu_symbol_visibility : 'hidden',
+  )
+  libmesa_util_links += [libmesa_util_clflush]
  if with_clflushopt
    libmesa_util_clflushopt = static_library(
      'mesa_util_clflushopt',
@ -197,7 +204,7 @@ elif host_machine.cpu_family() in ['x86', 'x86_64'] and cc.get_id() != 'msvc'
      c_args : [no_override_init_args] + clflushopt_args,
      gnu_symbol_visibility : 'hidden',
    )
-    libmesa_util_links += libmesa_util_clflushopt
+    libmesa_util_links += [libmesa_util_clflushopt]
  endif
 else
  files_mesa_util += files('cache_ops_null.c')
--- a/src/util/os_file.c
+++ b/src/util/os_file.c
@ -288,7 +288,7 @@ os_same_file_description(int fd1, int fd2)
   if (efd < 0)
      return -1;

-   struct epoll_event evt = {};
+   struct epoll_event evt = {0};
   /* Get a new file descriptor number for fd1. */
   int tmp = os_dupfd_cloexec(fd1);
   /* Add it to evt. */
--- a/src/vulkan/runtime/vk_render_pass.c
+++ b/src/vulkan/runtime/vk_render_pass.c
@ -1087,6 +1087,7 @@ vk_get_command_buffer_inheritance_as_rendering_resume(
         &subpass->color_attachments[i];
      if (sp_att->attachment == VK_ATTACHMENT_UNUSED) {
         attachments[i] = (VkRenderingAttachmentInfo) {
+            .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
            .imageView = VK_NULL_HANDLE,
         };
         continue;
--- a/src/vulkan/wsi/wsi_common_drm.c
+++ b/src/vulkan/wsi/wsi_common_drm.c
@ -757,6 +757,7 @@ wsi_create_native_image_mem(const struct wsi_swapchain *chain,
       * handling implict sync ourselves.
       */
      .implicit_sync = !info->explicit_sync && !chain->dma_buf_semaphore,
+      .dma_buf_sync_file = chain->dma_buf_semaphore,
   };
   const VkExportMemoryAllocateInfo memory_export_info = {
      .sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO,
--- a/src/vulkan/wsi/wsi_common_private.h
+++ b/src/vulkan/wsi/wsi_common_private.h
@ -201,9 +201,6 @@ struct wsi_swapchain {
   VkAllocationCallbacks alloc;
   VkFence* fences;
   VkPresentModeKHR present_mode;
-   VkPresentGravityFlagsEXT present_gravity_x;
-   VkPresentGravityFlagsEXT present_gravity_y;
-
   /**
    * Timeline for presents completing according to VK_KHR_present_wait.  The
    * present should complete as close as possible (before or after!) to the
--- a/src/vulkan/wsi/wsi_common_wayland.c
+++ b/src/vulkan/wsi/wsi_common_wayland.c
@ -925,6 +925,7 @@ struct Colorspace {
   enum wp_color_manager_v1_primaries primaries;
   enum wp_color_manager_v1_transfer_function tf;
   bool should_use_hdr_metadata;
+   bool needs_extended_range;
 };
 struct Colorspace colorspace_mapping[] = {
   {
@ -932,48 +933,56 @@ struct Colorspace colorspace_mapping[] = {
      .primaries = WP_COLOR_MANAGER_V1_PRIMARIES_SRGB,
      .tf = WP_COLOR_MANAGER_V1_TRANSFER_FUNCTION_SRGB,
      .should_use_hdr_metadata = false,
+      .needs_extended_range = false,
   },
   {
      .colorspace = VK_COLOR_SPACE_DISPLAY_P3_NONLINEAR_EXT,
      .primaries = WP_COLOR_MANAGER_V1_PRIMARIES_DISPLAY_P3,
      .tf = WP_COLOR_MANAGER_V1_TRANSFER_FUNCTION_SRGB,
      .should_use_hdr_metadata = false,
+      .needs_extended_range = false,
   },
   {
      .colorspace = VK_COLOR_SPACE_EXTENDED_SRGB_LINEAR_EXT,
      .primaries = WP_COLOR_MANAGER_V1_PRIMARIES_SRGB,
      .tf = WP_COLOR_MANAGER_V1_TRANSFER_FUNCTION_EXT_LINEAR,
      .should_use_hdr_metadata = true,
+      .needs_extended_range = true,
   },
   {
      .colorspace = VK_COLOR_SPACE_DISPLAY_P3_LINEAR_EXT,
      .primaries = WP_COLOR_MANAGER_V1_PRIMARIES_DISPLAY_P3,
      .tf = WP_COLOR_MANAGER_V1_TRANSFER_FUNCTION_EXT_LINEAR,
      .should_use_hdr_metadata = false,
+      .needs_extended_range = false,
   },
   {
      .colorspace = VK_COLOR_SPACE_BT709_LINEAR_EXT,
      .primaries = WP_COLOR_MANAGER_V1_PRIMARIES_SRGB,
      .tf = WP_COLOR_MANAGER_V1_TRANSFER_FUNCTION_EXT_LINEAR,
      .should_use_hdr_metadata = false,
+      .needs_extended_range = false,
   },
   {
      .colorspace = VK_COLOR_SPACE_BT709_NONLINEAR_EXT,
      .primaries = WP_COLOR_MANAGER_V1_PRIMARIES_SRGB,
      .tf = WP_COLOR_MANAGER_V1_TRANSFER_FUNCTION_BT1886,
      .should_use_hdr_metadata = false,
+      .needs_extended_range = false,
   },
   {
      .colorspace = VK_COLOR_SPACE_BT2020_LINEAR_EXT,
      .primaries = WP_COLOR_MANAGER_V1_PRIMARIES_BT2020,
      .tf = WP_COLOR_MANAGER_V1_TRANSFER_FUNCTION_EXT_LINEAR,
      .should_use_hdr_metadata = false,
+      .needs_extended_range = false,
   },
   {
      .colorspace = VK_COLOR_SPACE_HDR10_ST2084_EXT,
      .primaries = WP_COLOR_MANAGER_V1_PRIMARIES_BT2020,
      .tf = WP_COLOR_MANAGER_V1_TRANSFER_FUNCTION_ST2084_PQ,
      .should_use_hdr_metadata = true,
+      .needs_extended_range = false,
   },
   /* VK_COLOR_SPACE_DOLBYVISION_EXT is left out because it's deprecated */
   {
@ -981,22 +990,21 @@ struct Colorspace colorspace_mapping[] = {
      .primaries = WP_COLOR_MANAGER_V1_PRIMARIES_BT2020,
      .tf = WP_COLOR_MANAGER_V1_TRANSFER_FUNCTION_HLG,
      .should_use_hdr_metadata = true,
+      .needs_extended_range = false,
   },
   {
      .colorspace = VK_COLOR_SPACE_ADOBERGB_LINEAR_EXT,
      .primaries = WP_COLOR_MANAGER_V1_PRIMARIES_ADOBE_RGB,
      .tf = WP_COLOR_MANAGER_V1_TRANSFER_FUNCTION_EXT_LINEAR,
      .should_use_hdr_metadata = false,
+      .needs_extended_range = false,
   },
   /* VK_COLOR_SPACE_ADOBERGB_NONLINEAR_EXT is left out because there's no
    * exactly matching transfer function in the Wayland protocol */
   /* VK_COLOR_SPACE_PASS_THROUGH_EXT is handled elsewhere */
-   {
-      .colorspace = VK_COLOR_SPACE_EXTENDED_SRGB_NONLINEAR_EXT,
-      .primaries = WP_COLOR_MANAGER_V1_PRIMARIES_SRGB,
-      .tf = WP_COLOR_MANAGER_V1_TRANSFER_FUNCTION_EXT_SRGB,
-      .should_use_hdr_metadata = true,
-   },
+   /* VK_COLOR_SPACE_EXTENDED_SRGB_NONLINEAR_EXT is intentionally not added
+    * as it's a bit unclear how exactly it should be used
+    * and whether or not the transfer function should be gamma 2.2 or piece-wise */
   /* VK_COLOR_SPACE_DISPLAY_NATIVE_AMD isn't supported */
   /* VK_COLORSPACE_SRGB_NONLINEAR_KHR is just an alias */
   /* VK_COLOR_SPACE_DCI_P3_LINEAR_EXT is just an alias */
@ -1033,6 +1041,9 @@ wsi_wl_display_determine_colorspaces(struct wsi_wl_display *display)
         continue;
      if (!vector_contains(tfs, colorspace_mapping[i].tf))
         continue;
+      if (!display->color_features.extended_target_volume &&
+          colorspace_mapping[i].needs_extended_range)
+         continue;
      VkColorSpaceKHR *new_cs = u_vector_add(&display->colorspaces);
      if (!new_cs)
         return -1;
--- a/src/vulkan/wsi/wsi_common_x11.c
+++ b/src/vulkan/wsi/wsi_common_x11.c
@ -810,10 +810,11 @@ x11_surface_get_capabilities2(VkIcdSurfaceBase *icd_surface,
      }

      case VK_STRUCTURE_TYPE_SURFACE_PRESENT_SCALING_CAPABILITIES_EXT: {
+         /* Unsupported. */
         VkSurfacePresentScalingCapabilitiesEXT *scaling = (void *)ext;
-         scaling->supportedPresentScaling = VK_PRESENT_SCALING_ONE_TO_ONE_BIT_EXT;
-         scaling->supportedPresentGravityX = VK_PRESENT_GRAVITY_MIN_BIT_EXT | VK_PRESENT_GRAVITY_MAX_BIT_EXT | VK_PRESENT_GRAVITY_CENTERED_BIT_EXT;
-         scaling->supportedPresentGravityY = VK_PRESENT_GRAVITY_MIN_BIT_EXT | VK_PRESENT_GRAVITY_MAX_BIT_EXT | VK_PRESENT_GRAVITY_CENTERED_BIT_EXT;
+         scaling->supportedPresentScaling = 0;
+         scaling->supportedPresentGravityX = 0;
+         scaling->supportedPresentGravityY = 0;
         scaling->minScaledImageExtent = caps->surfaceCapabilities.minImageExtent;
         scaling->maxScaledImageExtent = caps->surfaceCapabilities.maxImageExtent;
         break;
@ -1458,46 +1459,7 @@ x11_present_to_x11_dri3(struct x11_swapchain *chain, uint32_t image_index,
         .serial = serial,
      };

-   int16_t x_off = 0;
-   int16_t y_off = 0;
-
-   xcb_get_geometry_reply_t *geometry =
-      xcb_get_geometry_reply(chain->conn, xcb_get_geometry(chain->conn, chain->window), NULL);
-
-   if (geometry) {
-      switch (chain->base.present_gravity_x) {
-      case VK_PRESENT_GRAVITY_MIN_BIT_EXT:
-         x_off = 0;
-         break;
-      case VK_PRESENT_GRAVITY_MAX_BIT_EXT:
-         x_off = geometry->width - chain->extent.width;
-         break;
-      case VK_PRESENT_GRAVITY_CENTERED_BIT_EXT:
-         x_off = (geometry->width / 2) - (chain->extent.width / 2);
-         break;
-      default:
-         x_off = 0;
-      }
-
-      switch (chain->base.present_gravity_y) {
-      case VK_PRESENT_GRAVITY_MIN_BIT_EXT:
-         y_off = 0;
-         break;
-      case VK_PRESENT_GRAVITY_MAX_BIT_EXT:
-         y_off = geometry->height - chain->extent.height;
-         break;
-      case VK_PRESENT_GRAVITY_CENTERED_BIT_EXT:
-         y_off = (geometry->height / 2) - (chain->extent.height / 2);
-         break;
-      default:
-         y_off = 0;
-      }
-
-      free(geometry);
-   }
-
   xcb_void_cookie_t cookie;
-
 #ifdef HAVE_DRI3_EXPLICIT_SYNC
   if (chain->base.image_info.explicit_sync) {
      uint64_t acquire_point = image->base.explicit_sync[WSI_ES_ACQUIRE].timeline;
@ -1509,8 +1471,8 @@ x11_present_to_x11_dri3(struct x11_swapchain *chain, uint32_t image_index,
         serial,
         0,                                   /* valid */
         image->update_area,                  /* update */
-         x_off,                               /* x_off */
-         y_off,                               /* y_off */
+         0,                                   /* x_off */
+         0,                                   /* y_off */
         XCB_NONE,                            /* target_crtc */
         image->dri3_syncobj[WSI_ES_ACQUIRE], /* acquire_syncobj */
         image->dri3_syncobj[WSI_ES_RELEASE], /* release_syncobj */
@ -1529,8 +1491,8 @@ x11_present_to_x11_dri3(struct x11_swapchain *chain, uint32_t image_index,
                                  serial,
                                  0,                  /* valid */
                                  image->update_area, /* update */
-                                  x_off,              /* x_off */
-                                  y_off,              /* y_off */
+                                  0,                  /* x_off */
+                                  0,                  /* y_off */
                                  XCB_NONE,           /* target_crtc */
                                  XCB_NONE,
                                  image->sync_fence,
@ -2788,14 +2750,6 @@ x11_surface_create_swapchain(VkIcdSurfaceBase *icd_surface,
   chain->has_mit_shm = wsi_conn->has_mit_shm;
   chain->has_async_may_tear = present_caps & XCB_PRESENT_CAPABILITY_ASYNC_MAY_TEAR;

-   const VkSwapchainPresentScalingCreateInfoEXT* scaling_info =
-      vk_find_struct_const(pCreateInfo->pNext, SWAPCHAIN_PRESENT_SCALING_CREATE_INFO_EXT);
-
-   if (scaling_info) {
-      chain->base.present_gravity_x = scaling_info->presentGravityX;
-      chain->base.present_gravity_y = scaling_info->presentGravityY;
-   }
-
   /* When images in the swapchain don't fit the window, X can still present them, but it won't
    * happen by flip, only by copy. So this is a suboptimal copy, because if the client would change
    * the chain extents X may be able to flip