Compare commits


44 commits

Author SHA1 Message Date
Dylan Baker
d9812eaea8 VERSION: bump for rc2
Signed-off-by: Dylan Baker <dylan.c.baker@intel.com>
2025-10-22 16:13:33 -07:00
Benjamin Cheng
be191ceff7 radv/video_enc: Cleanup slice count assert
This was left over when first enabling multiple slice encoding.

Fixes: 63e952ff2c ("radv/video: Support encoding multiple slices")
Reviewed-by: David Rosca <david.rosca@amd.com>
(cherry picked from commit b6d6c1af73)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-22 09:03:39 -07:00
Pierre-Eric Pelloux-Prayer
49bfddbd11 radeonsi: propagate shader updates for merged shaders
In the case of merged shaders (e.g. VS+GS), a change to the VS should trigger
a GS update.

Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/13935
Fixes: b1a34ac95d ("radeonsi: change do_update_shaders boolean to a bitmask")
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
(cherry picked from commit 90103fe618)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-22 09:03:39 -07:00
Faith Ekstrand
0182cde848 util: Build util/cache_ops_x86.c with -msse2
__builtin_ia32_clflush() requires -msse2, so we need to set -msse2 at
least for building that file.  Fortunately, no GPU that actually needs
userspace cache flushing can ever be bolted onto a pre-SSE2 x86 CPU.

Fixes: 555881e574 ("util/cache_ops: Add some cache flush helpers")
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/14134
Reviewed-by: Mel Henning <mhenning@darkrefraction.com>
(cherry picked from commit efbecd93ba)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-22 09:03:38 -07:00
Faith Ekstrand
94ec7c686d util: Don't advertise cache ops on x86 without SSE2
Fixes: 555881e574 ("util/cache_ops: Add some cache flush helpers")
Reviewed-by: Mel Henning <mhenning@darkrefraction.com>
(cherry picked from commit 3739d7a90c)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-22 09:03:37 -07:00
Olivia Lee
4202ea6c7f panfrost: fix cl_local_size for precompiled shaders
nir_lower_compute_system_values will attempt to lower
load_workgroup_size unless workgroup_size_variable is set. For precomp
shaders, the workgroup size is set statically for each entrypoint by
nir_precompiled_build_variant. Because we call
lower_compute_system_values early, it sets the workgroup size to zero.
Temporarily setting workgroup_size_variable while we are still
processing all the entrypoints together inhibits this.

Signed-off-by: Olivia Lee <olivia.lee@collabora.com>
Fixes: 20970bcd96 ("panfrost: Add base of OpenCL C infrastructure")
Reviewed-by: Eric R. Smith <eric.smith@collabora.com>
Reviewed-by: Mary Guillemard <mary.guillemard@collabora.com>
Acked-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
(cherry picked from commit a410d90fd2)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-22 09:03:37 -07:00
Rhys Perry
10475e8ac1 amd/lower_mem_access_bit_sizes: fix shared access when bytes<bit_size/8
This can happen with (for example) 32x2 loads with
align_mul=4,align_offset=2.

This patch does bit_size=min(bit_size,bytes) to prevent num_components
from being 0.

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Fixes: 52cd5f7e69 ("ac/nir_lower_mem_access_bit_sizes: Split unsupported shared memory instructions")
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
(cherry picked from commit b18421ae3d)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-22 09:03:36 -07:00
Rhys Perry
c1cf6e75ae amd/lower_mem_access_bit_sizes: be more careful with 8/16-bit scratch load
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Backport-to: 25.3
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
(cherry picked from commit e89b22280f)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-22 09:03:34 -07:00
Rhys Perry
2b8675fd86 amd/lower_mem_access_bit_sizes: improve subdword/unaligned SMEM lowering
Summary of changes:
- handle unaligned 16-bit scalar loads when supported_dword=true
- increase the size of 8/16/32/64-bit buffer loads which are not dword
  aligned, which can create fewer SMEM loads
- handle the case where "bytes" is less than "bit_size / 8"

fossil-db (gfx1201):
Totals from 26 (0.03% of 79839) affected shaders:
Instrs: 12676 -> 12710 (+0.27%); split: -0.30%, +0.57%
CodeSize: 67272 -> 67384 (+0.17%); split: -0.24%, +0.40%
Latency: 44399 -> 44375 (-0.05%); split: -0.09%, +0.04%
SClause: 352 -> 344 (-2.27%)
SALU: 3972 -> 3992 (+0.50%)
SMEM: 554 -> 528 (-4.69%)

fossil-db (navi21):
Totals from 6 (0.01% of 79825) affected shaders:
Instrs: 2192 -> 2186 (-0.27%)
CodeSize: 12188 -> 12140 (-0.39%)
Latency: 10037 -> 10033 (-0.04%); split: -0.12%, +0.08%
SMEM: 124 -> 118 (-4.84%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Fixes: fbf0399517 ("amd/lower_mem_access_bit_sizes: lower all SMEM instructions to supported sizes")
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
(cherry picked from commit 8829fc3bd6)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-22 09:03:32 -07:00
Rhys Perry
e967da84a8 amd/lower_mem_access_bit_sizes: don't create subdword UBO loads with LLVM
These are unsupported.

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/14127
Fixes: fbf0399517 ("amd/lower_mem_access_bit_sizes: lower all SMEM instructions to supported sizes")
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
(cherry picked from commit 79b2fa785d)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-22 09:03:31 -07:00
Dylan Baker
2a8f2ff397 .pick_status.json: Update to e38491eb18
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-22 09:03:27 -07:00
Mel Henning
7a30a71c45 nvk: VK_DEPENDENCY_ASYMMETRIC_EVENT_BIT_KHR
This was missed in the original maintenance9 MR.

Fixes the flakes in test
dEQP-VK.synchronization2.op.single_queue.event.write_ssbo_compute_read_ssbo_compute.buffer_16384_maintenance9

Fixes: 7692d3c0 ("nvk: Advertise VK_KHR_maintenance9")
Reviewed-by: Faith Ekstrand <faith.ekstrand@collabora.com>
(cherry picked from commit 28fbc6addb)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-21 14:42:55 -07:00
Karol Herbst
9c57c0a194 nak: fix MMA latencies on Ampere
Acked-by: Faith Ekstrand <faith.ekstrand@collabora.com>
Fixes: 7a01953a39 ("nak: Add Ampere and Ada latency information")
(cherry picked from commit e7dca5a6ca)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-21 14:42:54 -07:00
Karol Herbst
425c49ebf2 nak: ensure deref has a ptr_stride in cmat load/store lowering
With untyped pointers we might get a deref_cast with a 0 ptr_stride. But we
were supposed to ignore the stride information on the pointer anyway, so
let's do that properly now.

Reviewed-by: Faith Ekstrand <faith.ekstrand@collabora.com>
Fixes: 05dca16143 ("nak: extract nir_intrinsic_cmat_load lowering into a function")
(cherry picked from commit 3bbf3f7826)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-21 14:42:54 -07:00
Karol Herbst
7b7cb63a14 nak: extract cmat load/store element offset calculation
Reviewed-by: Faith Ekstrand <faith.ekstrand@collabora.com>
Fixes: 05dca16143 ("nak: extract nir_intrinsic_cmat_load lowering into a function")
(cherry picked from commit f632bfc715)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-21 14:42:53 -07:00
Faith Ekstrand
1941ada4a6 panvk: Fix integer dot product properties
We already set has_[su]dot_4x8[_sat] in nir_shader_compiler_options so
we're already getting the opcodes.  We just need to advertise the
features properly.  If bifrost_compile.h is to be believed, those are
all available starting at gen 9.

Closes: https://gitlab.freedesktop.org/panfrost/mesa/-/issues/218
Closes: https://gitlab.freedesktop.org/panfrost/mesa/-/issues/219
Fixes: f7f9b3d170 ("panvk: Move to vk_properties")
Reviewed-by: Mary Guillemard <mary.guillemard@collabora.com>
Reviewed-by: Lars-Ivar Hesselberg Simonsen <lars-ivar.simonsen@arm.com>
(cherry picked from commit 38950083ae)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-21 14:42:52 -07:00
Lionel Landwerlin
e982234bb6 nir/divergence: fix handling of intel uniform block load
These are normally always uniform, but for the purpose of fused-thread
handling, we need to check their sources.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Fixes: ca1533cd03 ("nir/divergence: add a new mode to cover fused threads on Intel HW")
Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
(cherry picked from commit 255d1e883d)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-21 14:42:52 -07:00
Lionel Landwerlin
dbbadebe13 brw: fix ballot() type operations in shaders with HALT instructions
Fixes dEQP-VK.reconvergence.terminate_invocation.bit_count

LNL fossildb stats:

 Totals from 16489 (3.36% of 490184) affected shaders:
 Instrs: 3710499 -> 3710500 (+0.00%)
 Cycle count: 91601018 -> 90305642 (-1.41%); split: -1.81%, +0.40%
 Max dispatch width: 523936 -> 523952 (+0.00%); split: +0.02%, -0.01%

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: mesa-stable
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
(cherry picked from commit 757c042e39)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-21 14:42:51 -07:00
Lionel Landwerlin
0d100cc078 brw: only consider cross lane access on non scalar VGRFs
Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Fixes: 1bff4f93ca ("brw: Basic infrastructure to store convergent values as scalars")
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
(cherry picked from commit 70aa028f27)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-21 14:42:50 -07:00
Lionel Landwerlin
f656d062e3 brw: constant fold u2u16 conversion on MCS messages
Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Fixes: bddfbe7fb1 ("brw/blorp: lower MCS fetching in NIR")
Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
(cherry picked from commit f48c9c3a37)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-21 14:42:50 -07:00
Mel Henning
847ad886d6 nvk: Really fix maxVariableDescriptorCount w/ iub
I didn't test "nvk: Fix maxVariableDescriptorCount with iub" as
thoroughly as I should have and it regressed
dEQP-VK.api.maintenance3_check.descriptor_set because we were then
violating the requirement that maxPerSetDescriptors describes a limit
that's guaranteed to be supported (and reported as supported in
GetDescriptorSetLayoutSupport).

That commit was also based on a misreading of nvk_nir_lower_descriptors.c
where I thought that the end offset of an inline uniform block needed to
be less than the size of a UBO. That is not the case - on closer
inspection that code gracefully falls back to placing IUBs in globalmem
if necessary. So, we can afford to be less strict about our IUB sizing
and only require that IUBs follow the existing limit imposed by
maxInlineUniformBlockSize.

Fixes: ff7f785f09 ("nvk: Fix maxVariableDescriptorCount with iub")
Reviewed-by: Faith Ekstrand <faith.ekstrand@collabora.com>
(cherry picked from commit 77cd629b34)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-21 14:42:48 -07:00
Emma Anholt
5dcc65643c nir/shrink_stores: Don't shrink stores to an invalid num_components.
Avoids a regression in the CL CTS on the next commit.

Fixes: 2dba7e6056 ("nir: split nir_opt_shrink_stores from nir_opt_shrink_vectors")
(cherry picked from commit 537cc4e0ff)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-21 14:42:47 -07:00
Yiwei Zhang
ab7bda0a1b panvk: fix to advance vs res_table properly
Fix a regression from an unfortunate typo.

Fixes: 48e8d6d207 ("panfrost, panvk: The size of resource tables needs to be a multiple of 4.")
Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Christoph Pillmayer <christoph.pillmayer@arm.com>
(cherry picked from commit 387f75f43d)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-21 14:42:29 -07:00
Yiwei Zhang
a02d8d5767 panvk: fix to advance vs driver_set properly
The driver set should only be set once, outside the multidraw loop, so that
each draw can patch its own desc attribs when needed.

Fixes: a5a0dd3ccc ("panvk: Implement multiDrawIndirect for v10+")
Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
Acked-by: Christoph Pillmayer <christoph.pillmayer@arm.com>
(cherry picked from commit 800c4d3430)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-21 14:42:29 -07:00
Timur Kristóf
13fa1460dd ac/nir/ngg_mesh: Lower num_subgroups to constant
Mesh shader workgroups always have the same number of subgroups.

When the API workgroup size is the same as the real workgroup
size, this is a small optimization (using a constant instead of
a shader arg).

When the API workgroup size is smaller than the real workgroup
size (e.g. when the number of output vertices or primitives is
greater than the API workgroup size on RDNA 2), this fixes a
potential bug because num_subgroups would return the "real"
workgroup size instead of the API one.

Cc: mesa-stable
Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Qiang Yu <yuq825@gmail.com>
(cherry picked from commit d20049b430)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-21 14:42:29 -07:00
Patrick Lerda
14544ef278 r600: update nplanes support
This change fixes "piglit/bin/ext_image_dma_buf_import-export -auto".

Fixes: 02aaf360ae ("r600: Implement resource_get_param")
Signed-off-by: Patrick Lerda <patrick9876@free.fr>
(cherry picked from commit 84dc9af3d4)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-21 14:42:28 -07:00
Patrick Lerda
602b4a2924 r600: fix r600_draw_rectangle refcnt imbalance
The object buf is referenced at the beginning of the
r600_draw_rectangle() function and should be freed
at the end. This issue was introduced with cbb6e0277f.

Fixes: cbb6e0277f ("r600: stop using util_set_vertex_buffers")
Signed-off-by: Patrick Lerda <patrick9876@free.fr>
(cherry picked from commit 3b1e3a40a8)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-21 14:42:28 -07:00
Jose Maria Casanova Crespo
717e8a8caf v3d: mark FRAG_RESULT_COLOR as output_written on SAND blits FS
With the introduction of "v3d: Add support for 16bit normalised
formats" https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35820
nir_lower_fragcolor is always called if the shader's outputs_written shows
that FRAG_RESULT_COLOR is used.

But in the SAND8/SAND30 blit fragment shaders, although FRAG_RESULT_COLOR
is used, it was not marked in outputs_written, so the lowering was not
applied.

Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/14141
Fixes: ee48e81b26 ("v3d: Always lower frag color")
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
(cherry picked from commit a131530dd1)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-21 14:42:27 -07:00
Emma Anholt
40ff53c5b8 wsi: Fix the flagging of dma_buf_sync_file for the amdgpu workaround.
My earlier regression fix covered one of the two paths that had stopped
setting the implicit_sync flag and thus triggered the amdgpu behavior we
don't want, but probably the less common one.

Fixes: f7cbc7b1c5 ("radv: Allocate BOs as implicit sync even if the WSI is doing implicit sync.")
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/13942
(cherry picked from commit aa96444149)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-21 14:42:27 -07:00
Marek Olšák
bf9e1f2e37 winsys/radeon: fix completely broken tessellation for gfx6-7
The info was moved to radeon_info, but it was only set for the amdgpu
kernel driver. It was uninitialized for radeon.

Fixes: d82eda72a1 ("ac/gpu_info: move HS info into radeon_info")

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
(cherry picked from commit f5b648f6d3)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-21 14:42:26 -07:00
Benjamin Cheng
c3cf272a04 radv/video: Fill maxCodedExtent caps first
Later code (i.e. max qp map extent filling) depends on this.

Fixes: ae6ea69c85 ("radv: Implement VK_KHR_video_encode_quantization_map")
Reviewed-by: David Rosca <david.rosca@amd.com>
(cherry picked from commit b1370e1935)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-21 14:42:25 -07:00
Dylan Baker
30ba8880b4 .pick_status.json: Update to 28fbc6addb
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-21 14:42:24 -07:00
Job Noorman
42ab1c6f3c nir: mark fneg distribution through fadd/ffma as nsz
df1876f615 ("nir: Mark negative re-distribution on fadd as imprecise")
fixed the fadd case by marking it as imprecise. This commit fixes the
ffma case for the same reason.

However, "imprecise" isn't necessary and nowadays we have "nsz" which is
more accurate here. Use that for both fadd and ffma.

Signed-off-by: Job Noorman <jnoorman@igalia.com>
Fixes: 62795475e8 ("nir/algebraic: Distribute source modifiers into instructions")
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
(cherry picked from commit ad421cdf2e)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-17 07:41:30 -07:00
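Why the `nsz` flag (rather than the blanket "imprecise" marking) is the right condition for this rewrite can be seen from a one-line IEEE 754 counterexample, sketched here:

```latex
% Distributing negation through an addition only changes the sign of a
% zero result, never a magnitude: with $a = +0$ and $b = -0$,
\[
  -(a + b) = -(+0 + (-0)) = -(+0) = -0,
  \qquad
  (-a) + (-b) = (-0) + (+0) = +0,
\]
% so $-(a+b) = (-a) + (-b)$ holds only under the no-signed-zeros (nsz)
% assumption, for both fadd and the additive term of ffma.
```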
Josh Simmons
674e2a702a radv: Fix crash in sqtt due to uninitialized value
Fixes: 772b9ce411 ("radv: Remove qf from radv_spm/sqtt/perfcounter where applicable")
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
(cherry picked from commit b10c1a1952)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-17 07:41:29 -07:00
Mike Blumenkrantz
756618ee3b zink: consistently set/unset msrtss in begin_rendering
this always has to be explicitly set or unset, never left persistent from a
previous renderpass

Fixes: 5080f2b6f5 ("zink: disable msrtss handling when blitting")
(cherry picked from commit f74cf45078)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-17 07:41:28 -07:00
Marek Olšák
ca7d2daf5f r300: fix DXTC blits
Fixes: 9d359c6d10 ("gallium: delete pipe_surface::width and pipe_surface::height")
(cherry picked from commit 733ba77bfe)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-17 07:41:27 -07:00
Xaver Hugl
45aafef631 vulkan/wsi: require extended target volume support for scRGB
It's hardly going to be useful without that.

Signed-off-by: Xaver Hugl <xaver.hugl@kde.org>
Fixes: 4b663d56 ("vulkan/wsi: implement support for VK_EXT_hdr_metadata on Wayland")
(cherry picked from commit 892cf427a0)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-17 07:41:25 -07:00
Dylan Baker
8711394383 .pick_status.json: Mark c20e2733bf as denominated
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-17 07:41:23 -07:00
Dylan Baker
289c768e88 .pick_status.json: Update to ad421cdf2e
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-17 07:40:05 -07:00
Lionel Landwerlin
84655b4b5d anv: fix image-to-image copies of TileW images
The intermediate buffer between the 2 images is linear, its stride
should be a function of the tile's logical width.

Normally this should map to the values reported by ISL, except for TileW,
where for some reason it was decided to report a width of 128 instead of
the actual 64 (see the ISL_TILING_W case in isl_tiling_get_info()).

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: mesa-stable
Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
(cherry picked from commit 77fb8fb062)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-16 11:37:38 -07:00
Valentine Burley
fd6b9c70b6 docs: Update LAVA caching setup
After a recent change, `piglit-traces.sh` automatically sets the caching
proxy, so update the docs to reflect this.

Also update the name of the variable from `FDO_HTTP_CACHE_URI` to
`LAVA_HTTP_CACHE_URI`.

Fixes: fa74e939bf ("ci/piglit: automatically use LAVA proxy")

Signed-off-by: Valentine Burley <valentine.burley@collabora.com>
(cherry picked from commit 28e73a6239)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-16 11:37:37 -07:00
Lionel Landwerlin
9bb7bf9c66 Revert "wsi: Implements scaling controls for DRI3 presentation."
This reverts commit a219308867.

It's failing most of the tests on Anv:

$ ./deqp-vk -n dEQP-VK.wsi.xlib.maintenance1.scaling.*

Test run totals:
  Passed:        88/2422 (3.6%)
  Failed:        576/2422 (23.8%)
  Not supported: 1758/2422 (72.6%)
  Warnings:      0/2422 (0.0%)
  Waived:        0/2422 (0.0%)

The only passing tests seem to be with this pattern:

 dEQP-VK.wsi.xlib.maintenance1.scaling.*.same_size_and_aspect

(cherry picked from commit 2baa3b8c06)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-16 11:37:36 -07:00
Dylan Baker
f510e6a1bd .pick_status.json: Update to 3b2f7ed918
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38010>
2025-10-16 11:37:29 -07:00
Dylan Baker
40f7bef16c VERSION: bump for 25.3.0-rc1
Signed-off-by: Dylan Baker <dylan.c.baker@intel.com>
2025-10-15 20:56:25 -07:00
39 changed files with 3212 additions and 372 deletions

.pick_status.json (2782 lines): diff suppressed because it is too large

@@ -1 +1 @@
-25.3.0-devel
+25.3.0-rc2


@@ -122,9 +122,8 @@ Enable the site and restart nginx:
 # Second download should be cached.
 wget http://localhost/cache/?uri=https://s3.freedesktop.org/mesa-tracie-public/itoral-gl-terrain-demo/demo-v2.trace
-Now, set ``download-url`` in your ``traces-*.yml`` entry to something like
-``http://caching-proxy/cache/?uri=https://s3.freedesktop.org/mesa-tracie-public``
-and you should have cached downloads for traces. Add it to
-``FDO_HTTP_CACHE_URI=`` in your ``config.toml`` runner environment lines and you
-can use it for cached artifact downloads instead of going all the way to
-freedesktop.org on each job.
+The trace runner script automatically sets the caching proxy, so there's no
+need to modify anything in the Mesa CI YAML files.
+Add ``LAVA_HTTP_CACHE_URI=http://localhost/cache/?uri=`` to your ``config.toml``
+runner environment lines and you can use it for cached artifact downloads
+instead of going all the way to freedesktop.org on each job.


@@ -109,23 +109,37 @@ lower_mem_access_cb(nir_intrinsic_op intrin, uint8_t bytes, uint8_t bit_size, ui
nir_mem_access_size_align res;
if (intrin == nir_intrinsic_load_shared || intrin == nir_intrinsic_store_shared) {
/* Split unsupported shared access. */
res.bit_size = MIN2(bit_size, combined_align * 8ull);
res.align = res.bit_size / 8;
/* Don't use >64-bit LDS loads for performance reasons. */
unsigned max_bytes = intrin == nir_intrinsic_store_shared && cb_data->gfx_level >= GFX7 ? 16 : 8;
bytes = MIN3(bytes, combined_align, max_bytes);
bytes = bytes == 12 ? bytes : round_down_to_power_of_2(bytes);
/* Split unsupported shared access. */
res.bit_size = MIN2(bit_size, bytes * 8ull);
res.align = res.bit_size / 8;
res.num_components = bytes / res.align;
res.shift = nir_mem_access_shift_method_bytealign_amd;
return res;
}
const bool is_buffer_load = intrin == nir_intrinsic_load_ubo ||
intrin == nir_intrinsic_load_ssbo ||
intrin == nir_intrinsic_load_constant;
if (is_smem) {
const bool supported_subdword = cb_data->gfx_level >= GFX12 &&
intrin != nir_intrinsic_load_push_constant &&
(!cb_data->use_llvm || intrin != nir_intrinsic_load_ubo);
/* Round up subdword loads if unsupported. */
const bool supported_subdword = cb_data->gfx_level >= GFX12 && intrin != nir_intrinsic_load_push_constant;
if (bit_size < 32 && (bytes >= 3 || !supported_subdword))
if (bytes <= 2 && combined_align % bytes == 0 && supported_subdword) {
bit_size = bytes * 8;
} else if (bytes % 4 || combined_align % 4) {
if (is_buffer_load)
bytes += 4 - MIN2(combined_align, 4);
bytes = align(bytes, 4);
bit_size = 32;
}
/* Generally, require an alignment of 4. */
res.align = MIN2(4, bytes);
@@ -138,9 +152,6 @@ lower_mem_access_cb(nir_intrinsic_op intrin, uint8_t bytes, uint8_t bit_size, ui
if (!util_is_power_of_two_nonzero(bytes) && (cb_data->gfx_level < GFX12 || bytes != 12)) {
const uint8_t larger = util_next_power_of_two(bytes);
const uint8_t smaller = larger / 2;
const bool is_buffer_load = intrin == nir_intrinsic_load_ubo ||
intrin == nir_intrinsic_load_ssbo ||
intrin == nir_intrinsic_load_constant;
const bool is_aligned = align_mul % smaller == 0;
/* Overfetch up to 1 dword if this is a bounds-checked buffer load or the access is aligned. */
@@ -185,8 +196,8 @@ lower_mem_access_cb(nir_intrinsic_op intrin, uint8_t bytes, uint8_t bit_size, ui
const uint32_t max_pad = 4 - MIN2(combined_align, 4);
/* Global loads don't have bounds checking, so increasing the size might not be safe. */
if (intrin == nir_intrinsic_load_global || intrin == nir_intrinsic_load_global_constant) {
/* Global/scratch loads don't have bounds checking, so increasing the size might not be safe. */
if (!is_buffer_load) {
if (align_mul < 4) {
/* If we split the load, only lower it to 32-bit if this is a SMEM load. */
const unsigned chunk_bytes = align(bytes, 4) - max_pad;


@@ -508,6 +508,8 @@ lower_ms_intrinsic(nir_builder *b, nir_instr *instr, void *state)
 return update_ms_barrier(b, intrin, s);
 case nir_intrinsic_load_workgroup_index:
 return lower_ms_load_workgroup_index(b, intrin, s);
+case nir_intrinsic_load_num_subgroups:
+return nir_imm_int(b, DIV_ROUND_UP(s->api_workgroup_size, s->wave_size));
 case nir_intrinsic_set_vertex_and_primitive_count:
 return lower_ms_set_vertex_and_primitive_count(b, intrin, s);
 default:
@@ -529,6 +531,7 @@ filter_ms_intrinsic(const nir_instr *instr,
 intrin->intrinsic == nir_intrinsic_store_per_primitive_output ||
 intrin->intrinsic == nir_intrinsic_barrier ||
 intrin->intrinsic == nir_intrinsic_load_workgroup_index ||
+intrin->intrinsic == nir_intrinsic_load_num_subgroups ||
 intrin->intrinsic == nir_intrinsic_set_vertex_and_primitive_count;
 }


@@ -508,7 +508,9 @@ radv_begin_sqtt(struct radv_queue *queue)
 device->sqtt.start_cs[family] = NULL;
 }
-cs.b = ws->cs_create(ws, radv_queue_ring(queue), false);
+radv_init_cmd_stream(&cs, radv_queue_ring(queue));
+cs.b = ws->cs_create(ws, cs.hw_ip, false);
 if (!cs.b)
 return false;
@@ -585,7 +587,9 @@ radv_end_sqtt(struct radv_queue *queue)
 device->sqtt.stop_cs[family] = NULL;
 }
-cs.b = ws->cs_create(ws, radv_queue_ring(queue), false);
+radv_init_cmd_stream(&cs, radv_queue_ring(queue));
+cs.b = ws->cs_create(ws, cs.hw_ip, false);
 if (!cs.b)
 return false;


@@ -819,6 +819,32 @@ radv_GetPhysicalDeviceVideoCapabilitiesKHR(VkPhysicalDevice physicalDevice, cons
 if (cap && !cap->valid)
 cap = NULL;
+if (cap) {
+pCapabilities->maxCodedExtent.width = cap->max_width;
+pCapabilities->maxCodedExtent.height = cap->max_height;
+} else {
+switch (pVideoProfile->videoCodecOperation) {
+case VK_VIDEO_CODEC_OPERATION_DECODE_H264_BIT_KHR:
+pCapabilities->maxCodedExtent.width = (pdev->info.family < CHIP_TONGA) ? 2048 : 4096;
+pCapabilities->maxCodedExtent.height = (pdev->info.family < CHIP_TONGA) ? 1152 : 4096;
+break;
+case VK_VIDEO_CODEC_OPERATION_DECODE_H265_BIT_KHR:
+pCapabilities->maxCodedExtent.width =
+(pdev->info.family < CHIP_RENOIR) ? ((pdev->info.family < CHIP_TONGA) ? 2048 : 4096) : 8192;
+pCapabilities->maxCodedExtent.height =
+(pdev->info.family < CHIP_RENOIR) ? ((pdev->info.family < CHIP_TONGA) ? 1152 : 4096) : 4352;
+break;
+case VK_VIDEO_CODEC_OPERATION_DECODE_VP9_BIT_KHR:
+pCapabilities->maxCodedExtent.width =
+(pdev->info.family < CHIP_RENOIR) ? ((pdev->info.family < CHIP_TONGA) ? 2048 : 4096) : 8192;
+pCapabilities->maxCodedExtent.height =
+(pdev->info.family < CHIP_RENOIR) ? ((pdev->info.family < CHIP_TONGA) ? 1152 : 4096) : 4352;
+break;
+default:
+break;
+}
+}
 pCapabilities->flags = 0;
 pCapabilities->pictureAccessGranularity.width = VK_VIDEO_H264_MACROBLOCK_WIDTH;
 pCapabilities->pictureAccessGranularity.height = VK_VIDEO_H264_MACROBLOCK_HEIGHT;
@@ -1126,32 +1152,6 @@ radv_GetPhysicalDeviceVideoCapabilitiesKHR(VkPhysicalDevice physicalDevice, cons
 break;
 }
-if (cap) {
-pCapabilities->maxCodedExtent.width = cap->max_width;
-pCapabilities->maxCodedExtent.height = cap->max_height;
-} else {
-switch (pVideoProfile->videoCodecOperation) {
-case VK_VIDEO_CODEC_OPERATION_DECODE_H264_BIT_KHR:
-pCapabilities->maxCodedExtent.width = (pdev->info.family < CHIP_TONGA) ? 2048 : 4096;
-pCapabilities->maxCodedExtent.height = (pdev->info.family < CHIP_TONGA) ? 1152 : 4096;
-break;
-case VK_VIDEO_CODEC_OPERATION_DECODE_H265_BIT_KHR:
-pCapabilities->maxCodedExtent.width =
-(pdev->info.family < CHIP_RENOIR) ? ((pdev->info.family < CHIP_TONGA) ? 2048 : 4096) : 8192;
-pCapabilities->maxCodedExtent.height =
-(pdev->info.family < CHIP_RENOIR) ? ((pdev->info.family < CHIP_TONGA) ? 1152 : 4096) : 4352;
-break;
-case VK_VIDEO_CODEC_OPERATION_DECODE_VP9_BIT_KHR:
-pCapabilities->maxCodedExtent.width =
-(pdev->info.family < CHIP_RENOIR) ? ((pdev->info.family < CHIP_TONGA) ? 2048 : 4096) : 8192;
-pCapabilities->maxCodedExtent.height =
-(pdev->info.family < CHIP_RENOIR) ? ((pdev->info.family < CHIP_TONGA) ? 1152 : 4096) : 4352;
-break;
-default:
-break;
-}
-}
 return VK_SUCCESS;
 }


@@ -890,7 +890,6 @@ radv_enc_slice_header(struct radv_cmd_buffer *cmd_buffer, const VkVideoEncodeInf
 uint32_t num_bits[RENCODE_SLICE_HEADER_TEMPLATE_MAX_NUM_INSTRUCTIONS] = {0};
 const struct VkVideoEncodeH264PictureInfoKHR *h264_picture_info =
 vk_find_struct_const(enc_info->pNext, VIDEO_ENCODE_H264_PICTURE_INFO_KHR);
-int slice_count = h264_picture_info->naluSliceEntryCount;
 const StdVideoEncodeH264PictureInfo *pic = h264_picture_info->pStdPictureInfo;
 const StdVideoH264SequenceParameterSet *sps =
 vk_video_find_h264_enc_std_sps(cmd_buffer->video.params, pic->seq_parameter_set_id);
@@ -903,8 +902,6 @@ radv_enc_slice_header(struct radv_cmd_buffer *cmd_buffer, const VkVideoEncodeInf
 unsigned int cdw_filled = 0;
 unsigned int bits_copied = 0;
-assert(slice_count <= 1);
 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
 const struct radv_physical_device *pdev = radv_device_physical(device);
 struct radv_cmd_stream *cs = cmd_buffer->cs;


@ -319,14 +319,10 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
case nir_intrinsic_load_base_global_invocation_id:
case nir_intrinsic_load_base_workgroup_id:
case nir_intrinsic_load_alpha_reference_amd:
case nir_intrinsic_load_ubo_uniform_block_intel:
case nir_intrinsic_load_ssbo_uniform_block_intel:
case nir_intrinsic_load_shared_uniform_block_intel:
case nir_intrinsic_load_barycentric_optimize_amd:
case nir_intrinsic_load_poly_line_smooth_enabled:
case nir_intrinsic_load_rasterization_primitive_amd:
case nir_intrinsic_unit_test_uniform_amd:
case nir_intrinsic_load_global_constant_uniform_block_intel:
case nir_intrinsic_load_debug_log_desc_amd:
case nir_intrinsic_load_xfb_state_address_gfx12_amd:
case nir_intrinsic_cmat_length:
@ -364,6 +360,24 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
is_divergent = false;
break;
case nir_intrinsic_load_ubo_uniform_block_intel:
case nir_intrinsic_load_ssbo_uniform_block_intel:
case nir_intrinsic_load_shared_uniform_block_intel:
case nir_intrinsic_load_global_constant_uniform_block_intel:
if (options & (nir_divergence_across_subgroups |
nir_divergence_multiple_workgroup_per_compute_subgroup)) {
unsigned num_srcs = nir_intrinsic_infos[instr->intrinsic].num_srcs;
for (unsigned i = 0; i < num_srcs; i++) {
if (src_divergent(instr->src[i], state)) {
is_divergent = true;
break;
}
}
} else {
is_divergent = false;
}
break;
/* This is divergent because it specifically loads sequential values into
* successive SIMD lanes.
*/
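The divergence rule added in the hunk above can be sketched in isolation: block loads are uniform within one compute subgroup, so they only become divergent when divergence is tracked across subgroups (or when multiple workgroups share a compute subgroup) and at least one source is divergent. This is a minimal sketch; the option names and helper are illustrative, not NIR's real API.

```c
#include <stdbool.h>

/* Illustrative stand-ins for nir_divergence_* option bits. */
enum div_options {
   DIV_ACROSS_SUBGROUPS      = 1 << 0,
   DIV_MULTI_WG_PER_SUBGROUP = 1 << 1,
};

/* Mirrors the logic above: uniform-block loads are uniform per subgroup,
 * so they are only divergent when divergence is observed across subgroups
 * and some source address is divergent. */
static bool block_load_divergent(unsigned options,
                                 const bool *src_divergent, unsigned num_srcs)
{
   if (!(options & (DIV_ACROSS_SUBGROUPS | DIV_MULTI_WG_PER_SUBGROUP)))
      return false; /* uniform within the subgroup by construction */
   for (unsigned i = 0; i < num_srcs; i++) {
      if (src_divergent[i])
         return true;
   }
   return false;
}
```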


@@ -4096,9 +4096,9 @@ distribute_src_mods = [
(('fneg', ('fmul(is_used_once)', a, b)), ('fmul', ('fneg', a), b)),
(('fabs', ('fmul(is_used_once)', a, b)), ('fmul', ('fabs', a), ('fabs', b))),
(('fneg', ('ffma(is_used_once)', a, b, c)), ('ffma', ('fneg', a), b, ('fneg', c))),
(('fneg', ('ffma(is_used_once,nsz)', a, b, c)), ('ffma', ('fneg', a), b, ('fneg', c))),
(('fneg', ('flrp(is_used_once)', a, b, c)), ('flrp', ('fneg', a), ('fneg', b), c)),
(('fneg', ('~fadd(is_used_once)', a, b)), ('fadd', ('fneg', a), ('fneg', b))),
(('fneg', ('fadd(is_used_once,nsz)', a, b)), ('fadd', ('fneg', a), ('fneg', b))),
# Note that fmin <-> fmax. I don't think there is a way to distribute
# fabs() into fmin or fmax.


@@ -82,7 +82,9 @@ opt_shrink_store_instr(nir_builder *b, nir_intrinsic_instr *instr, bool shrink_i
/* Trim the num_components stored according to the write mask. */
unsigned write_mask = nir_intrinsic_write_mask(instr);
unsigned last_bit = util_last_bit(write_mask);
/* Don't trim down to an invalid number of components, though. */
unsigned last_bit = nir_round_up_components(util_last_bit(write_mask));
if (last_bit < instr->num_components) {
nir_def *def = nir_trim_vector(b, instr->src[0].ssa, last_bit);
nir_src_rewrite(&instr->src[0], def);
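The fix above rounds the trimmed size up to a valid vector width before comparing against `num_components`. A minimal sketch of that arithmetic, assuming the valid NIR vector sizes are 1-5, 8 and 16 (the helpers here are illustrative re-implementations, not the `util_last_bit`/`nir_round_up_components` entry points themselves):

```c
/* Highest set bit position, counting from 1; 0 for an empty mask. */
static unsigned last_bit(unsigned mask)
{
   unsigned n = 0;
   while (mask) {
      n++;
      mask >>= 1;
   }
   return n;
}

/* Round a component count up to an assumed-valid vector size (1-5, 8, 16),
 * so a store is never trimmed to e.g. 6 components. */
static unsigned round_up_components(unsigned n)
{
   if (n <= 5)
      return n;
   return n <= 8 ? 8 : 16;
}
```

With a write mask of `0x5` (components 0 and 2), `last_bit` yields 3, so a vec4 store can legally shrink to a vec3 even though component 1 is unused.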


@@ -652,6 +652,7 @@ nir_precompiled_build_variant(const nir_function *libfunc,
assert(libfunc->workgroup_size[0] != 0 && "must set workgroup size");
b.shader->info.workgroup_size_variable = false;
b.shader->info.workgroup_size[0] = libfunc->workgroup_size[0];
b.shader->info.workgroup_size[1] = libfunc->workgroup_size[1];
b.shader->info.workgroup_size[2] = libfunc->workgroup_size[2];


@@ -1236,44 +1236,6 @@ spec@nv_texture_env_combine4@nv_texture_env_combine4-combine,Fail
spec@oes_texture_float@oes_texture_float half,Fail
# Remaining fallout from 9d359c6d10adb1cd2978a0e13714a3f98544aae8
spec@arb_texture_compression@fbo-generatemipmap-formats,Fail
spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_RGB,Fail
spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_RGB NPOT,Fail
spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA,Fail
spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA NPOT,Fail
spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_rgb_s3tc_dxt1_ext,Fail
spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_rgba_s3tc_dxt1_ext,Fail
spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_rgba_s3tc_dxt3_ext,Fail
spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_rgba_s3tc_dxt5_ext,Fail
spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_srgb_alpha_s3tc_dxt1_ext,Fail
spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_srgb_alpha_s3tc_dxt3_ext,Fail
spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_srgb_alpha_s3tc_dxt5_ext,Fail
spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_srgb_s3tc_dxt1_ext,Fail
spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats,Fail
spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT1_EXT,Fail
spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT1_EXT NPOT,Fail
spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT3_EXT,Fail
spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT3_EXT NPOT,Fail
spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT5_EXT,Fail
spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT5_EXT NPOT,Fail
spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGB_S3TC_DXT1_EXT,Fail
spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGB_S3TC_DXT1_EXT NPOT,Fail
spec@ext_texture_compression_s3tc@gen-compressed-teximage,Fail
spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc,Fail
spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB,Fail
spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB NPOT,Fail
spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA,Fail
spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA NPOT,Fail
spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT,Fail
spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT NPOT,Fail
spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT,Fail
spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT NPOT,Fail
spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT,Fail
spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT NPOT,Fail
spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_S3TC_DXT1_EXT,Fail
spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_S3TC_DXT1_EXT NPOT,Fail
# uprev Piglit in Mesa
spec@!opengl 1.1@teximage-scale-bias,Fail
spec@glsl-1.10@execution@glsl-fs-texture2d-mipmap-const-bias-01,Fail


@@ -1280,44 +1280,6 @@ spec@nv_texture_env_combine4@nv_texture_env_combine4-combine,Fail
spec@oes_texture_float@oes_texture_float half,Fail
# Remaining fallout from 9d359c6d10adb1cd2978a0e13714a3f98544aae8
spec@arb_texture_compression@fbo-generatemipmap-formats,Fail
spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_RGB,Fail
spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_RGB NPOT,Fail
spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA,Fail
spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA NPOT,Fail
spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_rgb_s3tc_dxt1_ext,Fail
spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_rgba_s3tc_dxt1_ext,Fail
spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_rgba_s3tc_dxt3_ext,Fail
spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_rgba_s3tc_dxt5_ext,Fail
spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_srgb_alpha_s3tc_dxt1_ext,Fail
spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_srgb_alpha_s3tc_dxt3_ext,Fail
spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_srgb_alpha_s3tc_dxt5_ext,Fail
spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_srgb_s3tc_dxt1_ext,Fail
spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats,Fail
spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT1_EXT,Fail
spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT1_EXT NPOT,Fail
spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT3_EXT,Fail
spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT3_EXT NPOT,Fail
spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT5_EXT,Fail
spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT5_EXT NPOT,Fail
spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGB_S3TC_DXT1_EXT,Fail
spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGB_S3TC_DXT1_EXT NPOT,Fail
spec@ext_texture_compression_s3tc@gen-compressed-teximage,Fail
spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc,Fail
spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB,Fail
spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB NPOT,Fail
spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA,Fail
spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA NPOT,Fail
spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT,Fail
spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT NPOT,Fail
spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT,Fail
spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT NPOT,Fail
spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT,Fail
spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT NPOT,Fail
spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_S3TC_DXT1_EXT,Fail
spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_S3TC_DXT1_EXT NPOT,Fail
# uprev Piglit in Mesa
spec@!opengl 1.1@teximage-scale-bias,Fail
spec@glsl-1.10@execution@glsl-fs-texture2d-mipmap-const-bias-01,Fail


@@ -778,78 +778,6 @@ dEQP-GLES2.functional.texture.mipmap.2d.projected.linear_nearest_repeat,Fail
dEQP-GLES2.functional.texture.mipmap.2d.projected.linear_nearest_mirror,Fail
dEQP-GLES2.functional.texture.mipmap.2d.projected.linear_nearest_clamp,Fail
# Remaining fallout from 9d359c6d10adb1cd2978a0e13714a3f98544aae8
spec@arb_texture_compression@fbo-generatemipmap-formats,Fail
spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_LUMINANCE,Fail
spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_LUMINANCE NPOT,Fail
spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_LUMINANCE_ALPHA,Fail
spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_LUMINANCE_ALPHA NPOT,Fail
spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_RGB,Fail
spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_RGB NPOT,Fail
spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA,Fail
spec@arb_texture_compression@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA NPOT,Fail
spec@ati_texture_compression_3dc@fbo-generatemipmap-formats,Fail
spec@ati_texture_compression_3dc@fbo-generatemipmap-formats@GL_COMPRESSED_LUMINANCE_ALPHA_3DC_ATI,Fail
spec@ati_texture_compression_3dc@fbo-generatemipmap-formats@GL_COMPRESSED_LUMINANCE_ALPHA_3DC_ATI NPOT,Fail
spec@ext_texture_compression_latc@fbo-generatemipmap-formats,Fail
spec@ext_texture_compression_latc@fbo-generatemipmap-formats-signed,Fail
spec@ext_texture_compression_latc@fbo-generatemipmap-formats-signed@GL_COMPRESSED_SIGNED_LUMINANCE_ALPHA_LATC2_EXT,Fail
spec@ext_texture_compression_latc@fbo-generatemipmap-formats-signed@GL_COMPRESSED_SIGNED_LUMINANCE_ALPHA_LATC2_EXT NPOT,Fail
spec@ext_texture_compression_latc@fbo-generatemipmap-formats-signed@GL_COMPRESSED_SIGNED_LUMINANCE_LATC1_EXT,Fail
spec@ext_texture_compression_latc@fbo-generatemipmap-formats-signed@GL_COMPRESSED_SIGNED_LUMINANCE_LATC1_EXT NPOT,Fail
spec@ext_texture_compression_latc@fbo-generatemipmap-formats@GL_COMPRESSED_LUMINANCE_ALPHA_LATC2_EXT,Fail
spec@ext_texture_compression_latc@fbo-generatemipmap-formats@GL_COMPRESSED_LUMINANCE_ALPHA_LATC2_EXT NPOT,Fail
spec@ext_texture_compression_latc@fbo-generatemipmap-formats@GL_COMPRESSED_LUMINANCE_LATC1_EXT,Fail
spec@ext_texture_compression_latc@fbo-generatemipmap-formats@GL_COMPRESSED_LUMINANCE_LATC1_EXT NPOT,Fail
spec@ext_texture_compression_rgtc@compressedteximage gl_compressed_red_green_rgtc2_ext,Fail
spec@ext_texture_compression_rgtc@compressedteximage gl_compressed_signed_red_green_rgtc2_ext,Fail
spec@ext_texture_compression_rgtc@compressedteximage gl_compressed_signed_red_rgtc1_ext,Fail
spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats,Fail
spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats-signed,Fail
spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats-signed@GL_COMPRESSED_SIGNED_RED_RGTC1,Fail
spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats-signed@GL_COMPRESSED_SIGNED_RED_RGTC1 NPOT,Fail
spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats-signed@GL_COMPRESSED_SIGNED_RG_RGTC2,Fail
spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats-signed@GL_COMPRESSED_SIGNED_RG_RGTC2 NPOT,Fail
spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats@GL_COMPRESSED_RED,Fail
spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats@GL_COMPRESSED_RED NPOT,Fail
spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats@GL_COMPRESSED_RED_RGTC1,Fail
spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats@GL_COMPRESSED_RED_RGTC1 NPOT,Fail
spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats@GL_COMPRESSED_RG,Fail
spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats@GL_COMPRESSED_RG_RGTC2,Fail
spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats@GL_COMPRESSED_RG_RGTC2 NPOT,Fail
spec@ext_texture_compression_rgtc@fbo-generatemipmap-formats@GL_COMPRESSED_RG NPOT,Fail
spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_rgb_s3tc_dxt1_ext,Fail
spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_rgba_s3tc_dxt1_ext,Fail
spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_rgba_s3tc_dxt3_ext,Fail
spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_rgba_s3tc_dxt5_ext,Fail
spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_srgb_alpha_s3tc_dxt1_ext,Fail
spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_srgb_alpha_s3tc_dxt3_ext,Fail
spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_srgb_alpha_s3tc_dxt5_ext,Fail
spec@ext_texture_compression_s3tc@compressedteximage gl_compressed_srgb_s3tc_dxt1_ext,Fail
spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats,Fail
spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT1_EXT,Fail
spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT1_EXT NPOT,Fail
spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT3_EXT,Fail
spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT3_EXT NPOT,Fail
spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT5_EXT,Fail
spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGBA_S3TC_DXT5_EXT NPOT,Fail
spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGB_S3TC_DXT1_EXT,Fail
spec@ext_texture_compression_s3tc@fbo-generatemipmap-formats@GL_COMPRESSED_RGB_S3TC_DXT1_EXT NPOT,Fail
spec@ext_texture_compression_s3tc@gen-compressed-teximage,Fail
spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc,Fail
spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB,Fail
spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB NPOT,Fail
spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA,Fail
spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA NPOT,Fail
spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT,Fail
spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT NPOT,Fail
spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT,Fail
spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT NPOT,Fail
spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT,Fail
spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT NPOT,Fail
spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_S3TC_DXT1_EXT,Fail
spec@ext_texture_srgb@fbo-generatemipmap-formats-s3tc@GL_COMPRESSED_SRGB_S3TC_DXT1_EXT NPOT,Fail
# uprev Piglit in Mesa
spec@!opengl 1.1@teximage-scale-bias,Fail
spec@ext_framebuffer_multisample@accuracy all_samples color depthstencil linear,Fail


@@ -947,6 +947,32 @@ r300_set_framebuffer_state(struct pipe_context* pipe,
util_framebuffer_init(pipe, state, r300->fb_cbufs, &r300->fb_zsbuf);
util_copy_framebuffer_state(r300->fb_state.state, state);
/* DXTC blits require that blocks are 2x1 or 4x1 pixels, but
* pipe_surface_width sets the framebuffer width as if blocks were 1x1
* pixels. Override the width to correct that.
*/
if (state->nr_cbufs == 1 && state->cbufs[0].texture &&
state->cbufs[0].format == PIPE_FORMAT_R8G8B8A8_UNORM &&
util_format_is_compressed(state->cbufs[0].texture->format)) {
struct pipe_framebuffer_state *fb =
(struct pipe_framebuffer_state*)r300->fb_state.state;
const struct util_format_description *desc =
util_format_description(state->cbufs[0].texture->format);
unsigned width = u_minify(state->cbufs[0].texture->width0,
state->cbufs[0].level);
assert(desc->block.width == 4 && desc->block.height == 4);
/* Each 64-bit DXT block is 2x1 pixels, and each 128-bit DXT
* block is 4x1 pixels when blitting.
*/
width = align(width, 4); /* align to the DXT block width. */
if (desc->block.bits == 64)
width = DIV_ROUND_UP(width, 2);
fb->width = width;
}
/* Remove trailing NULL colorbuffers. */
while (current_state->nr_cbufs && !current_state->cbufs[current_state->nr_cbufs-1].texture)
current_state->nr_cbufs--;
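The width override above can be reduced to a small piece of arithmetic: align the mip width to the 4-pixel DXT block width, then halve it for 64-bit (DXT1-style) blocks, which map to 2x1 pixels during the blit. A hedged sketch, mirroring the hunk's math with illustrative helper names:

```c
/* Sketch of the DXTC blit width override: 64-bit DXT blocks behave as 2x1
 * pixels and 128-bit blocks as 4x1 during the blit, while the framebuffer
 * width was computed as if blocks were 1x1. */
static unsigned dxt_blit_width(unsigned width, unsigned block_bits)
{
   width = (width + 3) & ~3u;     /* align to the 4-pixel DXT block width */
   if (block_bits == 64)
      width = (width + 1) / 2;    /* each 64-bit block covers 2x1 pixels */
   return width;
}
```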


@@ -201,6 +201,7 @@ void r600_draw_rectangle(struct blitter_context *blitter,
rctx->b.set_vertex_buffers(&rctx->b, 1, &vbuffer);
util_draw_arrays_instanced(&rctx->b, R600_PRIM_RECTANGLE_LIST, 0, 3,
0, num_instances);
pipe_resource_reference(&buf, NULL);
}
static void r600_dma_emit_wait_idle(struct r600_common_context *rctx)


@@ -14,6 +14,7 @@
#include "util/u_memory.h"
#include "util/u_pack_color.h"
#include "util/u_surface.h"
#include "util/u_resource.h"
#include "util/os_time.h"
#include "frontend/winsys_handle.h"
#include <errno.h>
@@ -442,7 +443,7 @@ static bool r600_texture_get_param(struct pipe_screen *screen,
switch (param) {
case PIPE_RESOURCE_PARAM_NPLANES:
*value = 1;
*value = util_resource_num(resource);
return true;
case PIPE_RESOURCE_PARAM_STRIDE:


@@ -56,6 +56,19 @@ static bool si_update_shaders(struct si_context *sctx)
struct si_shader *old_ps = sctx->shader.ps.current;
int r;
if (GFX_VERSION >= GFX9) {
/* For merged shaders, mark the next shader as dirty so its previous_stage is updated. */
if (is_vs_state_changed) {
if (HAS_TESS) {
is_tess_state_changed = true;
} else if (HAS_GS) {
is_gs_state_changed = true;
}
}
if ((sctx->dirty_shaders_mask & BITFIELD_BIT(MESA_SHADER_TESS_EVAL)) && HAS_GS && HAS_TESS)
is_gs_state_changed = true;
}
/* Update TCS and TES. */
if (HAS_TESS && is_tess_state_changed) {
if (!sctx->has_tessellation) {


@@ -690,6 +690,7 @@ v3d_get_sand8_fs(struct pipe_context *pctx, int cpp)
nir_variable_create(b.shader, nir_var_shader_out,
vec4, "f_color");
color_out->data.location = FRAG_RESULT_COLOR;
b.shader->info.outputs_written |= BITFIELD_BIT(FRAG_RESULT_COLOR);
nir_variable *pos_in =
nir_variable_create(b.shader, nir_var_shader_in, vec4, "pos");
@@ -998,6 +999,7 @@ v3d_get_sand30_fs(struct pipe_context *pctx)
nir_var_shader_out,
glsl_uvec4, "f_color");
color_out->data.location = FRAG_RESULT_COLOR;
b.shader->info.outputs_written |= BITFIELD_BIT(FRAG_RESULT_COLOR);
nir_variable *pos_in =
nir_variable_create(b.shader, nir_var_shader_in, vec4, "pos");


@@ -3368,9 +3368,8 @@ begin_rendering(struct zink_context *ctx, bool check_msaa_expand)
VK_TRUE,
ctx->gfx_pipeline_state.rast_samples + 1,
};
ctx->dynamic_fb.info.pNext = ctx->transient_attachments && !ctx->blitting && has_msrtss ? &msrtss : NULL;
if (has_msrtss && !ctx->blitting)
ctx->dynamic_fb.info.pNext = ctx->transient_attachments ? &msrtss : NULL;
VKCTX(CmdBeginRendering)(ctx->bs->cmdbuf, &ctx->dynamic_fb.info);
ctx->in_rp = true;
return clear_buffers;


@@ -1,6 +1,14 @@
# Copyright © 2017 Dylan Baker
# SPDX-License-Identifier: MIT
libradeonwinsys_deps = [idep_mesautil, dep_libdrm]
libradeonwinsys_c_args = []
if with_gallium_radeonsi
libradeonwinsys_deps += [idep_amdgfxregs_h]
libradeonwinsys_c_args = ['-DHAVE_GALLIUM_RADEONSI']
endif
libradeonwinsys = static_library(
'radeonwinsys',
files('radeon_drm_bo.c',
@@ -14,5 +22,6 @@ libradeonwinsys = static_library(
'radeon_surface.h'),
include_directories : [inc_src, inc_include, inc_gallium, inc_gallium_aux],
gnu_symbol_visibility : 'hidden',
dependencies : [idep_mesautil, dep_libdrm],
c_args : libradeonwinsys_c_args,
dependencies : libradeonwinsys_deps,
)


@@ -8,6 +8,10 @@
#include "radeon_drm_bo.h"
#include "radeon_drm_cs.h"
#ifdef HAVE_GALLIUM_RADEONSI
#include "amdgfxregs.h"
#endif
#include "util/os_file.h"
#include "util/simple_mtx.h"
#include "util/thread_sched.h"
@@ -105,6 +109,73 @@ static bool radeon_get_drm_value(int fd, unsigned request,
return true;
}
static void get_hs_info(struct radeon_info *info)
{
/* This is the size of all TCS outputs in memory per workgroup.
* Hawaii can't handle num_workgroups > 256 with 8K per workgroup, so use 4K.
*/
unsigned max_hs_out_vram_dwords_per_wg = info->family == CHIP_HAWAII ? 4096 : 8192;
unsigned max_workgroups_per_se;
#ifdef HAVE_GALLIUM_RADEONSI /* for gfx6+ register definitions */
unsigned max_hs_out_vram_dwords_enum = 0;
switch (max_hs_out_vram_dwords_per_wg) {
case 8192:
max_hs_out_vram_dwords_enum = V_03093C_X_8K_DWORDS;
break;
case 4096:
max_hs_out_vram_dwords_enum = V_03093C_X_4K_DWORDS;
break;
case 2048:
max_hs_out_vram_dwords_enum = V_03093C_X_2K_DWORDS;
break;
case 1024:
max_hs_out_vram_dwords_enum = V_03093C_X_1K_DWORDS;
break;
default:
UNREACHABLE("invalid TCS workgroup size");
}
#endif
/* Gfx7 should limit num_workgroups to 508 (127 per SE)
* Gfx6 should limit num_workgroups to 126 (63 per SE)
*/
if (info->gfx_level == GFX7) {
max_workgroups_per_se = 127;
} else {
max_workgroups_per_se = 63;
}
/* Limit to 4 workgroups per CU for TCS, which exhausts LDS if each workgroup occupies 16KB.
* Note that the offchip allocation isn't deallocated until the corresponding TES waves finish.
*/
unsigned num_offchip_wg_per_cu = 4;
unsigned num_workgroups_per_se = MIN2(num_offchip_wg_per_cu * info->max_good_cu_per_sa *
info->max_sa_per_se, max_workgroups_per_se);
unsigned num_workgroups = num_workgroups_per_se * info->max_se;
#ifdef HAVE_GALLIUM_RADEONSI /* for gfx6+ register definitions */
if (info->gfx_level == GFX7) {
info->hs_offchip_param = S_03093C_OFFCHIP_BUFFERING_GFX7(num_workgroups) |
S_03093C_OFFCHIP_GRANULARITY_GFX7(max_hs_out_vram_dwords_enum);
} else {
info->hs_offchip_param = S_0089B0_OFFCHIP_BUFFERING(num_workgroups) |
S_0089B0_OFFCHIP_GRANULARITY(max_hs_out_vram_dwords_enum);
}
#endif
/* The typical size of tess factors of 1 TCS workgroup if all patches are triangles. */
unsigned typical_tess_factor_size_per_wg = (192 / 3) * 16;
unsigned num_tess_factor_wg_per_cu = 3;
info->hs_offchip_workgroup_dw_size = max_hs_out_vram_dwords_per_wg;
info->tess_offchip_ring_size = num_workgroups * max_hs_out_vram_dwords_per_wg * 4;
info->tess_factor_ring_size = typical_tess_factor_size_per_wg * num_tess_factor_wg_per_cu *
info->max_good_cu_per_sa * info->max_sa_per_se * info->max_se;
info->total_tess_ring_size = info->tess_offchip_ring_size + info->tess_factor_ring_size;
}
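The sizing logic in `get_hs_info` above boils down to a handful of products: cap the off-chip workgroup count per SE, multiply out over the SEs, then size the off-chip and tess-factor rings from that. A minimal sketch under made-up GPU parameters (the CU/SA/SE counts passed in are illustrative, not values queried from the kernel):

```c
#define MIN2(a, b) ((a) < (b) ? (a) : (b))

struct hs_rings {
   unsigned offchip; /* tess_offchip_ring_size, in bytes */
   unsigned factor;  /* tess_factor_ring_size, in bytes */
   unsigned total;
};

/* Mirrors the ring sizing above: 4 off-chip workgroups per CU, capped at
 * 127 workgroups per SE on gfx7 and 63 on gfx6. */
static struct hs_rings size_hs_rings(unsigned dwords_per_wg, int is_gfx7,
                                     unsigned good_cu_per_sa,
                                     unsigned sa_per_se, unsigned num_se)
{
   unsigned max_wg_per_se = is_gfx7 ? 127 : 63;
   unsigned wg_per_se = MIN2(4 * good_cu_per_sa * sa_per_se, max_wg_per_se);
   unsigned num_wg = wg_per_se * num_se;
   /* Typical tess-factor size of one workgroup if all patches are triangles. */
   unsigned tf_per_wg = (192 / 3) * 16;

   struct hs_rings r;
   r.offchip = num_wg * dwords_per_wg * 4;
   r.factor = tf_per_wg * 3 * good_cu_per_sa * sa_per_se * num_se;
   r.total = r.offchip + r.factor;
   return r;
}
```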
/* Helper function to do the ioctls needed for setup and init. */
static bool do_winsys_init(struct radeon_drm_winsys *ws)
{
@@ -639,6 +710,9 @@ static bool do_winsys_init(struct radeon_drm_winsys *ws)
default:;
}
if (ws->gen == DRV_SI)
get_hs_info(&ws->info);
ws->check_vm = strstr(debug_get_option("R600_DEBUG", ""), "check_vm") != NULL ||
strstr(debug_get_option("AMD_DEBUG", ""), "check_vm") != NULL;
ws->noop_cs = debug_get_bool_option("RADEON_NOOP", false);


@@ -2181,7 +2181,11 @@ brw_postprocess_nir_opts(nir_shader *nir, const struct brw_compiler *compiler,
if (OPT(nir_lower_tex, &tex_options))
OPT(nir_lower_tex, &tex_options);
OPT(brw_nir_lower_mcs_fetch, devinfo);
/* MCS lowering can introduce u2u16 conversions. We need to lower those to
* make constant offsets detectable by brw_nir_texture_backend_opcode().
*/
if (OPT(brw_nir_lower_mcs_fetch, devinfo))
OPT(nir_opt_constant_folding);
const struct brw_nir_lower_texture_opts brw_tex_options = {
.combined_lod_and_array_index = compiler->devinfo->ver >= 20,


@@ -1128,7 +1128,25 @@ has_cross_lane_access(const brw_inst *inst)
for (unsigned s = 0; s < inst->sources; s++) {
if (inst->src[s].file == VGRF) {
if (inst->src[s].stride == 0)
/* The instruction reads a particular lane (only relevant with non
* scalar values, otherwise this is just the way we read uniform
* values produced in reduced SIMD size).
*/
if (!inst->src[s].is_scalar && inst->src[s].stride == 0)
return true;
} else if (inst->src[s].file == ARF &&
inst->src[s].nr >= BRW_ARF_FLAG &&
inst->src[s].nr < BRW_ARF_MASK) {
/* The instruction reads the flag register which represents states
* from all the lanes.
*
* Note that although this prevents moving instructions reading the
* flag registers past a HALT kind of instruction, this doesn't
* prevent the instructions that generated the flag value from moving
* on either side of the HALT instruction. So it's possible for
* ballot instructions to produce incorrect values when used in a
* shader with HALT.
*/
return true;
}
}


@@ -480,6 +480,9 @@ anv_CopyImageToMemory(
return VK_SUCCESS;
}
/* This functions copies from one image to another through an intermediate
* linear buffer.
*/
static void
copy_image_to_image(struct anv_device *device,
struct anv_image *src_image,
@@ -505,14 +508,11 @@ copy_image_to_image(struct anv_device *device,
isl_surf_get_tile_info(src_surf, &src_tile);
isl_surf_get_tile_info(dst_surf, &dst_tile);
uint32_t tile_width_B;
uint32_t tile_width_el, tile_height_el;
if (src_tile.phys_extent_B.w > dst_tile.phys_extent_B.w) {
tile_width_B = src_tile.phys_extent_B.w;
tile_width_el = src_tile.logical_extent_el.w;
tile_height_el = src_tile.logical_extent_el.h;
} else {
tile_width_B = dst_tile.phys_extent_B.w;
tile_width_el = dst_tile.logical_extent_el.w;
tile_height_el = dst_tile.logical_extent_el.h;
}
@@ -527,14 +527,18 @@ copy_image_to_image(struct anv_device *device,
VkExtent3D extent_el =
vk_extent3d_to_el(src_surf->format, region->extent);
uint32_t linear_stride_B;
/* linear-to-linear case */
if (tile_width_el == 1 && tile_height_el == 1) {
tile_width_el = MIN2(4096 / (src_tile.format_bpb / 8),
extent_el.width);
tile_height_el = 4096 / (tile_width_el * (src_tile.format_bpb / 8));
tile_width_B = tile_width_el * src_tile.format_bpb / 8;
linear_stride_B = tile_width_el * src_tile.format_bpb / 8;
} else {
linear_stride_B = src_tile.logical_extent_el.w * src_tile.format_bpb / 8;
}
uint32_t layer_count =
vk_image_subresource_layer_count(&src_image->vk, &region->srcSubresource);
for (uint32_t a = 0; a < layer_count; a++) {
@@ -559,7 +563,7 @@ copy_image_to_image(struct anv_device *device,
src_binding,
src_anv_surf->memory_range.offset,
tmp_map,
tile_width_B,
linear_stride_B,
&src_offset, &extent,
region->srcSubresource.mipLevel,
region->srcSubresource.baseArrayLayer,
@@ -570,7 +574,7 @@ copy_image_to_image(struct anv_device *device,
dst_binding,
dst_anv_surf->memory_range.offset,
tmp_map,
tile_width_B,
linear_stride_B,
&dst_offset, &extent,
region->dstSubresource.mipLevel,
region->dstSubresource.baseArrayLayer,
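The hunks above replace a misused tile width with an explicit staging stride. The selection can be sketched as follows, assuming the linear case is detected by a 1x1 logical "tile" and that one staging row should stay under roughly 4096 bytes (the parameter names are illustrative stand-ins for the `isl_tile_info` fields):

```c
#define MIN2(a, b) ((a) < (b) ? (a) : (b))

/* Sketch of the linear_stride_B choice above: for linear surfaces, derive a
 * stride from a capped row width; for tiled surfaces, use the tile's logical
 * row width in elements times the bytes per element. */
static unsigned linear_stride_bytes(unsigned tile_w_el, unsigned tile_h_el,
                                    unsigned extent_w_el, unsigned bpb,
                                    unsigned logical_tile_w_el)
{
   if (tile_w_el == 1 && tile_h_el == 1) {
      /* linear-to-linear: cap the staging row near 4096 bytes */
      unsigned w = MIN2(4096 / (bpb / 8), extent_w_el);
      return w * bpb / 8;
   }
   return logical_tile_w_el * bpb / 8;
}
```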


@@ -588,8 +588,9 @@ impl RegLatencySM80 {
| FP16 | FP16_Alu | FP16_F32 => 1,
HFMA2_MMA | RedirectedFP64 => pred(has_pred, 3, 3),
Clmad => pred(has_pred, 5, 3),
IMMA_88 | MMA_1x_collect => pred(has_pred, 8, 1),
MMA_2x_collect => pred(has_pred, 12, 1),
IMMA_88 => pred(has_pred, 8, 1),
MMA_1x_collect => pred(has_pred, 11, 1),
MMA_2x_collect => pred(has_pred, 19, 1),
DMMA => pred(has_pred, 20, 1),
Cbu => 1,
Decoupled => 1,
@@ -603,8 +604,9 @@ impl RegLatencySM80 {
| IMADWideWriteDH | FP16 | FP16_Alu | FP16_F32 => 1,
HFMA2_MMA | RedirectedFP64 => pred(has_pred, 3, 1),
Clmad => pred(has_pred, 5, 1),
IMMA_88 | MMA_1x_collect => 8,
MMA_2x_collect => 12,
IMMA_88 => 8,
MMA_1x_collect => 11,
MMA_2x_collect => 19,
DMMA => 20,
Cbu => 1,
Decoupled => 1,
@@ -620,8 +622,9 @@ impl RegLatencySM80 {
IMADWideWriteDH => pred(has_pred, 1, 1),
HFMA2_MMA | RedirectedFP64 => pred(has_pred, 3, 3),
Clmad => pred(has_pred, 5, 3),
IMMA_88 | MMA_1x_collect => pred(has_pred, 8, 1),
MMA_2x_collect => pred(has_pred, 12, 1),
IMMA_88 => pred(has_pred, 8, 1),
MMA_1x_collect => pred(has_pred, 11, 1),
MMA_2x_collect => pred(has_pred, 19, 1),
DMMA => pred(has_pred, 20, 1),
Cbu => 1,
Decoupled => 1,
@@ -639,8 +642,9 @@ impl RegLatencySM80 {
FP16 | FP16_Alu | FP16_F32 => pred(has_pred, 1, 2),
HFMA2_MMA | RedirectedFP64 => pred(has_pred, 5, 3),
Clmad => pred(has_pred, 5, 5),
IMMA_88 | MMA_1x_collect => pred(has_pred, 8, 3),
MMA_2x_collect => pred(has_pred, 12, 3),
IMMA_88 => pred(has_pred, 8, 3),
MMA_1x_collect => pred(has_pred, 11, 3),
MMA_2x_collect => pred(has_pred, 19, 3),
DMMA => pred(has_pred, 20, 3),
Cbu => 1,
Decoupled => 1,
@@ -657,8 +661,9 @@ impl RegLatencySM80 {
| FP16_F32 => 1,
HFMA2_MMA | RedirectedFP64 => pred(has_pred, 5, 1),
Clmad => pred(has_pred, 5, 3),
IMMA_88 | MMA_1x_collect => pred(has_pred, 8, 1),
MMA_2x_collect => pred(has_pred, 12, 1),
IMMA_88 => pred(has_pred, 8, 1),
MMA_1x_collect => pred(has_pred, 11, 1),
MMA_2x_collect => pred(has_pred, 19, 1),
DMMA => pred(has_pred, 20, 1),
Cbu => 1,
Decoupled => 1,
@@ -675,8 +680,9 @@ impl RegLatencySM80 {
| FP16_F32 => 1,
HFMA2_MMA | RedirectedFP64 => pred(has_pred, 3, 3),
Clmad => pred(has_pred, 5, 3),
IMMA_88 | MMA_1x_collect => pred(has_pred, 8, 1),
MMA_2x_collect => pred(has_pred, 12, 1),
IMMA_88 => pred(has_pred, 8, 1),
MMA_1x_collect => pred(has_pred, 11, 1),
MMA_2x_collect => pred(has_pred, 19, 1),
DMMA => pred(has_pred, 20, 1),
Cbu => 1,
Decoupled => 1,
@@ -690,8 +696,9 @@ impl RegLatencySM80 {
| IMADWideWriteDH | FP16 | FP16_Alu | FP16_F32 => 1,
HFMA2_MMA | RedirectedFP64 => pred(has_pred, 3, 2),
Clmad => pred(has_pred, 5, 2),
IMMA_88 | MMA_1x_collect => 8,
MMA_2x_collect => 12,
IMMA_88 => 8,
MMA_1x_collect => 11,
MMA_2x_collect => 19,
DMMA => 20,
Cbu => 1,
Decoupled => 1,
@@ -706,8 +713,9 @@ impl RegLatencySM80 {
HFMA2_MMA => 2,
RedirectedFP64 => 3,
Clmad => pred(has_pred, 5, 1),
IMMA_88 | MMA_1x_collect => 8,
MMA_2x_collect => 12,
IMMA_88 => 8,
MMA_1x_collect => 11,
MMA_2x_collect => 19,
DMMA => 20,
Cbu => 1,
Decoupled => 1,
@@ -722,8 +730,9 @@ impl RegLatencySM80 {
HFMA2_MMA => 2,
RedirectedFP64 => 2,
Clmad => pred(has_pred, 4, 2),
IMMA_88 | MMA_1x_collect => 7,
MMA_2x_collect => 11,
IMMA_88 => 7,
MMA_1x_collect => 10,
MMA_2x_collect => 18,
DMMA => 19,
Cbu => 1,
Decoupled => 1,
@@ -736,8 +745,9 @@ impl RegLatencySM80 {
CoupledAlu | CoupledDisp64 | CoupledFMA | IMADWideWriteDL
| IMADWideWriteDH | FP16 | FP16_Alu | FP16_F32 | HFMA2_MMA
| RedirectedFP64 | Clmad => 2,
IMMA_88 | MMA_1x_collect => 7,
MMA_2x_collect => 11,
IMMA_88 => 7,
MMA_1x_collect => 10,
MMA_2x_collect => 18,
DMMA => 19,
Cbu => 1,
Decoupled => 1,
@@ -750,8 +760,9 @@ impl RegLatencySM80 {
CoupledAlu | CoupledDisp64 | CoupledFMA | IMADWideWriteDL
| IMADWideWriteDH | FP16 | FP16_Alu | FP16_F32 | HFMA2_MMA
| RedirectedFP64 | Clmad => 2,
IMMA_88 | MMA_1x_collect => 4,
MMA_2x_collect => 8,
IMMA_88 => 4,
MMA_1x_collect => 8,
MMA_2x_collect => 16,
DMMA => 17,
Cbu => 1,
Decoupled => 1,
@@ -764,8 +775,9 @@ impl RegLatencySM80 {
CoupledAlu | CoupledDisp64 | CoupledFMA | IMADWideWriteDL
| IMADWideWriteDH | FP16 | FP16_Alu | FP16_F32 | HFMA2_MMA
| RedirectedFP64 | Clmad => 2,
IMMA_88 | MMA_1x_collect => 4,
MMA_2x_collect => 8,
IMMA_88 => 4,
MMA_1x_collect => 8,
MMA_2x_collect => 16,
DMMA => 16,
Cbu => 1,
Decoupled => 1,
@@ -781,8 +793,9 @@ impl RegLatencySM80 {
}
HFMA2_MMA | RedirectedFP64 => pred(has_pred, 1, 9),
Clmad => pred(has_pred, 1, 11),
IMMA_88 | MMA_1x_collect => pred(has_pred, 7, 6),
MMA_2x_collect => pred(has_pred, 11, 6),
IMMA_88 => pred(has_pred, 7, 6),
MMA_1x_collect => pred(has_pred, 10, 5),
MMA_2x_collect => pred(has_pred, 18, 5),
DMMA => pred(has_pred, 19, 6),
Cbu => 1,
Decoupled => 1,
@@ -801,15 +814,25 @@ impl RegLatencySM80 {
use RegLatencySM80::*;
match writer {
CoupledAlu | CoupledDisp64 | CoupledFMA | IMADWideWriteDL
| IMADWideWriteDH | FP16 | FP16_Alu | FP16_F32 | HFMA2_MMA
| RedirectedFP64 => match reader {
| IMADWideWriteDH | FP16 | FP16_Alu | FP16_F32 | HFMA2_MMA => {
match reader {
MMA_2x_collect => 7,
_ => 1,
},
Clmad | IMMA_88 | MMA_1x_collect | MMA_2x_collect | DMMA | Cbu
}
}
RedirectedFP64 => 1,
Clmad | IMMA_88 | MMA_1x_collect | MMA_2x_collect | DMMA
| Decoupled | DecoupledAgu => match reader {
CoupledAlu | CoupledDisp64 | CoupledFMA | IMADWideReadAB
| IMADWideReadCL | IMADWideReadCH => 2,
| IMADWideReadCL | IMADWideReadCH | FP16 | FP16_Alu
| FP16_F32 | HFMA2_MMA => 2,
_ => 1,
},
Cbu => match reader {
CoupledAlu | CoupledDisp64 | CoupledFMA | IMADWideReadAB
| IMADWideReadCL | IMADWideReadCH | FP16 | FP16_Alu
| FP16_F32 | HFMA2_MMA => 2,
MMA_2x_collect => 7,
_ => 1,
},
_ => {


@@ -671,6 +671,43 @@ try_lower_cmat_load_to_ldsm(nir_builder *b, nir_intrinsic_instr *intr)
.matrix_layout = layout);
}
static nir_deref_instr*
get_cmat_component_deref(nir_builder *b, nir_intrinsic_instr *intr,
nir_def *lane_id, unsigned idx)
{
unsigned deref_src = intr->intrinsic == nir_intrinsic_cmat_store ? 0 : 1;
unsigned cmat_src = intr->intrinsic == nir_intrinsic_cmat_store ? 1 : 0;
const struct glsl_cmat_description desc = cmat_src_desc(intr->src[cmat_src]);
nir_deref_instr *deref = nir_def_as_deref(intr->src[deref_src].ssa);
const enum glsl_matrix_layout layout = nir_intrinsic_matrix_layout(intr);
nir_def *stride = intr->src[2].ssa;
nir_def *col_offset;
nir_def *row_offset;
compute_matrix_offsets(b, desc, layout, lane_id, idx,
&col_offset, &row_offset);
row_offset = nir_imul(b, row_offset, stride);
col_offset = nir_u2uN(b, col_offset, deref->def.bit_size);
row_offset = nir_u2uN(b, row_offset, deref->def.bit_size);
/* We have to ignore the incoming stride, but have to choose the type of
* the pointer as the declared stride is in multiples of the pointer type. */
deref = nir_build_deref_cast(
b, &deref->def, deref->modes,
deref->type,
glsl_get_vector_elements(deref->type) * glsl_get_bit_size(deref->type) / 8
);
deref = nir_build_deref_ptr_as_array(b, deref, row_offset);
deref = nir_build_deref_cast(
b, &deref->def, deref->modes,
glsl_scalar_type(desc.element_type),
glsl_base_type_bit_size(desc.element_type) / 8);
return nir_build_deref_ptr_as_array(b, deref, col_offset);
}
static void
lower_cmat_load(nir_builder *b, nir_intrinsic_instr *intr)
{
@@ -682,10 +719,6 @@ lower_cmat_load(nir_builder *b, nir_intrinsic_instr *intr)
const struct glsl_cmat_description desc = cmat_src_desc(intr->src[0]);
const unsigned length = get_cmat_length(desc);
const enum glsl_matrix_layout layout = nir_intrinsic_matrix_layout(intr);
nir_deref_instr *deref = nir_def_as_deref(intr->src[1].ssa);
nir_def *stride = intr->src[2].ssa;
nir_def *vars[NIR_MAX_VEC_COMPONENTS];
for (unsigned i = 0; i < length; ++i)
@@ -694,26 +727,8 @@ lower_cmat_load(nir_builder *b, nir_intrinsic_instr *intr)
nir_def *lane_id = nir_load_subgroup_invocation(b);
for (unsigned idx = 0; idx < length; idx++) {
nir_def *col_offset;
nir_def *row_offset;
compute_matrix_offsets(b, desc, layout, lane_id, idx,
&col_offset, &row_offset);
row_offset = nir_imul(b, row_offset, stride);
col_offset = nir_u2uN(b, col_offset, deref->def.bit_size);
row_offset = nir_u2uN(b, row_offset, deref->def.bit_size);
nir_deref_instr *iter_deref =
nir_build_deref_ptr_as_array(b, deref, row_offset);
iter_deref = nir_build_deref_cast(
b, &iter_deref->def, deref->modes,
glsl_scalar_type(desc.element_type),
glsl_base_type_bit_size(desc.element_type) / 8);
iter_deref =
nir_build_deref_ptr_as_array(b, iter_deref, col_offset);
get_cmat_component_deref(b, intr, lane_id, idx);
vars[idx] = nir_load_deref(b, iter_deref);
}
@@ -764,11 +779,6 @@ lower_cmat_instr(nir_builder *b,
}
case nir_intrinsic_cmat_store: {
enum glsl_matrix_layout layout = nir_intrinsic_matrix_layout(intr);
nir_deref_instr *deref = nir_src_as_deref(intr->src[0]);
nir_def *stride = intr->src[2].ssa;
const struct glsl_cmat_description desc = cmat_src_desc(intr->src[1]);
const unsigned length = get_cmat_length(desc);
nir_def *src = load_cmat_src(b, intr->src[1]);
@@ -780,26 +790,8 @@ lower_cmat_instr(nir_builder *b,
nir_def *lane_id = nir_load_subgroup_invocation(b);
for (unsigned idx = 0; idx < length; idx++) {
nir_def *col_offset;
nir_def *row_offset;
compute_matrix_offsets(b, desc, layout, lane_id, idx,
&col_offset, &row_offset);
row_offset = nir_imul(b, row_offset, stride);
col_offset = nir_u2uN(b, col_offset, deref->def.bit_size);
row_offset = nir_u2uN(b, row_offset, deref->def.bit_size);
nir_deref_instr *iter_deref =
nir_build_deref_ptr_as_array(b, deref, row_offset);
iter_deref = nir_build_deref_cast(
b, &iter_deref->def, deref->modes,
glsl_scalar_type(desc.element_type),
glsl_base_type_bit_size(desc.element_type) / 8);
iter_deref =
nir_build_deref_ptr_as_array(b, iter_deref, col_offset);
get_cmat_component_deref(b, intr, lane_id, idx);
nir_store_deref(b, iter_deref, vars[idx], 1);
}


@@ -531,6 +531,21 @@ nvk_cmd_flush_wait_dep(struct nvk_cmd_buffer *cmd,
{
enum nvk_barrier barriers = 0;
/* For asymmetric events, we don't know what the access flags will be yet.
* Handle this by setting access to everything.
*/
if (dep->dependencyFlags & VK_DEPENDENCY_ASYMMETRIC_EVENT_BIT_KHR) {
/* VUID-vkCmdSetEvent2-dependencyFlags-10785, 10786, 10787 */
assert(dep->memoryBarrierCount == 1 &&
dep->bufferMemoryBarrierCount == 0 &&
dep->imageMemoryBarrierCount == 0);
const VkMemoryBarrier2 *bar = &dep->pMemoryBarriers[0];
barriers |= nvk_barrier_flushes_waits(bar->srcStageMask,
VK_ACCESS_2_MEMORY_READ_BIT |
VK_ACCESS_2_MEMORY_WRITE_BIT);
}
for (uint32_t i = 0; i < dep->memoryBarrierCount; i++) {
const VkMemoryBarrier2 *bar = &dep->pMemoryBarriers[i];
barriers |= nvk_barrier_flushes_waits(bar->srcStageMask,


@@ -474,9 +474,6 @@ nvk_GetDescriptorSetLayoutSupport(VkDevice device,
assert(stride <= UINT8_MAX);
assert(util_is_power_of_two_nonzero(alignment));
variable_is_inline_uniform_block =
binding->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK;
if (flags & VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT) {
/* From the Vulkan 1.3.256 spec:
*
@@ -486,6 +483,9 @@ nvk_GetDescriptorSetLayoutSupport(VkDevice device,
*/
variable_count = MAX2(1, binding->descriptorCount);
variable_stride = stride;
variable_is_inline_uniform_block =
binding->descriptorType == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK;
} else {
/* Since we're aligning to the maximum and since this is just a
* check for whether or not the max buffer size is big enough, we
@@ -507,8 +507,6 @@ nvk_GetDescriptorSetLayoutSupport(VkDevice device,
if (pCreateInfo->flags &
VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR)
max_buffer_size = NVK_MAX_PUSH_DESCRIPTORS * nvk_max_descriptor_size(&pdev->info);
else if (variable_is_inline_uniform_block)
max_buffer_size = NVK_MAX_INLINE_UNIFORM_BLOCK_SIZE;
else
max_buffer_size = NVK_MAX_DESCRIPTOR_SET_SIZE;
@@ -519,12 +517,21 @@ nvk_GetDescriptorSetLayoutSupport(VkDevice device,
switch (ext->sType) {
case VK_STRUCTURE_TYPE_DESCRIPTOR_SET_VARIABLE_DESCRIPTOR_COUNT_LAYOUT_SUPPORT: {
VkDescriptorSetVariableDescriptorCountLayoutSupport *vs = (void *)ext;
uint32_t max_var_count;
if (variable_stride > 0) {
vs->maxVariableDescriptorCount =
max_var_count =
(max_buffer_size - non_variable_size) / variable_stride;
} else {
vs->maxVariableDescriptorCount = 0;
max_var_count = 0;
}
if (variable_is_inline_uniform_block) {
max_var_count =
MIN2(max_var_count, NVK_MAX_INLINE_UNIFORM_BLOCK_SIZE);
}
vs->maxVariableDescriptorCount = max_var_count;
break;
}
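
The reordered logic above moves the inline-uniform-block cap from the buffer-size side to the final count: the variable count is first computed from the buffer space left over after the non-variable bindings, then clamped, because an inline uniform block's descriptorCount is a byte size with its own dedicated limit. A minimal standalone sketch of that computation (the constant value and helper name are illustrative, not NVK's actual symbols):

```c
#include <stdbool.h>
#include <stdint.h>

/* Illustrative stand-in for NVK_MAX_INLINE_UNIFORM_BLOCK_SIZE. */
#define MAX_INLINE_UNIFORM_BLOCK_SIZE 4096u

/* maxVariableDescriptorCount: the space left after the non-variable
 * bindings, divided by the per-descriptor stride, then clamped for inline
 * uniform blocks, whose "count" is a byte count with a dedicated limit. */
static uint32_t
max_variable_count(uint64_t max_buffer_size, uint64_t non_variable_size,
                   uint32_t variable_stride, bool is_inline_uniform_block)
{
   uint32_t count = 0;
   if (variable_stride > 0)
      count = (uint32_t)((max_buffer_size - non_variable_size) /
                         variable_stride);
   if (is_inline_uniform_block && count > MAX_INLINE_UNIFORM_BLOCK_SIZE)
      count = MAX_INLINE_UNIFORM_BLOCK_SIZE;
   return count;
}
```

With this shape, a non-inline binding still sees the full buffer-derived count, while an inline uniform block is capped even when the buffer could fit more bytes.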


@@ -100,6 +100,12 @@ compile(void *memctx, const uint32_t *spirv, size_t spirv_size, unsigned arch)
nir_shader *nir =
spirv_to_nir(spirv, spirv_size / 4, NULL, 0, MESA_SHADER_KERNEL,
"library", &spirv_options, nir_options);
/* Workgroup size may be different between different entrypoints, so we
* mark it as variable to prevent it from being lowered to a constant while
* we are still processing all entrypoints together. This is temporary;
* nir_precompiled_build_variant will set the fixed workgroup size for each
* entrypoint and set workgroup_size_variable back to false. */
nir->info.workgroup_size_variable = true;
nir_validate_shader(nir, "after spirv_to_nir");
nir_validate_ssa_dominance(nir, "after spirv_to_nir");
ralloc_steal(memctx, nir);


@@ -2557,7 +2557,8 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
uint32_t patch_attribs =
cmdbuf->state.gfx.vi.attribs_changing_on_base_instance;
uint32_t vs_res_table_size =
panvk_shader_res_table_count(&cmdbuf->state.gfx.vs.desc);
panvk_shader_res_table_count(&cmdbuf->state.gfx.vs.desc) *
pan_size(RESOURCE);
bool patch_faus = shader_uses_sysval(vs, graphics, vs.first_vertex) ||
shader_uses_sysval(vs, graphics, vs.base_instance);
struct cs_index draw_params_addr = cs_scratch_reg64(b, 0);
@@ -2583,6 +2584,9 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
if (patch_faus)
cs_move64_to(b, vs_fau_addr, cmdbuf->state.gfx.vs.push_uniforms);
if (patch_attribs != 0)
cs_move64_to(b, vs_drv_set, vs_desc_state->driver_set.dev_addr);
cs_move64_to(b, draw_params_addr, draw->indirect.buffer_dev_addr);
cs_move32_to(b, draw_id, 0);
@@ -2610,8 +2614,6 @@ panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
}
if (patch_attribs != 0) {
cs_move64_to(b, vs_drv_set, vs_desc_state->driver_set.dev_addr);
/* If firstInstance=0, skip the offset adjustment. */
cs_if(b, MALI_CS_CONDITION_NEQUAL,
cs_sr_reg32(b, IDVS, INSTANCE_OFFSET)) {


@@ -922,12 +922,11 @@ panvk_per_arch(get_physical_device_properties)(
MAX_INLINE_UNIFORM_BLOCK_DESCRIPTORS,
.maxInlineUniformTotalSize =
MAX_INLINE_UNIFORM_BLOCK_DESCRIPTORS * MAX_INLINE_UNIFORM_BLOCK_SIZE,
.integerDotProduct8BitUnsignedAccelerated = true,
.integerDotProduct8BitSignedAccelerated = true,
.integerDotProduct8BitUnsignedAccelerated = false,
.integerDotProduct8BitSignedAccelerated = false,
.integerDotProduct8BitMixedSignednessAccelerated = false,
.integerDotProduct4x8BitPackedUnsignedAccelerated = true,
.integerDotProduct4x8BitPackedSignedAccelerated = true,
.integerDotProduct4x8BitPackedMixedSignednessAccelerated = false,
.integerDotProduct4x8BitPackedUnsignedAccelerated = PAN_ARCH >= 9,
.integerDotProduct4x8BitPackedSignedAccelerated = PAN_ARCH >= 9,
.integerDotProduct16BitUnsignedAccelerated = false,
.integerDotProduct16BitSignedAccelerated = false,
.integerDotProduct16BitMixedSignednessAccelerated = false,
@@ -940,8 +939,8 @@ panvk_per_arch(get_physical_device_properties)(
.integerDotProductAccumulatingSaturating8BitUnsignedAccelerated = false,
.integerDotProductAccumulatingSaturating8BitSignedAccelerated = false,
.integerDotProductAccumulatingSaturating8BitMixedSignednessAccelerated = false,
.integerDotProductAccumulatingSaturating4x8BitPackedUnsignedAccelerated = false,
.integerDotProductAccumulatingSaturating4x8BitPackedSignedAccelerated = false,
.integerDotProductAccumulatingSaturating4x8BitPackedUnsignedAccelerated = PAN_ARCH >= 9,
.integerDotProductAccumulatingSaturating4x8BitPackedSignedAccelerated = PAN_ARCH >= 9,
.integerDotProductAccumulatingSaturating4x8BitPackedMixedSignednessAccelerated = false,
.integerDotProductAccumulatingSaturating16BitUnsignedAccelerated = false,
.integerDotProductAccumulatingSaturating16BitSignedAccelerated = false,


@@ -28,6 +28,7 @@
#include <stddef.h>
#include "detect_arch.h"
#include "u_cpu_detect.h"
#ifdef __cplusplus
extern "C" {
@@ -44,7 +45,13 @@ util_has_cache_ops(void)
return false;
#endif
return DETECT_ARCH_X86 || DETECT_ARCH_X86_64 || DETECT_ARCH_AARCH64;
#if DETECT_ARCH_X86
return util_get_cpu_caps()->has_sse2;
#elif DETECT_ARCH_X86_64 || DETECT_ARCH_AARCH64
return true;
#else
return false;
#endif
}
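
The rewritten check above replaces the pure architecture test with a runtime capability test on 32-bit x86, since CLFLUSH was introduced with SSE2 and pre-SSE2 CPUs must report no cache ops. A self-contained sketch of the same shape, using compiler-defined macros in place of Mesa's DETECT_ARCH_* and util_get_cpu_caps() (the __SSE2__ fallback here stands in for a real runtime CPUID probe):

```c
#include <stdbool.h>

static bool has_cache_ops(void)
{
#if defined(__i386__) || defined(_M_IX86)
   /* 32-bit x86: CLFLUSH arrived with SSE2, so gate on it. A real
    * implementation would probe CPUID at runtime (as Mesa's
    * util_get_cpu_caps() does); this sketch settles for the compile-time
    * baseline, which is conservative on pre-SSE2 targets. */
#  ifdef __SSE2__
   return true;
#  else
   return false;
#  endif
#elif defined(__x86_64__) || defined(_M_X64) || \
      defined(__aarch64__) || defined(_M_ARM64)
   /* SSE2 is baseline on x86-64; AArch64 has DC CVAC. */
   return true;
#else
   return false;
#endif
}
```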
/** Returns the cache granularity


@@ -188,7 +188,14 @@ libmesa_util_links = []
if host_machine.cpu_family() == 'aarch64' and cc.get_id() != 'msvc'
files_mesa_util += files('cache_ops_aarch64.c')
elif host_machine.cpu_family() in ['x86', 'x86_64'] and cc.get_id() != 'msvc'
files_mesa_util += files('cache_ops_x86.c')
libmesa_util_clflush = static_library(
'mesa_util_clflush',
['cache_ops_x86.c'],
include_directories : [inc_util],
c_args : [no_override_init_args, sse2_args],
gnu_symbol_visibility : 'hidden',
)
libmesa_util_links += [libmesa_util_clflush]
if with_clflushopt
libmesa_util_clflushopt = static_library(
'mesa_util_clflushopt',
@@ -197,7 +204,7 @@ elif host_machine.cpu_family() in ['x86', 'x86_64'] and cc.get_id() != 'msvc'
c_args : [no_override_init_args] + clflushopt_args,
gnu_symbol_visibility : 'hidden',
)
libmesa_util_links += libmesa_util_clflushopt
libmesa_util_links += [libmesa_util_clflushopt]
endif
else
files_mesa_util += files('cache_ops_null.c')


@@ -757,6 +757,7 @@ wsi_create_native_image_mem(const struct wsi_swapchain *chain,
* handling implicit sync ourselves.
*/
.implicit_sync = !info->explicit_sync && !chain->dma_buf_semaphore,
.dma_buf_sync_file = chain->dma_buf_semaphore,
};
const VkExportMemoryAllocateInfo memory_export_info = {
.sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO,


@@ -201,9 +201,6 @@ struct wsi_swapchain {
VkAllocationCallbacks alloc;
VkFence* fences;
VkPresentModeKHR present_mode;
VkPresentGravityFlagsEXT present_gravity_x;
VkPresentGravityFlagsEXT present_gravity_y;
/**
* Timeline for presents completing according to VK_KHR_present_wait. The
* present should complete as close as possible (before or after!) to the


@@ -925,6 +925,7 @@ struct Colorspace {
enum wp_color_manager_v1_primaries primaries;
enum wp_color_manager_v1_transfer_function tf;
bool should_use_hdr_metadata;
bool needs_extended_range;
};
struct Colorspace colorspace_mapping[] = {
{
@@ -932,48 +933,56 @@ struct Colorspace colorspace_mapping[] = {
.primaries = WP_COLOR_MANAGER_V1_PRIMARIES_SRGB,
.tf = WP_COLOR_MANAGER_V1_TRANSFER_FUNCTION_SRGB,
.should_use_hdr_metadata = false,
.needs_extended_range = false,
},
{
.colorspace = VK_COLOR_SPACE_DISPLAY_P3_NONLINEAR_EXT,
.primaries = WP_COLOR_MANAGER_V1_PRIMARIES_DISPLAY_P3,
.tf = WP_COLOR_MANAGER_V1_TRANSFER_FUNCTION_SRGB,
.should_use_hdr_metadata = false,
.needs_extended_range = false,
},
{
.colorspace = VK_COLOR_SPACE_EXTENDED_SRGB_LINEAR_EXT,
.primaries = WP_COLOR_MANAGER_V1_PRIMARIES_SRGB,
.tf = WP_COLOR_MANAGER_V1_TRANSFER_FUNCTION_EXT_LINEAR,
.should_use_hdr_metadata = true,
.needs_extended_range = true,
},
{
.colorspace = VK_COLOR_SPACE_DISPLAY_P3_LINEAR_EXT,
.primaries = WP_COLOR_MANAGER_V1_PRIMARIES_DISPLAY_P3,
.tf = WP_COLOR_MANAGER_V1_TRANSFER_FUNCTION_EXT_LINEAR,
.should_use_hdr_metadata = false,
.needs_extended_range = false,
},
{
.colorspace = VK_COLOR_SPACE_BT709_LINEAR_EXT,
.primaries = WP_COLOR_MANAGER_V1_PRIMARIES_SRGB,
.tf = WP_COLOR_MANAGER_V1_TRANSFER_FUNCTION_EXT_LINEAR,
.should_use_hdr_metadata = false,
.needs_extended_range = false,
},
{
.colorspace = VK_COLOR_SPACE_BT709_NONLINEAR_EXT,
.primaries = WP_COLOR_MANAGER_V1_PRIMARIES_SRGB,
.tf = WP_COLOR_MANAGER_V1_TRANSFER_FUNCTION_BT1886,
.should_use_hdr_metadata = false,
.needs_extended_range = false,
},
{
.colorspace = VK_COLOR_SPACE_BT2020_LINEAR_EXT,
.primaries = WP_COLOR_MANAGER_V1_PRIMARIES_BT2020,
.tf = WP_COLOR_MANAGER_V1_TRANSFER_FUNCTION_EXT_LINEAR,
.should_use_hdr_metadata = false,
.needs_extended_range = false,
},
{
.colorspace = VK_COLOR_SPACE_HDR10_ST2084_EXT,
.primaries = WP_COLOR_MANAGER_V1_PRIMARIES_BT2020,
.tf = WP_COLOR_MANAGER_V1_TRANSFER_FUNCTION_ST2084_PQ,
.should_use_hdr_metadata = true,
.needs_extended_range = false,
},
/* VK_COLOR_SPACE_DOLBYVISION_EXT is left out because it's deprecated */
{
@@ -981,12 +990,14 @@ struct Colorspace colorspace_mapping[] = {
.primaries = WP_COLOR_MANAGER_V1_PRIMARIES_BT2020,
.tf = WP_COLOR_MANAGER_V1_TRANSFER_FUNCTION_HLG,
.should_use_hdr_metadata = true,
.needs_extended_range = false,
},
{
.colorspace = VK_COLOR_SPACE_ADOBERGB_LINEAR_EXT,
.primaries = WP_COLOR_MANAGER_V1_PRIMARIES_ADOBE_RGB,
.tf = WP_COLOR_MANAGER_V1_TRANSFER_FUNCTION_EXT_LINEAR,
.should_use_hdr_metadata = false,
.needs_extended_range = false,
},
/* VK_COLOR_SPACE_ADOBERGB_NONLINEAR_EXT is left out because there's no
* exactly matching transfer function in the Wayland protocol */
@@ -1033,6 +1044,9 @@ wsi_wl_display_determine_colorspaces(struct wsi_wl_display *display)
continue;
if (!vector_contains(tfs, colorspace_mapping[i].tf))
continue;
if (!display->color_features.extended_target_volume &&
colorspace_mapping[i].needs_extended_range)
continue;
VkColorSpaceKHR *new_cs = u_vector_add(&display->colorspaces);
if (!new_cs)
return -1;
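
The new check above stops advertising colorspaces that need values outside the [0, 1] range when the compositor does not expose extended target volume. A minimal sketch of the resulting advertisement predicate (struct and parameter names are illustrative, not the real wsi_wl_display ones):

```c
#include <stdbool.h>

/* Illustrative mirror of the Wayland WSI colorspace filter: an entry is
 * advertised only if the compositor supports its primaries and transfer
 * function and, for extended-range spaces such as
 * VK_COLOR_SPACE_EXTENDED_SRGB_LINEAR_EXT, the extended-target-volume
 * feature. */
struct colorspace_entry {
   bool needs_extended_range;
};

static bool
should_advertise(const struct colorspace_entry *e,
                 bool primaries_supported, bool tf_supported,
                 bool extended_target_volume)
{
   if (!primaries_supported || !tf_supported)
      return false;
   if (e->needs_extended_range && !extended_target_volume)
      return false;
   return true;
}
```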


@@ -810,10 +810,11 @@ x11_surface_get_capabilities2(VkIcdSurfaceBase *icd_surface,
}
case VK_STRUCTURE_TYPE_SURFACE_PRESENT_SCALING_CAPABILITIES_EXT: {
/* Unsupported. */
VkSurfacePresentScalingCapabilitiesEXT *scaling = (void *)ext;
scaling->supportedPresentScaling = VK_PRESENT_SCALING_ONE_TO_ONE_BIT_EXT;
scaling->supportedPresentGravityX = VK_PRESENT_GRAVITY_MIN_BIT_EXT | VK_PRESENT_GRAVITY_MAX_BIT_EXT | VK_PRESENT_GRAVITY_CENTERED_BIT_EXT;
scaling->supportedPresentGravityY = VK_PRESENT_GRAVITY_MIN_BIT_EXT | VK_PRESENT_GRAVITY_MAX_BIT_EXT | VK_PRESENT_GRAVITY_CENTERED_BIT_EXT;
scaling->supportedPresentScaling = 0;
scaling->supportedPresentGravityX = 0;
scaling->supportedPresentGravityY = 0;
scaling->minScaledImageExtent = caps->surfaceCapabilities.minImageExtent;
scaling->maxScaledImageExtent = caps->surfaceCapabilities.maxImageExtent;
break;
@@ -1458,46 +1459,7 @@ x11_present_to_x11_dri3(struct x11_swapchain *chain, uint32_t image_index,
.serial = serial,
};
int16_t x_off = 0;
int16_t y_off = 0;
xcb_get_geometry_reply_t *geometry =
xcb_get_geometry_reply(chain->conn, xcb_get_geometry(chain->conn, chain->window), NULL);
if (geometry) {
switch (chain->base.present_gravity_x) {
case VK_PRESENT_GRAVITY_MIN_BIT_EXT:
x_off = 0;
break;
case VK_PRESENT_GRAVITY_MAX_BIT_EXT:
x_off = geometry->width - chain->extent.width;
break;
case VK_PRESENT_GRAVITY_CENTERED_BIT_EXT:
x_off = (geometry->width / 2) - (chain->extent.width / 2);
break;
default:
x_off = 0;
}
switch (chain->base.present_gravity_y) {
case VK_PRESENT_GRAVITY_MIN_BIT_EXT:
y_off = 0;
break;
case VK_PRESENT_GRAVITY_MAX_BIT_EXT:
y_off = geometry->height - chain->extent.height;
break;
case VK_PRESENT_GRAVITY_CENTERED_BIT_EXT:
y_off = (geometry->height / 2) - (chain->extent.height / 2);
break;
default:
y_off = 0;
}
free(geometry);
}
xcb_void_cookie_t cookie;
#ifdef HAVE_DRI3_EXPLICIT_SYNC
if (chain->base.image_info.explicit_sync) {
uint64_t acquire_point = image->base.explicit_sync[WSI_ES_ACQUIRE].timeline;
@@ -1509,8 +1471,8 @@ x11_present_to_x11_dri3(struct x11_swapchain *chain, uint32_t image_index,
serial,
0, /* valid */
image->update_area, /* update */
x_off, /* x_off */
y_off, /* y_off */
0, /* x_off */
0, /* y_off */
XCB_NONE, /* target_crtc */
image->dri3_syncobj[WSI_ES_ACQUIRE], /* acquire_syncobj */
image->dri3_syncobj[WSI_ES_RELEASE], /* release_syncobj */
@@ -1529,8 +1491,8 @@ x11_present_to_x11_dri3(struct x11_swapchain *chain, uint32_t image_index,
serial,
0, /* valid */
image->update_area, /* update */
x_off, /* x_off */
y_off, /* y_off */
0, /* x_off */
0, /* y_off */
XCB_NONE, /* target_crtc */
XCB_NONE,
image->sync_fence,
@@ -2788,14 +2750,6 @@ x11_surface_create_swapchain(VkIcdSurfaceBase *icd_surface,
chain->has_mit_shm = wsi_conn->has_mit_shm;
chain->has_async_may_tear = present_caps & XCB_PRESENT_CAPABILITY_ASYNC_MAY_TEAR;
const VkSwapchainPresentScalingCreateInfoEXT* scaling_info =
vk_find_struct_const(pCreateInfo->pNext, SWAPCHAIN_PRESENT_SCALING_CREATE_INFO_EXT);
if (scaling_info) {
chain->base.present_gravity_x = scaling_info->presentGravityX;
chain->base.present_gravity_y = scaling_info->presentGravityY;
}
/* When images in the swapchain don't fit the window, X can still present them, but it won't
* happen by flip, only by copy. So this is a suboptimal copy, because if the client would change
* the chain extents X may be able to flip