nvk: Rework TLS/SLM and image/sampler table handling

When Dave originally added SLM, he did so with a preamble pushbuf which
sets up the SLM area before executing any actual command buffers.  This
has the advantage that the SLM BO is never directly referenced in any
command buffer, so we can swap it out even after command buffers have
been built.  When I added image/sampler tables, I sort of went along
with it but then did something different for 3D.
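
This rework keeps that basic shape: every submit still executes a
queue-owned preamble push ahead of the real work.  Condensed from the
new nvk_queue_submit() below:

   result = nvk_queue_state_update(device, &queue->state);
   if (result != VK_SUCCESS)
      return result;

   ...

   /* Each command buffer's push takes references to the state BOs */
   nvk_queue_state_ref(cmd->push, &queue->state);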

Thanks to buffer reference counting, we can employ a fairly simple
locking scheme.  Each of the globals (images, samplers, and TLS) has a
lock which gets taken whenever the table is modified.  When we go to do
a submit, we take each lock in turn, grab a queue-local reference to
the BO, and snapshot its size parameter under the lock.  If anything
has changed, we update the queue-local preamble pushbuf.  Because the
queue also holds a reference, we can safely proceed to submit command
buffers which reference those global BOs without holding the individual
global locks.  If SLM or one of the descriptor tables resizes while
we're mid-submit, our reference to the BO will remain valid until
either the destruction of the queue or the next submit.
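
Condensed from nvk_queue_state_update() below, the per-table check at
submit time looks like this (the same pattern repeats for samplers and
SLM):

   bo = nvk_descriptor_table_get_bo_ref(&dev->images, &alloc_count);
   if (qs->images.bo != bo || qs->images.alloc_count != alloc_count) {
      /* Table changed; drop the queue's reference to the old BO */
      if (qs->images.bo)
         nouveau_ws_bo_destroy(qs->images.bo);
      qs->images.bo = bo;
      qs->images.alloc_count = alloc_count;
      dirty = true;
   } else {
      /* No change; drop the extra reference we just took */
      if (bo)
         nouveau_ws_bo_destroy(bo);
   }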

The one small bit of threaded cleverness (one always has to be careful
with that) is that the SLM size can be checked without taking the lock
because it only ever increases.
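
Concretely, in nvk_slm_area_ensure() below:

   /* Unlocked read is safe because bytes_per_mp only ever increases */
   if (likely(bytes_per_mp <= area->bytes_per_mp))
      return VK_SUCCESS;

   /* Allocate the larger BO outside the lock, then re-check under it */
   struct nouveau_ws_bo *unref_bo;
   simple_mtx_lock(&area->mutex);
   if (bytes_per_mp <= area->bytes_per_mp) {
      unref_bo = bo;        /* We lost the race; throw away our BO */
   } else {
      unref_bo = area->bo;  /* We won; swap in the new BO */
      area->bo = bo;
      area->bytes_per_warp = bytes_per_warp;
      area->bytes_per_mp = bytes_per_mp;
   }
   simple_mtx_unlock(&area->mutex);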

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24326>
Faith Ekstrand, 2023-01-30 20:11:56 -06:00 (committed by Marge Bot)
commit 6935fe5c68, parent e8183da71f
9 changed files with 349 additions and 158 deletions


@@ -381,6 +381,7 @@ nvk_CmdBindPipeline(VkCommandBuffer commandBuffer,
 {
    VK_FROM_HANDLE(nvk_cmd_buffer, cmd, commandBuffer);
    VK_FROM_HANDLE(nvk_pipeline, pipeline, _pipeline);
+   struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
 
    for (unsigned s = 0; s < ARRAY_SIZE(pipeline->shaders); s++) {
       if (!pipeline->shaders[s].bo)
@@ -388,6 +389,9 @@ nvk_CmdBindPipeline(VkCommandBuffer commandBuffer,
       nouveau_ws_push_ref(cmd->push, pipeline->shaders[s].bo,
                           NOUVEAU_WS_BO_RD);
+
+      if (pipeline->shaders[s].slm_size)
+         nvk_device_ensure_slm(dev, pipeline->shaders[s].slm_size);
    }
 
    switch (pipelineBindPoint) {


@@ -24,33 +24,10 @@
 #define NVC3C0_QMDV02_02_VAL_SET(p,a...) NVVAL_MW_SET((p), NVC3C0, QMDV02_02, ##a)
 #define NVC3C0_QMDV02_02_DEF_SET(p,a...) NVDEF_MW_SET((p), NVC3C0, QMDV02_02, ##a)
 
-static uint64_t
-calc_tls_size(struct nvk_device *device,
-              uint32_t lpos, uint32_t lneg, uint32_t cstack)
-{
-   uint64_t size = (lpos + lneg) * 32 + cstack;
-
-   assert (size < (1 << 20));
-   size *= 64; /* max warps */
-   size = align(size, 0x8000);
-   size *= device->pdev->dev->mp_count;
-   size = align(size, 1 << 17);
-
-   return size;
-}
-
 void
 nvk_cmd_buffer_begin_compute(struct nvk_cmd_buffer *cmd,
                              const VkCommandBufferBeginInfo *pBeginInfo)
-{
-   struct nvk_device *dev = (struct nvk_device *)cmd->vk.base.device;
-
-   if (dev->ctx->compute.cls < 0xa0c0)
-      return;
-
-   cmd->tls_space_needed = calc_tls_size(dev, 128 * 16, 0, 0x200);
-}
+{ }
 
 static void
 gv100_compute_setup_launch_desc(uint32_t *qmd,


@@ -171,7 +171,6 @@ void
 nvk_cmd_buffer_begin_graphics(struct nvk_cmd_buffer *cmd,
                               const VkCommandBufferBeginInfo *pBeginInfo)
 {
-   struct nvk_device *dev = nvk_cmd_buffer_device(cmd);
    struct nouveau_ws_push *p = cmd->push;
 
    P_MTHD(p, NV9097, SET_OBJECT);
@@ -215,25 +214,6 @@ nvk_cmd_buffer_begin_graphics(struct nvk_cmd_buffer *cmd,
    /* TODO: Vertex runout */
 
    /* TODO: temp */
    P_IMMD(p, NV9097, SET_SHADER_LOCAL_MEMORY_WINDOW, 0xff000000); /* TODO */
 
-   nvk_push_descriptor_table_ref(p, &dev->samplers);
-   uint64_t tsp_addr = nvk_descriptor_table_base_address(&dev->samplers);
-   P_MTHD(p, NV9097, SET_TEX_SAMPLER_POOL_A);
-   P_NV9097_SET_TEX_SAMPLER_POOL_A(p, tsp_addr >> 32);
-   P_NV9097_SET_TEX_SAMPLER_POOL_B(p, tsp_addr);
-   P_NV9097_SET_TEX_SAMPLER_POOL_C(p, dev->samplers.alloc - 1);
-
-   nvk_push_descriptor_table_ref(p, &dev->images);
-   uint64_t thp_addr = nvk_descriptor_table_base_address(&dev->images);
-   P_MTHD(p, NV9097, SET_TEX_HEADER_POOL_A);
-   P_NV9097_SET_TEX_HEADER_POOL_A(p, thp_addr >> 32);
-   P_NV9097_SET_TEX_HEADER_POOL_B(p, thp_addr & 0xffffffff);
-   P_NV9097_SET_TEX_HEADER_POOL_C(p, dev->images.alloc - 1);
-
-   /* TODO: TIC */
-   /* TODO: TSC */
-
    P_IMMD(p, NV9097, SET_WINDOW_ORIGIN, {
       .mode = MODE_UPPER_LEFT,
       .flip_y = FLIP_Y_FALSE,


@@ -40,8 +40,7 @@ gv100_compute_setup_launch_desc_template(uint32_t *qmd,
    NVC3C0_QMDV02_02_VAL_SET(qmd, SHARED_MEMORY_SIZE,
                             align(shader->cp.smem_size, 0x100));
    NVC3C0_QMDV02_02_VAL_SET(qmd, SHADER_LOCAL_MEMORY_LOW_SIZE,
-                            (shader->hdr[1] & 0xfffff0) +
-                            align(shader->cp.lmem_size, 0x10));
+                            align(shader->slm_size, 0x10));
    NVC3C0_QMDV02_02_VAL_SET(qmd, SHADER_LOCAL_MEMORY_HIGH_SIZE, 0);
    NVC3C0_QMDV02_02_VAL_SET(qmd, MIN_SM_CONFIG_SHARED_MEM_SIZE,
                             gv100_sm_config_smem_size(8 * 1024));


@@ -41,18 +41,18 @@ void nvk_descriptor_table_free(struct nvk_device *device,
                                struct nvk_descriptor_table *table,
                                uint32_t index);
 
-static inline void
-nvk_push_descriptor_table_ref(struct nouveau_ws_push *push,
-                              const struct nvk_descriptor_table *table)
+static inline struct nouveau_ws_bo *
+nvk_descriptor_table_get_bo_ref(struct nvk_descriptor_table *table,
+                                uint32_t *alloc_count_out)
 {
-   if (table->bo)
-      nouveau_ws_push_ref(push, table->bo, NOUVEAU_WS_BO_RD);
-}
+   simple_mtx_lock(&table->mutex);
+   struct nouveau_ws_bo *bo = table->bo;
+   if (bo)
+      nouveau_ws_bo_ref(bo);
+   *alloc_count_out = table->alloc;
+   simple_mtx_unlock(&table->mutex);
 
-static inline uint64_t
-nvk_descriptor_table_base_address(const struct nvk_descriptor_table *table)
-{
-   return table->bo->offset;
+   return bo;
 }
 
 #endif


@@ -11,109 +11,309 @@
 #include "vulkan/wsi/wsi_common.h"
 
 #include "nvk_cl9097.h"
 #include "nvk_cl90b5.h"
 #include "nvk_cla0c0.h"
+#include "cla1c0.h"
 #include "nvk_clc3c0.h"
 
-static VkResult
-nvk_update_preamble_push(struct nvk_queue_state *qs, struct nvk_device *dev,
-                         const struct nvk_queue_alloc_info *needs)
+static void
+nvk_slm_area_init(struct nvk_slm_area *area)
 {
-   struct nouveau_ws_bo *tls_bo = qs->tls_bo;
-   VkResult result;
-
-   if (needs->tls_size > qs->alloc_info.tls_size) {
-      tls_bo = nouveau_ws_bo_new(dev->pdev->dev,
-                                 needs->tls_size, (1 << 17), NOUVEAU_WS_BO_LOCAL);
-      if (!tls_bo) {
-         result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
-         goto fail;
-      }
-   }
+   memset(area, 0, sizeof(*area));
+   simple_mtx_init(&area->mutex, mtx_plain);
 }
 
-   if (tls_bo != qs->tls_bo) {
-      if (qs->tls_bo)
-         nouveau_ws_bo_destroy(qs->tls_bo);
-      qs->tls_bo = tls_bo;
-   }
+static void
+nvk_slm_area_finish(struct nvk_slm_area *area)
+{
+   simple_mtx_destroy(&area->mutex);
+   if (area->bo)
+      nouveau_ws_bo_destroy(area->bo);
+}
 
-   struct nouveau_ws_push *push = nouveau_ws_push_new(dev->pdev->dev, 256);
+static struct nouveau_ws_bo *
+nvk_slm_area_get_bo_ref(struct nvk_slm_area *area,
+                        uint32_t *bytes_per_warp_out,
+                        uint32_t *bytes_per_mp_out)
+{
+   simple_mtx_lock(&area->mutex);
+   struct nouveau_ws_bo *bo = area->bo;
+   if (bo)
+      nouveau_ws_bo_ref(bo);
+   *bytes_per_warp_out = area->bytes_per_warp;
+   *bytes_per_mp_out = area->bytes_per_mp;
+   simple_mtx_unlock(&area->mutex);
 
-   nouveau_ws_push_ref(push, qs->tls_bo, NOUVEAU_WS_BO_RDWR);
-   P_MTHD(push, NVA0C0, SET_SHADER_LOCAL_MEMORY_A);
-   P_NVA0C0_SET_SHADER_LOCAL_MEMORY_A(push, qs->tls_bo->offset >> 32);
-   P_NVA0C0_SET_SHADER_LOCAL_MEMORY_B(push, qs->tls_bo->offset & 0xffffffff);
-
-   nvk_push_descriptor_table_ref(push, &dev->samplers);
-   uint64_t tsp_addr = nvk_descriptor_table_base_address(&dev->samplers);
-   P_MTHD(push, NVA0C0, SET_TEX_SAMPLER_POOL_A);
-   P_NVA0C0_SET_TEX_SAMPLER_POOL_A(push, tsp_addr >> 32);
-   P_NVA0C0_SET_TEX_SAMPLER_POOL_B(push, tsp_addr & 0xffffffff);
-   P_NVA0C0_SET_TEX_SAMPLER_POOL_C(push, dev->samplers.alloc - 1);
-
-   nvk_push_descriptor_table_ref(push, &dev->images);
-   uint64_t thp_addr = nvk_descriptor_table_base_address(&dev->images);
-   P_MTHD(push, NVA0C0, SET_TEX_HEADER_POOL_A);
-   P_NVA0C0_SET_TEX_HEADER_POOL_A(push, thp_addr >> 32);
-   P_NVA0C0_SET_TEX_HEADER_POOL_B(push, thp_addr & 0xffffffff);
-   P_NVA0C0_SET_TEX_HEADER_POOL_C(push, dev->images.alloc - 1);
-
-   uint64_t temp_size = qs->tls_bo->size / dev->pdev->dev->mp_count;
-   P_MTHD(push, NVA0C0, SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A);
-   P_NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A(push, temp_size >> 32);
-   P_NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_B(push, temp_size & ~0x7fff);
-   P_NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_C(push, 0xff);
-
-   if (dev->ctx->compute.cls < 0xc3c0) {
-      P_MTHD(push, NVA0C0, SET_SHADER_LOCAL_MEMORY_THROTTLED_A);
-      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_THROTTLED_A(push, temp_size >> 32);
-      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_THROTTLED_B(push, temp_size & ~0x7fff);
-      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_THROTTLED_C(push, 0xff);
-
-      P_MTHD(push, NVA0C0, SET_SHADER_LOCAL_MEMORY_WINDOW);
-      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_WINDOW(push, 0xff << 24);
-
-      P_MTHD(push, NVA0C0, SET_SHADER_SHARED_MEMORY_WINDOW);
-      P_NVA0C0_SET_SHADER_SHARED_MEMORY_WINDOW(push, 0xfe << 24);
-      // TODO CODE_ADDRESS_HIGH
-   } else {
-      uint64_t temp = 0xfeULL << 24;
-      P_MTHD(push, NVC3C0, SET_SHADER_SHARED_MEMORY_WINDOW_A);
-      P_NVC3C0_SET_SHADER_SHARED_MEMORY_WINDOW_A(push, temp >> 32);
-      P_NVC3C0_SET_SHADER_SHARED_MEMORY_WINDOW_B(push, temp & 0xffffffff);
-
-      temp = 0xffULL << 24;
-      P_MTHD(push, NVC3C0, SET_SHADER_LOCAL_MEMORY_WINDOW_A);
-      P_NVC3C0_SET_SHADER_LOCAL_MEMORY_WINDOW_A(push, temp >> 32);
-      P_NVC3C0_SET_SHADER_LOCAL_MEMORY_WINDOW_B(push, temp & 0xffffffff);
-   }
-
-   P_MTHD(push, NVA0C0, SET_SPA_VERSION);
-   P_NVA0C0_SET_SPA_VERSION(push, { .major = dev->ctx->compute.cls >= 0xa1c0 ? 0x4 : 0x3 });
-
-   if (qs->push)
-      nouveau_ws_push_destroy(qs->push);
-   qs->push = push;
-
-   return 0;
-
-fail:
-   return vk_error(qs, result);
+   return bo;
 }
 
 static VkResult
-nvk_update_preambles(struct nvk_queue_state *qs, struct nvk_device *device,
-                     struct vk_command_buffer *const *cmd_buffers, uint32_t cmd_buffer_count)
+nvk_slm_area_ensure(struct nvk_device *dev,
+                    struct nvk_slm_area *area,
+                    uint32_t bytes_per_thread)
 {
-   struct nvk_queue_alloc_info needs = { 0 };
-
-   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
-      struct nvk_cmd_buffer *cmd = (struct nvk_cmd_buffer *)cmd_buffers[i];
-      needs.tls_size = MAX2(needs.tls_size, cmd->tls_space_needed);
-   }
+   assert(bytes_per_thread < (1 << 24));
+
+   /* TODO: Volta+ doesn't use CRS */
+   const uint32_t crs_size = 0;
+   uint64_t bytes_per_warp = bytes_per_thread * 32 + crs_size;
+
+   /* The hardware seems to require this alignment for
+    * NV9097_SET_SHADER_LOCAL_MEMORY_E_DEFAULT_SIZE_PER_WARP
+    */
+   bytes_per_warp = ALIGN(bytes_per_warp, 0x200);
+
+   uint64_t bytes_per_mp = bytes_per_warp * 64; /* max warps */
+
+   /* The hardware seems to require this alignment for
+    * NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A_SIZE_LOWER.
+    *
+    * Fortunately, this is just the alignment for bytes_per_warp multiplied
+    * by the number of warps, 64.  It might matter for real on a GPU with 48
+    * warps but we don't support any of those yet.
+    */
+   assert(bytes_per_mp == ALIGN(bytes_per_mp, 0x8000));
+
+   /* nvk_slm_area::bytes_per_mp only ever increases so we can check this
+    * outside the lock and exit early in the common case.  We only need to
+    * take the lock if we're actually going to resize.
+    *
+    * Also, we only care about bytes_per_mp and not bytes_per_warp because
+    * they are integer multiples of each other.
+    */
+   if (likely(bytes_per_mp <= area->bytes_per_mp))
+      return VK_SUCCESS;
+
+   uint64_t size = bytes_per_mp * dev->pdev->dev->mp_count;
+
+   struct nouveau_ws_bo *bo =
+      nouveau_ws_bo_new(dev->pdev->dev, size, 0, NOUVEAU_WS_BO_LOCAL);
+   if (bo == NULL)
+      return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+
+   struct nouveau_ws_bo *unref_bo;
+   simple_mtx_lock(&area->mutex);
+   if (bytes_per_mp <= area->bytes_per_mp) {
+      /* We lost the race, throw away our BO */
+      assert(area->bytes_per_warp == bytes_per_warp);
+      unref_bo = bo;
+   } else {
+      unref_bo = area->bo;
+      area->bo = bo;
+      area->bytes_per_warp = bytes_per_warp;
+      area->bytes_per_mp = bytes_per_mp;
+   }
+   simple_mtx_unlock(&area->mutex);
+
+   if (unref_bo)
+      nouveau_ws_bo_destroy(unref_bo);
+
+   return VK_SUCCESS;
+}
+
+static void
+nvk_queue_state_init(struct nvk_queue_state *qs)
+{
+   memset(qs, 0, sizeof(*qs));
+}
+
+static void
+nvk_queue_state_finish(struct nvk_device *dev,
+                       struct nvk_queue_state *qs)
+{
+   if (qs->images.bo)
+      nouveau_ws_bo_destroy(qs->images.bo);
+   if (qs->samplers.bo)
+      nouveau_ws_bo_destroy(qs->samplers.bo);
+   if (qs->slm.bo)
+      nouveau_ws_bo_destroy(qs->slm.bo);
+   if (qs->push)
+      nouveau_ws_push_destroy(qs->push);
+}
+
+static void
+nvk_queue_state_ref(struct nouveau_ws_push *push,
+                    struct nvk_queue_state *qs)
+{
+   if (qs->images.bo)
+      nouveau_ws_push_ref(push, qs->images.bo, NOUVEAU_WS_BO_RD);
+   if (qs->samplers.bo)
+      nouveau_ws_push_ref(push, qs->samplers.bo, NOUVEAU_WS_BO_RD);
+   if (qs->slm.bo)
+      nouveau_ws_push_ref(push, qs->slm.bo, NOUVEAU_WS_BO_RDWR);
+}
+
+static VkResult
+nvk_queue_state_update(struct nvk_device *dev,
+                       struct nvk_queue_state *qs)
+{
+   struct nouveau_ws_bo *bo;
+   uint32_t alloc_count, bytes_per_warp, bytes_per_mp;
+   bool dirty = false;
+
+   bo = nvk_descriptor_table_get_bo_ref(&dev->images, &alloc_count);
+   if (qs->images.bo != bo || qs->images.alloc_count != alloc_count) {
+      if (qs->images.bo)
+         nouveau_ws_bo_destroy(qs->images.bo);
+      qs->images.bo = bo;
+      qs->images.alloc_count = alloc_count;
+      dirty = true;
+   } else {
+      /* No change */
+      if (bo)
+         nouveau_ws_bo_destroy(bo);
+   }
+
-   if (needs.tls_size == qs->alloc_info.tls_size)
+   bo = nvk_descriptor_table_get_bo_ref(&dev->samplers, &alloc_count);
+   if (qs->samplers.bo != bo || qs->samplers.alloc_count != alloc_count) {
+      if (qs->samplers.bo)
+         nouveau_ws_bo_destroy(qs->samplers.bo);
+      qs->samplers.bo = bo;
+      qs->samplers.alloc_count = alloc_count;
+      dirty = true;
+   } else {
+      /* No change */
+      if (bo)
+         nouveau_ws_bo_destroy(bo);
+   }
+
+   bo = nvk_slm_area_get_bo_ref(&dev->slm, &bytes_per_warp, &bytes_per_mp);
+   if (qs->slm.bo != bo || qs->slm.bytes_per_warp != bytes_per_warp ||
+       qs->slm.bytes_per_mp != bytes_per_mp) {
+      if (qs->slm.bo)
+         nouveau_ws_bo_destroy(qs->slm.bo);
+      qs->slm.bo = bo;
+      qs->slm.bytes_per_warp = bytes_per_warp;
+      qs->slm.bytes_per_mp = bytes_per_mp;
+      dirty = true;
+   } else {
+      /* No change */
+      if (bo)
+         nouveau_ws_bo_destroy(bo);
+   }
+
+   /* TODO: We're currently depending on kernel reference counting to protect
+    * us here. If we ever stop reference counting in the kernel, we will
+    * either need to delay destruction or hold on to our extra BO references
+    * and insert a GPU stall here if anything has changed before dropping our
+    * old references.
+    */
+
+   if (!dirty)
       return VK_SUCCESS;
-
-   return nvk_update_preamble_push(qs, device, &needs);
+
+   struct nouveau_ws_push *p = nouveau_ws_push_new(dev->pdev->dev, 256);
+   if (p == NULL)
+      return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+
+   if (qs->images.bo) {
+      nouveau_ws_push_ref(p, qs->images.bo, NOUVEAU_WS_BO_RD);
+
+      /* Compute */
+      P_MTHD(p, NVA0C0, SET_TEX_HEADER_POOL_A);
+      P_NVA0C0_SET_TEX_HEADER_POOL_A(p, qs->images.bo->offset >> 32);
+      P_NVA0C0_SET_TEX_HEADER_POOL_B(p, qs->images.bo->offset);
+      P_NVA0C0_SET_TEX_HEADER_POOL_C(p, qs->images.alloc_count - 1);
+
+      /* 3D */
+      P_MTHD(p, NV9097, SET_TEX_HEADER_POOL_A);
+      P_NV9097_SET_TEX_HEADER_POOL_A(p, qs->images.bo->offset >> 32);
+      P_NV9097_SET_TEX_HEADER_POOL_B(p, qs->images.bo->offset);
+      P_NV9097_SET_TEX_HEADER_POOL_C(p, qs->images.alloc_count - 1);
+   }
+
+   if (qs->samplers.bo) {
+      nouveau_ws_push_ref(p, qs->samplers.bo, NOUVEAU_WS_BO_RD);
+
+      /* Compute */
+      P_MTHD(p, NVA0C0, SET_TEX_SAMPLER_POOL_A);
+      P_NVA0C0_SET_TEX_SAMPLER_POOL_A(p, qs->samplers.bo->offset >> 32);
+      P_NVA0C0_SET_TEX_SAMPLER_POOL_B(p, qs->samplers.bo->offset);
+      P_NVA0C0_SET_TEX_SAMPLER_POOL_C(p, qs->samplers.alloc_count - 1);
+
+      /* 3D */
+      P_MTHD(p, NV9097, SET_TEX_SAMPLER_POOL_A);
+      P_NV9097_SET_TEX_SAMPLER_POOL_A(p, qs->samplers.bo->offset >> 32);
+      P_NV9097_SET_TEX_SAMPLER_POOL_B(p, qs->samplers.bo->offset);
+      P_NV9097_SET_TEX_SAMPLER_POOL_C(p, qs->samplers.alloc_count - 1);
+   }
+
+   if (qs->slm.bo) {
+      nouveau_ws_push_ref(p, qs->slm.bo, NOUVEAU_WS_BO_RDWR);
+
+      const uint64_t slm_addr = qs->slm.bo->offset;
+      const uint64_t slm_size = qs->slm.bo->size;
+      const uint64_t slm_per_warp = qs->slm.bytes_per_warp;
+      const uint64_t slm_per_mp = qs->slm.bytes_per_mp;
+      assert(!(slm_per_mp & 0x7fff));
+
+      /* Compute */
+      P_MTHD(p, NVA0C0, SET_SHADER_LOCAL_MEMORY_A);
+      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_A(p, slm_addr >> 32);
+      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_B(p, slm_addr);
+
+      P_MTHD(p, NVA0C0, SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A);
+      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A(p, slm_per_mp >> 32);
+      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_B(p, slm_per_mp);
+      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_C(p, 0xff);
+
+      if (dev->ctx->compute.cls < VOLTA_COMPUTE_A) {
+         P_MTHD(p, NVA0C0, SET_SHADER_LOCAL_MEMORY_THROTTLED_A);
+         P_NVA0C0_SET_SHADER_LOCAL_MEMORY_THROTTLED_A(p, slm_per_mp >> 32);
+         P_NVA0C0_SET_SHADER_LOCAL_MEMORY_THROTTLED_B(p, slm_per_mp);
+         P_NVA0C0_SET_SHADER_LOCAL_MEMORY_THROTTLED_C(p, 0xff);
+      }
+
+      /* 3D */
+      P_MTHD(p, NV9097, SET_SHADER_LOCAL_MEMORY_A);
+      P_NV9097_SET_SHADER_LOCAL_MEMORY_A(p, slm_addr >> 32);
+      P_NV9097_SET_SHADER_LOCAL_MEMORY_B(p, slm_addr);
+      P_NV9097_SET_SHADER_LOCAL_MEMORY_C(p, slm_size >> 32);
+      P_NV9097_SET_SHADER_LOCAL_MEMORY_D(p, slm_size);
+      P_NV9097_SET_SHADER_LOCAL_MEMORY_E(p, slm_per_warp);
+   }
+
+   /* We set memory windows unconditionally. Otherwise, the memory window
+    * might be in a random place and cause us to fault off into nowhere.
+    */
+   if (dev->ctx->compute.cls >= VOLTA_COMPUTE_A) {
+      uint64_t temp = 0xfeULL << 24;
+      P_MTHD(p, NVC3C0, SET_SHADER_SHARED_MEMORY_WINDOW_A);
+      P_NVC3C0_SET_SHADER_SHARED_MEMORY_WINDOW_A(p, temp >> 32);
+      P_NVC3C0_SET_SHADER_SHARED_MEMORY_WINDOW_B(p, temp & 0xffffffff);
+
+      temp = 0xffULL << 24;
+      P_MTHD(p, NVC3C0, SET_SHADER_LOCAL_MEMORY_WINDOW_A);
+      P_NVC3C0_SET_SHADER_LOCAL_MEMORY_WINDOW_A(p, temp >> 32);
+      P_NVC3C0_SET_SHADER_LOCAL_MEMORY_WINDOW_B(p, temp & 0xffffffff);
+   } else {
+      P_MTHD(p, NVA0C0, SET_SHADER_LOCAL_MEMORY_WINDOW);
+      P_NVA0C0_SET_SHADER_LOCAL_MEMORY_WINDOW(p, 0xff << 24);
+
+      P_MTHD(p, NVA0C0, SET_SHADER_SHARED_MEMORY_WINDOW);
+      P_NVA0C0_SET_SHADER_SHARED_MEMORY_WINDOW(p, 0xfe << 24);
+      // TODO CODE_ADDRESS_HIGH
+   }
+
+   /* From nvc0_screen.c:
+    *
+    *    "Reduce likelihood of collision with real buffers by placing the
+    *    hole at the top of the 4G area. This will have to be dealt with
+    *    for real eventually by blocking off that area from the VM."
+    *
+    * Really?!? TODO: Fix this for realz. Annoyingly, we only have a
+    * 32-bit pointer for this in 3D rather than a full 48 like we have for
+    * compute.
+    */
+   P_IMMD(p, NV9097, SET_SHADER_LOCAL_MEMORY_WINDOW, 0xff << 24);
+
+   if (qs->push)
+      nouveau_ws_push_destroy(qs->push);
+   qs->push = p;
+
+   return VK_SUCCESS;
+}
 
 static VkResult
@@ -127,18 +327,17 @@ nvk_queue_init(struct nvk_device *dev, struct nvk_queue *queue,
    if (result != VK_SUCCESS)
       return result;
 
+   nvk_queue_state_init(&queue->state);
+
    return VK_SUCCESS;
 }
 
 static void
 nvk_queue_finish(struct nvk_device *dev, struct nvk_queue *queue)
 {
-   if (queue->state.push)
-      nouveau_ws_push_destroy(queue->state.push);
+   nvk_queue_state_finish(dev, &queue->state);
+
    if (queue->empty_push)
       nouveau_ws_push_destroy(queue->empty_push);
-   if (queue->state.tls_bo)
-      nouveau_ws_bo_destroy(queue->state.tls_bo);
 
    vk_queue_finish(&queue->vk);
 }
@@ -155,8 +354,8 @@ nvk_queue_submit(struct vk_queue *vkqueue, struct vk_queue_submit *submission)
       P_MTHD(queue->empty_push, NV90B5, NOP);
       P_NV90B5_NOP(queue->empty_push, 0);
    }
 
-   result = nvk_update_preambles(&queue->state, device, submission->command_buffers,
-                                 submission->command_buffer_count);
+   result = nvk_queue_state_update(device, &queue->state);
    if (result != VK_SUCCESS)
      return result;
@@ -184,6 +383,8 @@ nvk_queue_submit(struct vk_queue *vkqueue, struct vk_queue_submit *submission)
       nouveau_ws_push_ref(cmd->push, bo_sync->bo, NOUVEAU_WS_BO_RDWR);
    }
 
+   nvk_queue_state_ref(cmd->push, &queue->state);
+
    simple_mtx_lock(&device->memory_objects_lock);
    list_for_each_entry(struct nvk_device_memory, mem,
                        &device->memory_objects, link) {
@@ -267,10 +468,12 @@ nvk_CreateDevice(VkPhysicalDevice physicalDevice,
    if (result != VK_SUCCESS)
       goto fail_images;
 
+   nvk_slm_area_init(&device->slm);
+
    result = nvk_queue_init(device, &device->queue,
                            &pCreateInfo->pQueueCreateInfos[0], 0);
    if (result != VK_SUCCESS)
-      goto fail_samplers;
+      goto fail_slm;
 
    if (pthread_mutex_init(&device->mutex, NULL) != 0) {
       result = vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
@@ -312,7 +515,8 @@ fail_mutex:
    pthread_mutex_destroy(&device->mutex);
 fail_queue:
    nvk_queue_finish(device, &device->queue);
-fail_samplers:
+fail_slm:
+   nvk_slm_area_finish(&device->slm);
    nvk_descriptor_table_finish(device, &device->samplers);
 fail_images:
    nvk_descriptor_table_finish(device, &device->images);
@@ -340,6 +544,7 @@ nvk_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
    pthread_mutex_destroy(&device->mutex);
    nvk_queue_finish(device, &device->queue);
    vk_device_finish(&device->vk);
+   nvk_slm_area_finish(&device->slm);
    nvk_descriptor_table_finish(device, &device->samplers);
    nvk_descriptor_table_finish(device, &device->images);
    assert(list_is_empty(&device->memory_objects));
@@ -347,3 +552,10 @@ nvk_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
    nouveau_ws_context_destroy(device->ctx);
    vk_free(&device->vk.alloc, device);
 }
+
+VkResult
+nvk_device_ensure_slm(struct nvk_device *dev,
+                      uint32_t bytes_per_thread)
+{
+   return nvk_slm_area_ensure(dev, &dev->slm, bytes_per_thread);
+}


@@ -11,13 +11,29 @@
 struct nouveau_ws_context;
 struct nvk_physical_device;
 
-struct nvk_queue_alloc_info {
-   uint64_t tls_size;
+struct nvk_slm_area {
+   simple_mtx_t mutex;
+   struct nouveau_ws_bo *bo;
+   uint32_t bytes_per_warp;
+   uint32_t bytes_per_mp;
 };
 
 struct nvk_queue_state {
-   struct nvk_queue_alloc_info alloc_info;
-   struct nouveau_ws_bo *tls_bo;
+   struct {
+      struct nouveau_ws_bo *bo;
+      uint32_t alloc_count;
+   } images;
+
+   struct {
+      struct nouveau_ws_bo *bo;
+      uint32_t alloc_count;
+   } samplers;
+
+   struct {
+      struct nouveau_ws_bo *bo;
+      uint32_t bytes_per_warp;
+      uint32_t bytes_per_mp;
+   } slm;
 
    struct nouveau_ws_push *push;
 };
@@ -41,6 +57,7 @@ struct nvk_device {
    struct nvk_descriptor_table images;
    struct nvk_descriptor_table samplers;
+   struct nvk_slm_area slm;
 
    struct nvk_queue queue;
@@ -52,6 +69,9 @@ struct nvk_device {
 VK_DEFINE_HANDLE_CASTS(nvk_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)
 
+VkResult nvk_device_ensure_slm(struct nvk_device *dev,
+                               uint32_t bytes_per_thread);
+
 static struct nvk_physical_device *
 nvk_device_physical(struct nvk_device *device)
 {


@@ -652,7 +652,7 @@ nvk_compile_nir(struct nvk_physical_device *device, nir_shader *nir,
       assert(info_out.bin.tlsSpace < (1 << 24));
       shader->hdr[0] |= 1 << 26;
       shader->hdr[1] |= align(info_out.bin.tlsSpace, 0x10); /* l[] size */
-      shader->need_tls = true;
+      shader->slm_size = info_out.bin.tlsSpace;
    }
 
    if (info_out.io.globalAccess)


@@ -22,9 +22,9 @@ struct nvk_shader {
    uint8_t *code_ptr;
    uint32_t code_size;
 
-   bool need_tls;
    uint8_t num_gprs;
    uint8_t num_barriers;
+   uint32_t slm_size;
 
    uint32_t hdr[NVC0_MAX_SHADER_HEADER_SIZE/4];
    uint32_t flags[2];
@@ -53,7 +53,6 @@ struct nvk_shader {
    } fs;
 
    struct {
-      uint32_t lmem_size; /* local memory (TGSI PRIVATE resource) size */
      uint32_t smem_size; /* shared memory (TGSI LOCAL resource) size */
      uint32_t block_size[3];
    } cp;