nvk: properly calculate SLM region by taking per arch limits into account

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24326>
2026-05-06 05:08:08 +02:00 · 2023-07-20 22:50:16 +02:00 · 2023-07-20 22:50:16 +02:00 · 61c0d86831
commit 61c0d86831
parent 4b66a0a70c
3 changed files with 59 additions and 8 deletions
--- a/src/nouveau/vulkan/nvk_device.c
+++ b/src/nouveau/vulkan/nvk_device.c
@ -60,16 +60,13 @@ nvk_slm_area_ensure(struct nvk_device *dev,
    */
   bytes_per_warp = ALIGN(bytes_per_warp, 0x200);

-   uint64_t bytes_per_tpc = bytes_per_warp * 64; /* max warps */
+   uint64_t bytes_per_mp = bytes_per_warp * dev->ws_dev->max_warps_per_mp;
+   uint64_t bytes_per_tpc = bytes_per_mp * dev->ws_dev->mp_per_tpc;

   /* The hardware seems to require this alignment for
    * NVA0C0_SET_SHADER_LOCAL_MEMORY_NON_THROTTLED_A_SIZE_LOWER.
-    *
-    * Fortunately, this is just the alignment for bytes_per_warp multiplied
-    * by the number of warps, 64.  It might matter for real on a GPU with 48
-    * warps but we don't support any of those yet.
    */
-   assert(bytes_per_tpc == ALIGN(bytes_per_tpc, 0x8000));
+   bytes_per_tpc = ALIGN(bytes_per_tpc, 0x8000);

   /* nvk_slm_area::bytes_per_mp only ever increases so we can check this
    * outside the lock and exit early in the common case.  We only need to
--- a/src/nouveau/winsys/nouveau_device.c
+++ b/src/nouveau/winsys/nouveau_device.c
@ -75,6 +75,52 @@ sm_for_chipset(uint16_t chipset)
   return 0x00;
 }

+/* Returns the maximum number of warps per MP for the given SM version.
+ *
+ * The values come from the hard-coded per-SM table below.  An SM version
+ * that is not in the table trips the assert in debug builds.  When asserts
+ * are compiled out (NDEBUG) we fall back to 64, the largest value in the
+ * table, so that anything sized from this result is over- rather than
+ * under-estimated.
+ */
+static uint8_t
+max_warps_per_mp_for_sm(uint8_t sm)
+{
+   switch (sm) {
+   case 10:
+   case 11:
+      return 24;
+   case 12:
+   case 13:
+   case 75:
+      return 32;
+   case 20:
+   case 21:
+   case 86:
+   case 87:
+   case 89:
+      return 48;
+   case 30:
+   case 32:
+   case 35:
+   case 37:
+   case 50:
+   case 52:
+   case 53:
+   case 60:
+   case 61:
+   case 62:
+   case 70:
+   case 72:
+   case 80:
+   case 90:
+      return 64;
+   default:
+      assert(!"unkown SM version");
+      /* Without an explicit return here, control would run off the end of
+       * a non-void function whenever the assert is compiled out, and the
+       * caller would use an indeterminate value.
+       */
+      return 64;
+   }
+}
+
+/* Returns how many MPs each TPC contains for the given chipset id. */
+static uint8_t
+mp_per_tpc_for_chipset(uint16_t chipset)
+{
+   /* Two MPs per TPC is a Volta-and-newer (chipset >= 0x140) trait, with
+    * GP100 (0x130) being the one earlier chip that also has two.  Every
+    * other chipset has a single MP per TPC.
+    */
+   return (chipset >= 0x140 || chipset == 0x130) ? 2 : 1;
+}
+
 static void
 nouveau_ws_device_set_dbg_flags(struct nouveau_ws_device *dev)
 {
@ -256,8 +302,9 @@ nouveau_ws_device_new(drmDevicePtr drm_device)

   if (nouveau_ws_param(fd, NOUVEAU_GETPARAM_GRAPH_UNITS, &value))
      goto out_err;
-   device->gpc_count = value & 0x000000ff;
-   device->tpc_count = value >> 8;
+
+   device->gpc_count = (value >> 0) & 0x000000ff;
+   device->tpc_count = (value >> 8) & 0x0000ffff;

   nouveau_ws_device_set_dbg_flags(device);

@ -272,6 +319,11 @@ nouveau_ws_device_new(drmDevicePtr drm_device)
   device->info.cls_m2mf = tmp_ctx->m2mf.cls;
   device->info.cls_compute = tmp_ctx->compute.cls;

+   // for now we hardcode those values, but in the future Nouveau could provide that information to
+   // us instead.
+   device->max_warps_per_mp = max_warps_per_mp_for_sm(device->info.sm);
+   device->mp_per_tpc = mp_per_tpc_for_chipset(device->info.chipset);
+
   nouveau_ws_context_destroy(tmp_ctx);

   simple_mtx_init(&device->bos_lock, mtx_plain);
--- a/src/nouveau/winsys/nouveau_device.h
+++ b/src/nouveau/winsys/nouveau_device.h
@ -38,6 +38,8 @@ struct nouveau_ws_device {
   uint32_t local_mem_domain;

   uint8_t gpc_count;
+   uint8_t mp_per_tpc;
+   uint8_t max_warps_per_mp;
   uint16_t tpc_count;

   enum nvk_debug debug_flags;