ac/info: add ac_fill_hw_info

Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com> Reviewed-by: Marek Olšák <marek.olsak@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40656>
2026-05-07 17:58:26 +02:00 · 2026-03-26 10:49:34 +01:00 · 2026-03-26 10:49:34 +01:00 · 2f8865035f
commit 2f8865035f
parent 8b28cdc8bd
2 changed files with 186 additions and 181 deletions
--- a/src/amd/common/ac_gpu_info.c
+++ b/src/amd/common/ac_gpu_info.c
@ -1109,6 +1109,189 @@ void ac_fill_feature_info(struct radeon_info *info, const struct drm_amdgpu_info
   }
 }

+void ac_fill_hw_info(struct radeon_info *info, const struct drm_amdgpu_info_device *device_info)
+{
+   /* convert the shader/memory clocks from KHz to MHz */
+   info->max_gpu_freq_mhz = device_info->max_engine_clock / 1000;
+   info->memory_freq_mhz_effective = info->memory_freq_mhz = device_info->max_memory_clock / 1000;
+   info->max_se = device_info->num_shader_engines;
+   info->max_sa_per_se = device_info->num_shader_arrays_per_engine;
+   info->num_cu_per_sh = device_info->num_cu_per_sh;
+   info->enabled_rb_mask = device_info->enabled_rb_pipes_mask;
+   info->enabled_rb_mask |= (uint64_t)device_info->enabled_rb_pipes_mask_hi << 32;
+
+   info->memory_freq_mhz_effective *= ac_memory_ops_per_clock(info->vram_type);
+
+   info->pa_sc_tile_steering_override = device_info->pa_sc_tile_steering_override;
+   info->max_render_backends = device_info->num_rb_pipes;
+   /* The value returned by the kernel driver was wrong. */
+   if (info->family == CHIP_KAVERI)
+      info->max_render_backends = 2;
+
+   info->clock_crystal_freq = device_info->gpu_counter_freq;
+   if (!info->clock_crystal_freq) {
+      fprintf(stderr, "amdgpu: clock crystal frequency is 0, timestamps will be wrong\n");
+      info->clock_crystal_freq = 1;
+   }
+
+   info->tcc_rb_non_coherent = info->gfx_level < GFX12 &&
+                               !util_is_power_of_two_or_zero(info->num_tcc_blocks) &&
+                               info->num_rb != info->num_tcc_blocks;
+   info->cp_sdma_ge_use_system_memory_scope = info->gfx_level == GFX12;
+   info->cp_dma_use_L2 = info->gfx_level >= GFX7 && !info->cp_sdma_ge_use_system_memory_scope;
+
+   info->sqc_inst_cache_size = device_info->sqc_inst_cache_size * 1024;
+   info->sqc_scalar_cache_size = device_info->sqc_data_cache_size * 1024;
+   info->num_sqc_per_wgp = device_info->num_sqc_per_wgp;
+
+   /* LDS is 64KB per CU (4 SIMDs on GFX6-9, which is 16KB per SIMD).
+    *
+    * GFX10+: LDS is 128KB in WGP mode, but a workgroup can only use up to 64KB.
+    * GFX7+:  Workgroups can use up to 64KB.
+    * GFX6:   There is 64KB LDS per CU, but a workgroup can only use up to 32KB.
+    */
+   info->lds_size_per_workgroup = info->gfx_level >= GFX7 ? 64 * 1024 : 32 * 1024;
+
+   /* Get the number of good compute units. */
+   info->num_cu = 0;
+   for (int i = 0; i < info->max_se; i++) {
+      for (int j = 0; j < info->max_sa_per_se; j++) {
+         if (info->gfx_level >= GFX11) {
+            assert(info->max_sa_per_se <= 2);
+            info->cu_mask[i][j] = device_info->cu_bitmap[i % 4][(i / 4) * 2 + j];
+         } else if (info->family == CHIP_MI100) {
+            /* The CU bitmap in amd gpu info structure is
+             * 4x4 size array, and it's usually suitable for Vega
+             * ASICs which has 4*2 SE/SA layout.
+             * But for MI100, SE/SA layout is changed to 8*1.
+             * To mostly reduce the impact, we make it compatible
+             * with current bitmap array as below:
+             *    SE4 --> cu_bitmap[0][1]
+             *    SE5 --> cu_bitmap[1][1]
+             *    SE6 --> cu_bitmap[2][1]
+             *    SE7 --> cu_bitmap[3][1]
+             */
+            assert(info->max_sa_per_se == 1);
+            info->cu_mask[i][0] = device_info->cu_bitmap[i % 4][i / 4];
+         } else {
+            info->cu_mask[i][j] = device_info->cu_bitmap[i][j];
+         }
+         info->num_cu += util_bitcount(info->cu_mask[i][j]);
+      }
+   }
+
+   if (info->gfx_level >= GFX10_3 && info->max_se > 1) {
+      uint32_t enabled_se_mask = 0;
+
+      /* Derive the enabled SE mask from the CU mask. */
+      for (unsigned se = 0; se < info->max_se; se++) {
+         for (unsigned sa = 0; sa < info->max_sa_per_se; sa++) {
+            if (info->cu_mask[se][sa]) {
+               enabled_se_mask |= BITFIELD_BIT(se);
+               break;
+            }
+         }
+      }
+      info->num_se = util_bitcount(enabled_se_mask);
+
+      /* Trim the number of enabled RBs based on the number of enabled SEs because the RB mask
+       * might include disabled SEs.
+       */
+      if (info->gfx_level >= GFX12) {
+         unsigned num_rb_per_se = info->max_render_backends / info->max_se;
+
+         for (unsigned se = 0; se < info->max_se; se++) {
+            if (!(BITFIELD_BIT(se) & enabled_se_mask))
+               info->enabled_rb_mask &= ~(BITFIELD_MASK(num_rb_per_se) << (se * num_rb_per_se));
+         }
+      }
+   } else {
+      /* GFX10 and older always enable all SEs because they don't support SE harvesting. */
+      info->num_se = info->max_se;
+   }
+
+   info->num_rb = util_bitcount64(info->enabled_rb_mask);
+
+   /* On GFX10, only whole WGPs (in units of 2 CUs) can be disabled,
+    * and max - min <= 2.
+    */
+   unsigned cu_group = info->gfx_level >= GFX10 ? 2 : 1;
+   info->max_good_cu_per_sa =
+      DIV_ROUND_UP(info->num_cu, (info->num_se * info->max_sa_per_se * cu_group)) *
+      cu_group;
+   info->min_good_cu_per_sa =
+      (info->num_cu / (info->num_se * info->max_sa_per_se * cu_group)) * cu_group;
+
+   info->gfx_ib_pad_with_type2 = info->gfx_level == GFX6;
+
+   if (info->gfx_level >= GFX11 && info->gfx_level < GFX12) {
+      /* With num_cu = 4 in gfx11 measured power for idle, video playback and observed
+       * power savings, hence enable dcc with retile for gfx11 with num_cu >= 4.
+       */
+       info->use_display_dcc_with_retile_blit = info->num_cu >= 4;
+   } else if (info->gfx_level == GFX10_3) {
+      /* Displayable DCC with retiling is known to increase power consumption on Raphael
+       * and Mendocino, so disable it on the smallest APUs. We need a proof that
+       * displayable DCC doesn't regress bigger chips in the same way.
+       */
+      info->use_display_dcc_with_retile_blit = info->num_cu > 4;
+   } else if (info->gfx_level == GFX9 && !info->has_dedicated_vram) {
+      if (info->max_render_backends == 1) {
+         info->use_display_dcc_unaligned = true;
+      } else {
+         /* there may be power increase for small APUs with less num_cu. */
+         info->use_display_dcc_with_retile_blit = info->num_cu > 4;
+      }
+   }
+
+   if (info->gfx_level >= GFX12) {
+      /* Gfx12 doesn't use pc_lines and pbb_max_alloc_count. */
+   } else if (info->gfx_level >= GFX11) {
+      info->pc_lines = 1024;
+      info->pbb_max_alloc_count = 16; /* minimum is 2, maximum is 256 */
+   } else if (info->gfx_level >= GFX9 && info->has_graphics) {
+      unsigned pc_lines = 0;
+
+      switch (info->family) {
+      case CHIP_VEGA10:
+      case CHIP_VEGA12:
+      case CHIP_VEGA20:
+         pc_lines = 2048;
+         break;
+      case CHIP_RAVEN:
+      case CHIP_RAVEN2:
+      case CHIP_RENOIR:
+      case CHIP_NAVI10:
+      case CHIP_NAVI12:
+      case CHIP_GFX1013:
+      case CHIP_NAVI21:
+      case CHIP_NAVI22:
+      case CHIP_NAVI23:
+         pc_lines = 1024;
+         break;
+      case CHIP_NAVI14:
+      case CHIP_NAVI24:
+         pc_lines = 512;
+         break;
+      case CHIP_VANGOGH:
+      case CHIP_REMBRANDT:
+      case CHIP_RAPHAEL_MENDOCINO:
+         pc_lines = 256;
+         break;
+      default:
+         assert(0);
+      }
+
+      info->pc_lines = pc_lines;
+
+      if (info->gfx_level >= GFX10) {
+         info->pbb_max_alloc_count = pc_lines / 3;
+      } else {
+         info->pbb_max_alloc_count = MIN2(128, pc_lines / (4 * info->max_se));
+      }
+   }
+}
+
 enum ac_query_gpu_info_result
 ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
                  bool require_pci_bus_info)
@ -1117,7 +1300,7 @@ ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
   struct drm_amdgpu_info_device device_info = {0};
   uint32_t vidip_fw_version = 0, vidip_fw_feature = 0;
   uint32_t num_instances = 0;
-   int r, i, j;
+   int r;
   ac_drm_device *dev = dev_p;

   STATIC_ASSERT(AMDGPU_HW_IP_GFX == AMD_IP_GFX);
@ -1271,42 +1454,10 @@ ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,

   ac_fill_memory_info(info, &device_info, &meminfo);

-   /* Set hardware information. */
-   /* convert the shader/memory clocks from KHz to MHz */
-   info->max_gpu_freq_mhz = device_info.max_engine_clock / 1000;
-   info->memory_freq_mhz_effective = info->memory_freq_mhz = device_info.max_memory_clock / 1000;
-   info->max_se = device_info.num_shader_engines;
-   info->max_sa_per_se = device_info.num_shader_arrays_per_engine;
-   info->num_cu_per_sh = device_info.num_cu_per_sh;
-   info->enabled_rb_mask = device_info.enabled_rb_pipes_mask;
-   info->enabled_rb_mask |= (uint64_t)device_info.enabled_rb_pipes_mask_hi << 32;
-
-   info->memory_freq_mhz_effective *= ac_memory_ops_per_clock(info->vram_type);
-
   info->has_timeline_syncobj = ac_drm_device_get_sync_provider(dev)->timeline_wait != NULL;
   ac_drm_query_has_vm_always_valid(dev, info);

-   info->pa_sc_tile_steering_override = device_info.pa_sc_tile_steering_override;
-   info->max_render_backends = device_info.num_rb_pipes;
-   /* The value returned by the kernel driver was wrong. */
-   if (info->family == CHIP_KAVERI)
-      info->max_render_backends = 2;
-
-   info->clock_crystal_freq = device_info.gpu_counter_freq;
-   if (!info->clock_crystal_freq) {
-      fprintf(stderr, "amdgpu: clock crystal frequency is 0, timestamps will be wrong\n");
-      info->clock_crystal_freq = 1;
-   }
-
-   info->tcc_rb_non_coherent = info->gfx_level < GFX12 &&
-                               !util_is_power_of_two_or_zero(info->num_tcc_blocks) &&
-                               info->num_rb != info->num_tcc_blocks;
-   info->cp_sdma_ge_use_system_memory_scope = info->gfx_level == GFX12;
-   info->cp_dma_use_L2 = info->gfx_level >= GFX7 && !info->cp_sdma_ge_use_system_memory_scope;
-
-   info->sqc_inst_cache_size = device_info.sqc_inst_cache_size * 1024;
-   info->sqc_scalar_cache_size = device_info.sqc_data_cache_size * 1024;
-   info->num_sqc_per_wgp = device_info.num_sqc_per_wgp;
+   ac_fill_hw_info(info, &device_info);

   ac_fill_tiling_info(info, &amdinfo);

@ -1314,155 +1465,8 @@ ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,

   info->kernel_has_modifiers = has_modifiers(fd) || (info->is_virtio && fd < 0);

-   /* LDS is 64KB per CU (4 SIMDs on GFX6-9, which is 16KB per SIMD).
-    *
-    * GFX10+: LDS is 128KB in WGP mode, but a workgroup can only use up to 64KB.
-    * GFX7+:  Workgroups can use up to 64KB.
-    * GFX6:   There is 64KB LDS per CU, but a workgroup can only use up to 32KB.
-    */
-   info->lds_size_per_workgroup = info->gfx_level >= GFX7 ? 64 * 1024 : 32 * 1024;
-
-   /* Get the number of good compute units. */
-   info->num_cu = 0;
-   for (i = 0; i < info->max_se; i++) {
-      for (j = 0; j < info->max_sa_per_se; j++) {
-         if (info->gfx_level >= GFX11) {
-            assert(info->max_sa_per_se <= 2);
-            info->cu_mask[i][j] = device_info.cu_bitmap[i % 4][(i / 4) * 2 + j];
-         } else if (info->family == CHIP_MI100) {
-            /* The CU bitmap in amd gpu info structure is
-             * 4x4 size array, and it's usually suitable for Vega
-             * ASICs which has 4*2 SE/SA layout.
-             * But for MI100, SE/SA layout is changed to 8*1.
-             * To mostly reduce the impact, we make it compatible
-             * with current bitmap array as below:
-             *    SE4 --> cu_bitmap[0][1]
-             *    SE5 --> cu_bitmap[1][1]
-             *    SE6 --> cu_bitmap[2][1]
-             *    SE7 --> cu_bitmap[3][1]
-             */
-            assert(info->max_sa_per_se == 1);
-            info->cu_mask[i][0] = device_info.cu_bitmap[i % 4][i / 4];
-         } else {
-            info->cu_mask[i][j] = device_info.cu_bitmap[i][j];
-         }
-         info->num_cu += util_bitcount(info->cu_mask[i][j]);
-      }
-   }
-
   ac_fill_bug_info(info);

-   if (info->gfx_level >= GFX10_3 && info->max_se > 1) {
-      uint32_t enabled_se_mask = 0;
-
-      /* Derive the enabled SE mask from the CU mask. */
-      for (unsigned se = 0; se < info->max_se; se++) {
-         for (unsigned sa = 0; sa < info->max_sa_per_se; sa++) {
-            if (info->cu_mask[se][sa]) {
-               enabled_se_mask |= BITFIELD_BIT(se);
-               break;
-            }
-         }
-      }
-      info->num_se = util_bitcount(enabled_se_mask);
-
-      /* Trim the number of enabled RBs based on the number of enabled SEs because the RB mask
-       * might include disabled SEs.
-       */
-      if (info->gfx_level >= GFX12) {
-         unsigned num_rb_per_se = info->max_render_backends / info->max_se;
-
-         for (unsigned se = 0; se < info->max_se; se++) {
-            if (!(BITFIELD_BIT(se) & enabled_se_mask))
-               info->enabled_rb_mask &= ~(BITFIELD_MASK(num_rb_per_se) << (se * num_rb_per_se));
-         }
-      }
-   } else {
-      /* GFX10 and older always enable all SEs because they don't support SE harvesting. */
-      info->num_se = info->max_se;
-   }
-
-   info->num_rb = util_bitcount64(info->enabled_rb_mask);
-
-   /* On GFX10, only whole WGPs (in units of 2 CUs) can be disabled,
-    * and max - min <= 2.
-    */
-   unsigned cu_group = info->gfx_level >= GFX10 ? 2 : 1;
-   info->max_good_cu_per_sa =
-      DIV_ROUND_UP(info->num_cu, (info->num_se * info->max_sa_per_se * cu_group)) *
-      cu_group;
-   info->min_good_cu_per_sa =
-      (info->num_cu / (info->num_se * info->max_sa_per_se * cu_group)) * cu_group;
-
-   info->gfx_ib_pad_with_type2 = info->gfx_level == GFX6;
-
-   if (info->gfx_level >= GFX11 && info->gfx_level < GFX12) {
-      /* With num_cu = 4 in gfx11 measured power for idle, video playback and observed
-       * power savings, hence enable dcc with retile for gfx11 with num_cu >= 4.
-       */
-       info->use_display_dcc_with_retile_blit = info->num_cu >= 4;
-   } else if (info->gfx_level == GFX10_3) {
-      /* Displayable DCC with retiling is known to increase power consumption on Raphael
-       * and Mendocino, so disable it on the smallest APUs. We need a proof that
-       * displayable DCC doesn't regress bigger chips in the same way.
-       */
-      info->use_display_dcc_with_retile_blit = info->num_cu > 4;
-   } else if (info->gfx_level == GFX9 && !info->has_dedicated_vram) {
-      if (info->max_render_backends == 1) {
-         info->use_display_dcc_unaligned = true;
-      } else {
-         /* there may be power increase for small APUs with less num_cu. */
-         info->use_display_dcc_with_retile_blit = info->num_cu > 4;
-      }
-   }
-
-   if (info->gfx_level >= GFX12) {
-      /* Gfx12 doesn't use pc_lines and pbb_max_alloc_count. */
-   } else if (info->gfx_level >= GFX11) {
-      info->pc_lines = 1024;
-      info->pbb_max_alloc_count = 16; /* minimum is 2, maximum is 256 */
-   } else if (info->gfx_level >= GFX9 && info->has_graphics) {
-      unsigned pc_lines = 0;
-
-      switch (info->family) {
-      case CHIP_VEGA10:
-      case CHIP_VEGA12:
-      case CHIP_VEGA20:
-         pc_lines = 2048;
-         break;
-      case CHIP_RAVEN:
-      case CHIP_RAVEN2:
-      case CHIP_RENOIR:
-      case CHIP_NAVI10:
-      case CHIP_NAVI12:
-      case CHIP_GFX1013:
-      case CHIP_NAVI21:
-      case CHIP_NAVI22:
-      case CHIP_NAVI23:
-         pc_lines = 1024;
-         break;
-      case CHIP_NAVI14:
-      case CHIP_NAVI24:
-         pc_lines = 512;
-         break;
-      case CHIP_VANGOGH:
-      case CHIP_REMBRANDT:
-      case CHIP_RAPHAEL_MENDOCINO:
-         pc_lines = 256;
-         break;
-      default:
-         assert(0);
-      }
-
-      info->pc_lines = pc_lines;
-
-      if (info->gfx_level >= GFX10) {
-         info->pbb_max_alloc_count = pc_lines / 3;
-      } else {
-         info->pbb_max_alloc_count = MIN2(128, pc_lines / (4 * info->max_se));
-      }
-   }
-
   /* This is the size of all TCS outputs in memory per workgroup.
    * Hawaii can't handle num_workgroups > 256 with 8K per workgroup, so use 4K.
    */
--- a/src/amd/common/ac_gpu_info.h
+++ b/src/amd/common/ac_gpu_info.h
@ -498,6 +498,7 @@ bool
 ac_identify_chip(struct radeon_info *info, const struct drm_amdgpu_info_device *device_info);
 void ac_fill_bug_info(struct radeon_info *info);
 void ac_fill_feature_info(struct radeon_info *info, const struct drm_amdgpu_info_device *device_info);
+void ac_fill_hw_info(struct radeon_info *info, const struct drm_amdgpu_info_device *device_info);

 void ac_compute_driver_uuid(char *uuid, size_t size);