i965: Calculate thread_count in brw_alloc_stage_scratch

Previously, each stage computed thread_count itself, after some stage-specific calculations, and passed it in to brw_alloc_stage_scratch. Those stage-specific calculations were moved into brw_alloc_stage_scratch, which now derives thread_count from stage_state->stage; this will allow the shader cache to use the same calculations.

Signed-off-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2026-05-08 15:38:09 +02:00 · 2017-10-31 00:34:32 -07:00 · 2017-10-31 00:34:32 -07:00 · f9d5a7add4
commit f9d5a7add4
parent f082d7f64f
8 changed files with 62 additions and 45 deletions
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@ -1345,8 +1345,7 @@ void brw_get_scratch_bo(struct brw_context *brw,
 			struct brw_bo **scratch_bo, int size);
 void brw_alloc_stage_scratch(struct brw_context *brw,
                             struct brw_stage_state *stage_state,
-                             unsigned per_thread_size,
-                             unsigned thread_count);
+                             unsigned per_thread_size);
 void brw_init_shader_time(struct brw_context *brw);
 int brw_get_shader_time_index(struct brw_context *brw,
                              struct gl_program *prog,
--- a/src/mesa/drivers/dri/i965/brw_cs.c
+++ b/src/mesa/drivers/dri/i965/brw_cs.c
@ -114,29 +114,7 @@ brw_codegen_cs_prog(struct brw_context *brw,
      }
   }

-   const unsigned subslices = MAX2(brw->screen->subslice_total, 1);
-
-   /* WaCSScratchSize:hsw
-    *
-    * Haswell's scratch space address calculation appears to be sparse
-    * rather than tightly packed.  The Thread ID has bits indicating
-    * which subslice, EU within a subslice, and thread within an EU
-    * it is.  There's a maximum of two slices and two subslices, so these
-    * can be stored with a single bit.  Even though there are only 10 EUs
-    * per subslice, this is stored in 4 bits, so there's an effective
-    * maximum value of 16 EUs.  Similarly, although there are only 7
-    * threads per EU, this is stored in a 3 bit number, giving an effective
-    * maximum value of 8 threads per EU.
-    *
-    * This means that we need to use 16 * 8 instead of 10 * 7 for the
-    * number of threads per subslice.
-    */
-   const unsigned scratch_ids_per_subslice =
-      devinfo->is_haswell ? 16 * 8 : devinfo->max_cs_threads;
-
-   brw_alloc_stage_scratch(brw, &brw->cs.base,
-                           prog_data.base.total_scratch,
-                           scratch_ids_per_subslice * subslices);
+   brw_alloc_stage_scratch(brw, &brw->cs.base, prog_data.base.total_scratch);

   /* The param and pull_param arrays will be freed by the shader cache. */
   ralloc_steal(NULL, prog_data.base.param);
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@ -138,8 +138,7 @@ brw_codegen_gs_prog(struct brw_context *brw,

   /* Scratch space is used for register spilling */
   brw_alloc_stage_scratch(brw, stage_state,
-                           prog_data.base.base.total_scratch,
-                           devinfo->max_gs_threads);
+                           prog_data.base.base.total_scratch);

   /* The param and pull_param arrays will be freed by the shader cache. */
   ralloc_steal(NULL, prog_data.base.base.param);
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@ -328,19 +328,65 @@ brw_get_scratch_bo(struct brw_context *brw,
 void
 brw_alloc_stage_scratch(struct brw_context *brw,
                        struct brw_stage_state *stage_state,
-                        unsigned per_thread_size,
-                        unsigned thread_count)
+                        unsigned per_thread_size)
 {
-   if (stage_state->per_thread_scratch < per_thread_size) {
-      stage_state->per_thread_scratch = per_thread_size;
+   if (stage_state->per_thread_scratch >= per_thread_size)
+      return;

-      if (stage_state->scratch_bo)
-         brw_bo_unreference(stage_state->scratch_bo);
+   stage_state->per_thread_scratch = per_thread_size;

-      stage_state->scratch_bo =
-         brw_bo_alloc(brw->bufmgr, "shader scratch space",
-                      per_thread_size * thread_count, 4096);
+   if (stage_state->scratch_bo)
+      brw_bo_unreference(stage_state->scratch_bo);
+
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+   unsigned thread_count;
+   switch(stage_state->stage) {
+   case MESA_SHADER_VERTEX:
+      thread_count = devinfo->max_vs_threads;
+      break;
+   case MESA_SHADER_TESS_CTRL:
+      thread_count = devinfo->max_tcs_threads;
+      break;
+   case MESA_SHADER_TESS_EVAL:
+      thread_count = devinfo->max_tes_threads;
+      break;
+   case MESA_SHADER_GEOMETRY:
+      thread_count = devinfo->max_gs_threads;
+      break;
+   case MESA_SHADER_FRAGMENT:
+      thread_count = devinfo->max_wm_threads;
+      break;
+   case MESA_SHADER_COMPUTE: {
+      const unsigned subslices = MAX2(brw->screen->subslice_total, 1);
+
+      /* WaCSScratchSize:hsw
+       *
+       * Haswell's scratch space address calculation appears to be sparse
+       * rather than tightly packed.  The Thread ID has bits indicating
+       * which subslice, EU within a subslice, and thread within an EU
+       * it is.  There's a maximum of two slices and two subslices, so these
+       * can be stored with a single bit.  Even though there are only 10 EUs
+       * per subslice, this is stored in 4 bits, so there's an effective
+       * maximum value of 16 EUs.  Similarly, although there are only 7
+       * threads per EU, this is stored in a 3 bit number, giving an effective
+       * maximum value of 8 threads per EU.
+       *
+       * This means that we need to use 16 * 8 instead of 10 * 7 for the
+       * number of threads per subslice.
+       */
+      const unsigned scratch_ids_per_subslice =
+         devinfo->is_haswell ? 16 * 8 : devinfo->max_cs_threads;
+
+      thread_count = scratch_ids_per_subslice * subslices;
+      break;
   }
+   default:
+      unreachable("Unsupported stage!");
+   }
+
+   stage_state->scratch_bo =
+      brw_bo_alloc(brw->bufmgr, "shader scratch space",
+                   per_thread_size * thread_count, 4096);
 }

 void brwInitFragProgFuncs( struct dd_function_table *functions )
--- a/src/mesa/drivers/dri/i965/brw_tcs.c
+++ b/src/mesa/drivers/dri/i965/brw_tcs.c
@ -259,8 +259,7 @@ brw_codegen_tcs_prog(struct brw_context *brw, struct brw_program *tcp,

   /* Scratch space is used for register spilling */
   brw_alloc_stage_scratch(brw, stage_state,
-                           prog_data.base.base.total_scratch,
-                           devinfo->max_tcs_threads);
+                           prog_data.base.base.total_scratch);

   /* The param and pull_param arrays will be freed by the shader cache. */
   ralloc_steal(NULL, prog_data.base.base.param);
--- a/src/mesa/drivers/dri/i965/brw_tes.c
+++ b/src/mesa/drivers/dri/i965/brw_tes.c
@ -129,8 +129,7 @@ brw_codegen_tes_prog(struct brw_context *brw,

   /* Scratch space is used for register spilling */
   brw_alloc_stage_scratch(brw, stage_state,
-                           prog_data.base.base.total_scratch,
-                           devinfo->max_tes_threads);
+                           prog_data.base.base.total_scratch);

   /* The param and pull_param arrays will be freed by the shader cache. */
   ralloc_steal(NULL, prog_data.base.base.param);
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@ -248,8 +248,7 @@ brw_codegen_vs_prog(struct brw_context *brw,

   /* Scratch space is used for register spilling */
   brw_alloc_stage_scratch(brw, &brw->vs.base,
-                           prog_data.base.base.total_scratch,
-                           devinfo->max_vs_threads);
+                           prog_data.base.base.total_scratch);

   /* The param and pull_param arrays will be freed by the shader cache. */
   ralloc_steal(NULL, prog_data.base.base.param);
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@ -209,9 +209,7 @@ brw_codegen_wm_prog(struct brw_context *brw,
      }
   }

-   brw_alloc_stage_scratch(brw, &brw->wm.base,
-                           prog_data.base.total_scratch,
-                           devinfo->max_wm_threads);
+   brw_alloc_stage_scratch(brw, &brw->wm.base, prog_data.base.total_scratch);

   if (unlikely((INTEL_DEBUG & DEBUG_WM) && fp->program.is_arb_asm))
      fprintf(stderr, "\n");