i965: Calculate thread_count in brw_alloc_stage_scratch

Previously, each stage computed thread_count with its own stage-specific
calculations and passed it in to brw_alloc_stage_scratch. Those
stage-specific calculations now live in brw_alloc_stage_scratch itself,
which will allow the shader cache to use the same calculations.

Signed-off-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
This commit is contained in:
Jordan Justen 2017-10-31 00:34:32 -07:00
parent f082d7f64f
commit f9d5a7add4
8 changed files with 62 additions and 45 deletions

View file

@ -1345,8 +1345,7 @@ void brw_get_scratch_bo(struct brw_context *brw,
struct brw_bo **scratch_bo, int size);
void brw_alloc_stage_scratch(struct brw_context *brw,
struct brw_stage_state *stage_state,
unsigned per_thread_size,
unsigned thread_count);
unsigned per_thread_size);
void brw_init_shader_time(struct brw_context *brw);
int brw_get_shader_time_index(struct brw_context *brw,
struct gl_program *prog,

View file

@ -114,29 +114,7 @@ brw_codegen_cs_prog(struct brw_context *brw,
}
}
const unsigned subslices = MAX2(brw->screen->subslice_total, 1);
/* WaCSScratchSize:hsw
*
* Haswell's scratch space address calculation appears to be sparse
* rather than tightly packed. The Thread ID has bits indicating
* which subslice, EU within a subslice, and thread within an EU
* it is. There's a maximum of two slices and two subslices, so these
* can be stored with a single bit. Even though there are only 10 EUs
* per subslice, this is stored in 4 bits, so there's an effective
* maximum value of 16 EUs. Similarly, although there are only 7
* threads per EU, this is stored in a 3 bit number, giving an effective
* maximum value of 8 threads per EU.
*
* This means that we need to use 16 * 8 instead of 10 * 7 for the
* number of threads per subslice.
*/
const unsigned scratch_ids_per_subslice =
devinfo->is_haswell ? 16 * 8 : devinfo->max_cs_threads;
brw_alloc_stage_scratch(brw, &brw->cs.base,
prog_data.base.total_scratch,
scratch_ids_per_subslice * subslices);
brw_alloc_stage_scratch(brw, &brw->cs.base, prog_data.base.total_scratch);
/* The param and pull_param arrays will be freed by the shader cache. */
ralloc_steal(NULL, prog_data.base.param);

View file

@ -138,8 +138,7 @@ brw_codegen_gs_prog(struct brw_context *brw,
/* Scratch space is used for register spilling */
brw_alloc_stage_scratch(brw, stage_state,
prog_data.base.base.total_scratch,
devinfo->max_gs_threads);
prog_data.base.base.total_scratch);
/* The param and pull_param arrays will be freed by the shader cache. */
ralloc_steal(NULL, prog_data.base.base.param);

View file

@ -328,19 +328,65 @@ brw_get_scratch_bo(struct brw_context *brw,
void
brw_alloc_stage_scratch(struct brw_context *brw,
struct brw_stage_state *stage_state,
unsigned per_thread_size,
unsigned thread_count)
unsigned per_thread_size)
{
if (stage_state->per_thread_scratch < per_thread_size) {
stage_state->per_thread_scratch = per_thread_size;
if (stage_state->per_thread_scratch >= per_thread_size)
return;
if (stage_state->scratch_bo)
brw_bo_unreference(stage_state->scratch_bo);
stage_state->per_thread_scratch = per_thread_size;
stage_state->scratch_bo =
brw_bo_alloc(brw->bufmgr, "shader scratch space",
per_thread_size * thread_count, 4096);
if (stage_state->scratch_bo)
brw_bo_unreference(stage_state->scratch_bo);
const struct gen_device_info *devinfo = &brw->screen->devinfo;
unsigned thread_count;
switch(stage_state->stage) {
case MESA_SHADER_VERTEX:
thread_count = devinfo->max_vs_threads;
break;
case MESA_SHADER_TESS_CTRL:
thread_count = devinfo->max_tcs_threads;
break;
case MESA_SHADER_TESS_EVAL:
thread_count = devinfo->max_tes_threads;
break;
case MESA_SHADER_GEOMETRY:
thread_count = devinfo->max_gs_threads;
break;
case MESA_SHADER_FRAGMENT:
thread_count = devinfo->max_wm_threads;
break;
case MESA_SHADER_COMPUTE: {
const unsigned subslices = MAX2(brw->screen->subslice_total, 1);
/* WaCSScratchSize:hsw
*
* Haswell's scratch space address calculation appears to be sparse
* rather than tightly packed. The Thread ID has bits indicating
* which subslice, EU within a subslice, and thread within an EU
* it is. There's a maximum of two slices and two subslices, so these
* can be stored with a single bit. Even though there are only 10 EUs
* per subslice, this is stored in 4 bits, so there's an effective
* maximum value of 16 EUs. Similarly, although there are only 7
* threads per EU, this is stored in a 3 bit number, giving an effective
* maximum value of 8 threads per EU.
*
* This means that we need to use 16 * 8 instead of 10 * 7 for the
* number of threads per subslice.
*/
const unsigned scratch_ids_per_subslice =
devinfo->is_haswell ? 16 * 8 : devinfo->max_cs_threads;
thread_count = scratch_ids_per_subslice * subslices;
break;
}
default:
unreachable("Unsupported stage!");
}
stage_state->scratch_bo =
brw_bo_alloc(brw->bufmgr, "shader scratch space",
per_thread_size * thread_count, 4096);
}
void brwInitFragProgFuncs( struct dd_function_table *functions )

View file

@ -259,8 +259,7 @@ brw_codegen_tcs_prog(struct brw_context *brw, struct brw_program *tcp,
/* Scratch space is used for register spilling */
brw_alloc_stage_scratch(brw, stage_state,
prog_data.base.base.total_scratch,
devinfo->max_tcs_threads);
prog_data.base.base.total_scratch);
/* The param and pull_param arrays will be freed by the shader cache. */
ralloc_steal(NULL, prog_data.base.base.param);

View file

@ -129,8 +129,7 @@ brw_codegen_tes_prog(struct brw_context *brw,
/* Scratch space is used for register spilling */
brw_alloc_stage_scratch(brw, stage_state,
prog_data.base.base.total_scratch,
devinfo->max_tes_threads);
prog_data.base.base.total_scratch);
/* The param and pull_param arrays will be freed by the shader cache. */
ralloc_steal(NULL, prog_data.base.base.param);

View file

@ -248,8 +248,7 @@ brw_codegen_vs_prog(struct brw_context *brw,
/* Scratch space is used for register spilling */
brw_alloc_stage_scratch(brw, &brw->vs.base,
prog_data.base.base.total_scratch,
devinfo->max_vs_threads);
prog_data.base.base.total_scratch);
/* The param and pull_param arrays will be freed by the shader cache. */
ralloc_steal(NULL, prog_data.base.base.param);

View file

@ -209,9 +209,7 @@ brw_codegen_wm_prog(struct brw_context *brw,
}
}
brw_alloc_stage_scratch(brw, &brw->wm.base,
prog_data.base.total_scratch,
devinfo->max_wm_threads);
brw_alloc_stage_scratch(brw, &brw->wm.base, prog_data.base.total_scratch);
if (unlikely((INTEL_DEBUG & DEBUG_WM) && fp->program.is_arb_asm))
fprintf(stderr, "\n");