mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-27 14:50:10 +01:00
i965/cs: Split out helper for building local id payload
The initial motivation for this patch was to avoid calling brw_cs_prog_local_id_payload_dwords() in gen7_cs_state.c from the compiler. This commit ends up refactoring things a bit more so as to split out the logic to build the local id payload to brw_fs.cpp. This moves the payload building closer to the compiler code that uses the payload layout and makes it available to other users of the compiler. Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com> Signed-off-by: Kristian Høgsberg Kristensen <krh@bitplanet.net>
This commit is contained in:
parent
4f33700f5a
commit
469d0e449b
4 changed files with 77 additions and 78 deletions
|
|
@ -484,6 +484,7 @@ struct brw_cs_prog_data {
|
|||
unsigned simd_size;
|
||||
bool uses_barrier;
|
||||
bool uses_num_work_groups;
|
||||
unsigned local_invocation_id_regs;
|
||||
|
||||
struct {
|
||||
/** @{
|
||||
|
|
|
|||
|
|
@ -48,8 +48,9 @@ brw_cs_emit(struct brw_context *brw,
|
|||
struct gl_shader_program *prog,
|
||||
unsigned *final_assembly_size);
|
||||
|
||||
unsigned
|
||||
brw_cs_prog_local_id_payload_dwords(unsigned dispatch_width);
|
||||
void
|
||||
brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *cs_prog_data,
|
||||
void *buffer, uint32_t threads, uint32_t stride);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
|||
|
|
@ -4718,20 +4718,43 @@ fs_visitor::setup_vs_payload()
|
|||
payload.num_regs = 2;
|
||||
}
|
||||
|
||||
/**
|
||||
* We are building the local ID push constant data using the simplest possible
|
||||
* method. We simply push the local IDs directly as they should appear in the
|
||||
* registers for the uvec3 gl_LocalInvocationID variable.
|
||||
*
|
||||
* Therefore, for SIMD8, we use 3 full registers, and for SIMD16 we use 6
|
||||
* registers worth of push constant space.
|
||||
*
|
||||
* Note: Any updates to brw_cs_prog_local_id_payload_dwords,
|
||||
* brw_cs_fill_local_id_payload or fs_visitor::emit_cs_local_invocation_id_setup need
|
||||
* to be coordinated.
|
||||
*
|
||||
* FINISHME: There are a few easy optimizations to consider.
|
||||
*
|
||||
* 1. If gl_WorkGroupSize x, y or z is 1, we can just use zero, and there is
|
||||
* no need for using push constant space for that dimension.
|
||||
*
|
||||
* 2. Since GL_MAX_COMPUTE_WORK_GROUP_SIZE is currently 1024 or less, we can
|
||||
* easily use 16-bit words rather than 32-bit dwords in the push constant
|
||||
* data.
|
||||
*
|
||||
* 3. If gl_WorkGroupSize x, y or z is small, then we can use bytes for
|
||||
* conveying the data, and thereby reduce push constant usage.
|
||||
*
|
||||
*/
|
||||
void
|
||||
fs_visitor::setup_cs_payload()
|
||||
{
|
||||
assert(devinfo->gen >= 7);
|
||||
brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
|
||||
|
||||
payload.num_regs = 1;
|
||||
|
||||
if (nir->info.system_values_read & SYSTEM_BIT_LOCAL_INVOCATION_ID) {
|
||||
const unsigned local_id_dwords =
|
||||
brw_cs_prog_local_id_payload_dwords(dispatch_width);
|
||||
assert((local_id_dwords & 0x7) == 0);
|
||||
const unsigned local_id_regs = local_id_dwords / 8;
|
||||
prog_data->local_invocation_id_regs = dispatch_width * 3 / 8;
|
||||
payload.local_invocation_id_reg = payload.num_regs;
|
||||
payload.num_regs += local_id_regs;
|
||||
payload.num_regs += prog_data->local_invocation_id_regs;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -5171,6 +5194,42 @@ brw_wm_fs_emit(struct brw_context *brw,
|
|||
return g.get_assembly(final_assembly_size);
|
||||
}
|
||||
|
||||
void
|
||||
brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *prog_data,
|
||||
void *buffer, uint32_t threads, uint32_t stride)
|
||||
{
|
||||
if (prog_data->local_invocation_id_regs == 0)
|
||||
return;
|
||||
|
||||
/* 'stride' should be an integer number of registers, that is, a multiple
|
||||
* of 32 bytes.
|
||||
*/
|
||||
assert(stride % 32 == 0);
|
||||
|
||||
unsigned x = 0, y = 0, z = 0;
|
||||
for (unsigned t = 0; t < threads; t++) {
|
||||
uint32_t *param = (uint32_t *) buffer + stride * t / 4;
|
||||
|
||||
for (unsigned i = 0; i < prog_data->simd_size; i++) {
|
||||
param[0 * prog_data->simd_size + i] = x;
|
||||
param[1 * prog_data->simd_size + i] = y;
|
||||
param[2 * prog_data->simd_size + i] = z;
|
||||
|
||||
x++;
|
||||
if (x == prog_data->local_size[0]) {
|
||||
x = 0;
|
||||
y++;
|
||||
if (y == prog_data->local_size[1]) {
|
||||
y = 0;
|
||||
z++;
|
||||
if (z == prog_data->local_size[2])
|
||||
z = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fs_reg *
|
||||
fs_visitor::emit_cs_local_invocation_id_setup()
|
||||
{
|
||||
|
|
|
|||
|
|
@ -70,10 +70,8 @@ brw_upload_cs_state(struct brw_context *brw)
|
|||
|
||||
unsigned local_id_dwords = 0;
|
||||
|
||||
if (prog->SystemValuesRead & SYSTEM_BIT_LOCAL_INVOCATION_ID) {
|
||||
local_id_dwords =
|
||||
brw_cs_prog_local_id_payload_dwords(cs_prog_data->simd_size);
|
||||
}
|
||||
if (prog->SystemValuesRead & SYSTEM_BIT_LOCAL_INVOCATION_ID)
|
||||
local_id_dwords = cs_prog_data->local_invocation_id_regs * 8;
|
||||
|
||||
unsigned push_constant_data_size =
|
||||
(prog_data->nr_params + local_id_dwords) * sizeof(gl_constant_value);
|
||||
|
|
@ -190,63 +188,6 @@ const struct brw_tracked_state brw_cs_state = {
|
|||
};
|
||||
|
||||
|
||||
/**
|
||||
* We are building the local ID push constant data using the simplest possible
|
||||
* method. We simply push the local IDs directly as they should appear in the
|
||||
* registers for the uvec3 gl_LocalInvocationID variable.
|
||||
*
|
||||
* Therefore, for SIMD8, we use 3 full registers, and for SIMD16 we use 6
|
||||
* registers worth of push constant space.
|
||||
*
|
||||
* Note: Any updates to brw_cs_prog_local_id_payload_dwords,
|
||||
* fill_local_id_payload or fs_visitor::emit_cs_local_invocation_id_setup need
|
||||
* to be coordinated.
|
||||
*
|
||||
* FINISHME: There are a few easy optimizations to consider.
|
||||
*
|
||||
* 1. If gl_WorkGroupSize x, y or z is 1, we can just use zero, and there is
|
||||
* no need for using push constant space for that dimension.
|
||||
*
|
||||
* 2. Since GL_MAX_COMPUTE_WORK_GROUP_SIZE is currently 1024 or less, we can
|
||||
* easily use 16-bit words rather than 32-bit dwords in the push constant
|
||||
* data.
|
||||
*
|
||||
* 3. If gl_WorkGroupSize x, y or z is small, then we can use bytes for
|
||||
* conveying the data, and thereby reduce push constant usage.
|
||||
*
|
||||
*/
|
||||
unsigned
brw_cs_prog_local_id_payload_dwords(unsigned dispatch_width)
{
   /* gl_LocalInvocationID is a uvec3 and every channel gets one dword per
    * component.
    */
   const unsigned components = 3;

   return dispatch_width * components;
}
|
||||
|
||||
|
||||
static void
|
||||
fill_local_id_payload(const struct brw_cs_prog_data *cs_prog_data,
|
||||
void *buffer, unsigned *x, unsigned *y, unsigned *z)
|
||||
{
|
||||
uint32_t *param = (uint32_t *)buffer;
|
||||
for (unsigned i = 0; i < cs_prog_data->simd_size; i++) {
|
||||
param[0 * cs_prog_data->simd_size + i] = *x;
|
||||
param[1 * cs_prog_data->simd_size + i] = *y;
|
||||
param[2 * cs_prog_data->simd_size + i] = *z;
|
||||
|
||||
(*x)++;
|
||||
if (*x == cs_prog_data->local_size[0]) {
|
||||
*x = 0;
|
||||
(*y)++;
|
||||
if (*y == cs_prog_data->local_size[1]) {
|
||||
*y = 0;
|
||||
(*z)++;
|
||||
if (*z == cs_prog_data->local_size[2])
|
||||
*z = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Creates a region containing the push constants for the CS on gen7+.
|
||||
*
|
||||
|
|
@ -269,10 +210,8 @@ brw_upload_cs_push_constants(struct brw_context *brw,
|
|||
(struct brw_stage_prog_data*) cs_prog_data;
|
||||
unsigned local_id_dwords = 0;
|
||||
|
||||
if (prog->SystemValuesRead & SYSTEM_BIT_LOCAL_INVOCATION_ID) {
|
||||
local_id_dwords =
|
||||
brw_cs_prog_local_id_payload_dwords(cs_prog_data->simd_size);
|
||||
}
|
||||
if (prog->SystemValuesRead & SYSTEM_BIT_LOCAL_INVOCATION_ID)
|
||||
local_id_dwords = cs_prog_data->local_invocation_id_regs * 8;
|
||||
|
||||
/* Updates the ParameterValues[i] pointers for all parameters of the
|
||||
* basic type of PROGRAM_STATE_VAR.
|
||||
|
|
@ -302,14 +241,13 @@ brw_upload_cs_push_constants(struct brw_context *brw,
|
|||
|
||||
STATIC_ASSERT(sizeof(gl_constant_value) == sizeof(float));
|
||||
|
||||
brw_cs_fill_local_id_payload(cs_prog_data, param, threads,
|
||||
reg_aligned_constant_size);
|
||||
|
||||
/* _NEW_PROGRAM_CONSTANTS */
|
||||
unsigned x = 0, y = 0, z = 0;
|
||||
for (t = 0; t < threads; t++) {
|
||||
gl_constant_value *next_param = &param[t * param_aligned_count];
|
||||
if (local_id_dwords > 0) {
|
||||
fill_local_id_payload(cs_prog_data, (void*)next_param, &x, &y, &z);
|
||||
next_param += local_id_dwords;
|
||||
}
|
||||
gl_constant_value *next_param =
|
||||
&param[t * param_aligned_count + local_id_dwords];
|
||||
for (i = 0; i < prog_data->nr_params; i++) {
|
||||
next_param[i] = *prog_data->param[i];
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue