i965/cs: Split out helper for building local id payload

The initial motivation for this patch was to avoid calling
brw_cs_prog_local_id_payload_dwords() in gen7_cs_state.c from the
compiler. This commit ends up refactoring things a bit more so as to
split out the logic to build the local id payload to brw_fs.cpp. This
moves the payload building closer to the compiler code that uses the
payload layout and makes it available to other users of the compiler.

Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Signed-off-by: Kristian Høgsberg Kristensen <krh@bitplanet.net>
This commit is contained in:
Kristian Høgsberg Kristensen 2015-10-05 22:07:58 -07:00
parent 4f33700f5a
commit 469d0e449b
4 changed files with 77 additions and 78 deletions

View file

@ -484,6 +484,7 @@ struct brw_cs_prog_data {
unsigned simd_size;
bool uses_barrier;
bool uses_num_work_groups;
unsigned local_invocation_id_regs;
struct {
/** @{

View file

@ -48,8 +48,9 @@ brw_cs_emit(struct brw_context *brw,
struct gl_shader_program *prog,
unsigned *final_assembly_size);
unsigned
brw_cs_prog_local_id_payload_dwords(unsigned dispatch_width);
void
brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *cs_prog_data,
void *buffer, uint32_t threads, uint32_t stride);
#ifdef __cplusplus
}

View file

@ -4718,20 +4718,43 @@ fs_visitor::setup_vs_payload()
payload.num_regs = 2;
}
/**
* We are building the local ID push constant data using the simplest possible
* method. We simply push the local IDs directly as they should appear in the
* registers for the uvec3 gl_LocalInvocationID variable.
*
* Therefore, for SIMD8, we use 3 full registers, and for SIMD16 we use 6
* registers worth of push constant space.
*
* Note: Any updates to brw_cs_prog_local_id_payload_dwords,
* fill_local_id_payload or fs_visitor::emit_cs_local_invocation_id_setup need
* to coordinated.
*
* FINISHME: There are a few easy optimizations to consider.
*
* 1. If gl_WorkGroupSize x, y or z is 1, we can just use zero, and there is
* no need for using push constant space for that dimension.
*
* 2. Since GL_MAX_COMPUTE_WORK_GROUP_SIZE is currently 1024 or less, we can
* easily use 16-bit words rather than 32-bit dwords in the push constant
* data.
*
* 3. If gl_WorkGroupSize x, y or z is small, then we can use bytes for
* conveying the data, and thereby reduce push constant usage.
*
*/
void
fs_visitor::setup_cs_payload()
{
assert(devinfo->gen >= 7);
brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
payload.num_regs = 1;
if (nir->info.system_values_read & SYSTEM_BIT_LOCAL_INVOCATION_ID) {
const unsigned local_id_dwords =
brw_cs_prog_local_id_payload_dwords(dispatch_width);
assert((local_id_dwords & 0x7) == 0);
const unsigned local_id_regs = local_id_dwords / 8;
prog_data->local_invocation_id_regs = dispatch_width * 3 / 8;
payload.local_invocation_id_reg = payload.num_regs;
payload.num_regs += local_id_regs;
payload.num_regs += prog_data->local_invocation_id_regs;
}
}
@ -5171,6 +5194,42 @@ brw_wm_fs_emit(struct brw_context *brw,
return g.get_assembly(final_assembly_size);
}
void
brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *prog_data,
void *buffer, uint32_t threads, uint32_t stride)
{
if (prog_data->local_invocation_id_regs == 0)
return;
/* 'stride' should be an integer number of registers, that is, a multiple
* of 32 bytes.
*/
assert(stride % 32 == 0);
unsigned x = 0, y = 0, z = 0;
for (unsigned t = 0; t < threads; t++) {
uint32_t *param = (uint32_t *) buffer + stride * t / 4;
for (unsigned i = 0; i < prog_data->simd_size; i++) {
param[0 * prog_data->simd_size + i] = x;
param[1 * prog_data->simd_size + i] = y;
param[2 * prog_data->simd_size + i] = z;
x++;
if (x == prog_data->local_size[0]) {
x = 0;
y++;
if (y == prog_data->local_size[1]) {
y = 0;
z++;
if (z == prog_data->local_size[2])
z = 0;
}
}
}
}
}
fs_reg *
fs_visitor::emit_cs_local_invocation_id_setup()
{

View file

@ -70,10 +70,8 @@ brw_upload_cs_state(struct brw_context *brw)
unsigned local_id_dwords = 0;
if (prog->SystemValuesRead & SYSTEM_BIT_LOCAL_INVOCATION_ID) {
local_id_dwords =
brw_cs_prog_local_id_payload_dwords(cs_prog_data->simd_size);
}
if (prog->SystemValuesRead & SYSTEM_BIT_LOCAL_INVOCATION_ID)
local_id_dwords = cs_prog_data->local_invocation_id_regs * 8;
unsigned push_constant_data_size =
(prog_data->nr_params + local_id_dwords) * sizeof(gl_constant_value);
@ -190,63 +188,6 @@ const struct brw_tracked_state brw_cs_state = {
};
/**
* We are building the local ID push constant data using the simplest possible
* method. We simply push the local IDs directly as they should appear in the
* registers for the uvec3 gl_LocalInvocationID variable.
*
* Therefore, for SIMD8, we use 3 full registers, and for SIMD16 we use 6
* registers worth of push constant space.
*
* Note: Any updates to brw_cs_prog_local_id_payload_dwords,
* fill_local_id_payload or fs_visitor::emit_cs_local_invocation_id_setup need
* to coordinated.
*
* FINISHME: There are a few easy optimizations to consider.
*
* 1. If gl_WorkGroupSize x, y or z is 1, we can just use zero, and there is
* no need for using push constant space for that dimension.
*
* 2. Since GL_MAX_COMPUTE_WORK_GROUP_SIZE is currently 1024 or less, we can
* easily use 16-bit words rather than 32-bit dwords in the push constant
* data.
*
* 3. If gl_WorkGroupSize x, y or z is small, then we can use bytes for
* conveying the data, and thereby reduce push constant usage.
*
*/
unsigned
brw_cs_prog_local_id_payload_dwords(unsigned dispatch_width)
{
return 3 * dispatch_width;
}
static void
fill_local_id_payload(const struct brw_cs_prog_data *cs_prog_data,
void *buffer, unsigned *x, unsigned *y, unsigned *z)
{
uint32_t *param = (uint32_t *)buffer;
for (unsigned i = 0; i < cs_prog_data->simd_size; i++) {
param[0 * cs_prog_data->simd_size + i] = *x;
param[1 * cs_prog_data->simd_size + i] = *y;
param[2 * cs_prog_data->simd_size + i] = *z;
(*x)++;
if (*x == cs_prog_data->local_size[0]) {
*x = 0;
(*y)++;
if (*y == cs_prog_data->local_size[1]) {
*y = 0;
(*z)++;
if (*z == cs_prog_data->local_size[2])
*z = 0;
}
}
}
}
/**
* Creates a region containing the push constants for the CS on gen7+.
*
@ -269,10 +210,8 @@ brw_upload_cs_push_constants(struct brw_context *brw,
(struct brw_stage_prog_data*) cs_prog_data;
unsigned local_id_dwords = 0;
if (prog->SystemValuesRead & SYSTEM_BIT_LOCAL_INVOCATION_ID) {
local_id_dwords =
brw_cs_prog_local_id_payload_dwords(cs_prog_data->simd_size);
}
if (prog->SystemValuesRead & SYSTEM_BIT_LOCAL_INVOCATION_ID)
local_id_dwords = cs_prog_data->local_invocation_id_regs * 8;
/* Updates the ParamaterValues[i] pointers for all parameters of the
* basic type of PROGRAM_STATE_VAR.
@ -302,14 +241,13 @@ brw_upload_cs_push_constants(struct brw_context *brw,
STATIC_ASSERT(sizeof(gl_constant_value) == sizeof(float));
brw_cs_fill_local_id_payload(cs_prog_data, param, threads,
reg_aligned_constant_size);
/* _NEW_PROGRAM_CONSTANTS */
unsigned x = 0, y = 0, z = 0;
for (t = 0; t < threads; t++) {
gl_constant_value *next_param = &param[t * param_aligned_count];
if (local_id_dwords > 0) {
fill_local_id_payload(cs_prog_data, (void*)next_param, &x, &y, &z);
next_param += local_id_dwords;
}
gl_constant_value *next_param =
&param[t * param_aligned_count + local_id_dwords];
for (i = 0; i < prog_data->nr_params; i++) {
next_param[i] = *prog_data->param[i];
}