mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-20 18:10:11 +01:00
nir/i965: use two slots from inputs_read for dvec3/dvec4 vertex input attributes
So far, input_reads was a bitmap tracking which vertex input locations were being used. In OpenGL, an attribute bigger than a vec4 (like a dvec3 or dvec4) consumes just one location, any other small attribute. So we mark the proper bit in inputs_read, and also the same bit in double_inputs_read if the attribute is a dvec3/dvec4. But in Vulkan, this is slightly different: a dvec3/dvec4 attribute consumes two locations, not just one. And hence two bits would be marked in inputs_read for the same vertex input attribute. To avoid handling two different situations in NIR, we just choose the latest one: in OpenGL, when creating NIR from GLSL/IR, any dvec3/dvec4 vertex input attribute is marked with two bits in the inputs_read bitmap (and also in the double_inputs_read), and following attributes are adjusted accordingly. As example, if in our GLSL/IR shader we have three attributes: layout(location = 0) vec3 attr0; layout(location = 1) dvec4 attr1; layout(location = 2) dvec3 attr2; then in our NIR shader we put attr0 in location 0, attr1 in locations 1 and 2, and attr2 in location 3 and 4. Checking carefully, basically we are using slots rather than locations in NIR. When emitting the vertices, we do a inverse map to know the corresponding location for each slot. v2 (Jason): - use two slots from inputs_read for dvec3/dvec4 NIR from GLSL/IR. v3 (Jason): - Fix commit log error. - Use ladder ifs and fix braces. - elements_double is divisible by 2, don't need DIV_ROUND_UP(). - Use if ladder instead of a switch. - Add comment about hardware restriction in 64bit vertex attributes. Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
This commit is contained in:
parent
3551a2d3ad
commit
c2acf97fcc
9 changed files with 107 additions and 60 deletions
|
|
@ -129,6 +129,19 @@ private:
|
|||
|
||||
} /* end of anonymous namespace */
|
||||
|
||||
static void
|
||||
nir_remap_attributes(nir_shader *shader)
|
||||
{
|
||||
nir_foreach_variable(var, &shader->inputs) {
|
||||
var->data.location += _mesa_bitcount_64(shader->info->double_inputs_read &
|
||||
BITFIELD64_MASK(var->data.location));
|
||||
}
|
||||
|
||||
/* Once the remap is done, reset double_inputs_read, so later it will have
|
||||
* which location/slots are doubles */
|
||||
shader->info->double_inputs_read = 0;
|
||||
}
|
||||
|
||||
nir_shader *
|
||||
glsl_to_nir(const struct gl_shader_program *shader_prog,
|
||||
gl_shader_stage stage,
|
||||
|
|
@ -146,6 +159,13 @@ glsl_to_nir(const struct gl_shader_program *shader_prog,
|
|||
|
||||
nir_lower_constant_initializers(shader, (nir_variable_mode)~0);
|
||||
|
||||
/* Remap the locations to slots so those requiring two slots will occupy
|
||||
* two locations. For instance, if we have in the IR code a dvec3 attr0 in
|
||||
* location 0 and vec4 attr1 in location 1, in NIR attr0 will use
|
||||
* locations/slots 0 and 1, and attr1 will use location/slot 2 */
|
||||
if (shader->stage == MESA_SHADER_VERTEX)
|
||||
nir_remap_attributes(shader);
|
||||
|
||||
shader->info->name = ralloc_asprintf(shader, "GLSL%d", shader_prog->Name);
|
||||
if (shader_prog->Label)
|
||||
shader->info->label = ralloc_strdup(shader, shader_prog->Label);
|
||||
|
|
@ -322,6 +342,14 @@ nir_visitor::visit(ir_variable *ir)
|
|||
var->data.compact = ir->type->without_array()->is_scalar();
|
||||
}
|
||||
}
|
||||
|
||||
/* Mark all the locations that require two slots */
|
||||
if (glsl_type_is_dual_slot(glsl_without_array(var->type))) {
|
||||
for (uint i = 0; i < glsl_count_attribute_slots(var->type, true); i++) {
|
||||
uint64_t bitfield = BITFIELD64_BIT(var->data.location + i);
|
||||
shader->info->double_inputs_read |= bitfield;
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case ir_var_shader_out:
|
||||
|
|
|
|||
|
|
@ -53,11 +53,6 @@ set_io_mask(nir_shader *shader, nir_variable *var, int offset, int len)
|
|||
else
|
||||
shader->info->inputs_read |= bitfield;
|
||||
|
||||
/* double inputs read is only for vertex inputs */
|
||||
if (shader->stage == MESA_SHADER_VERTEX &&
|
||||
glsl_type_is_dual_slot(glsl_without_array(var->type)))
|
||||
shader->info->double_inputs_read |= bitfield;
|
||||
|
||||
if (shader->stage == MESA_SHADER_FRAGMENT) {
|
||||
shader->info->fs.uses_sample_qualifier |= var->data.sample;
|
||||
}
|
||||
|
|
@ -83,26 +78,21 @@ static void
|
|||
mark_whole_variable(nir_shader *shader, nir_variable *var)
|
||||
{
|
||||
const struct glsl_type *type = var->type;
|
||||
bool is_vertex_input = false;
|
||||
|
||||
if (nir_is_per_vertex_io(var, shader->stage)) {
|
||||
assert(glsl_type_is_array(type));
|
||||
type = glsl_get_array_element(type);
|
||||
}
|
||||
|
||||
if (shader->stage == MESA_SHADER_VERTEX &&
|
||||
var->data.mode == nir_var_shader_in)
|
||||
is_vertex_input = true;
|
||||
|
||||
const unsigned slots =
|
||||
var->data.compact ? DIV_ROUND_UP(glsl_get_length(type), 4)
|
||||
: glsl_count_attribute_slots(type, is_vertex_input);
|
||||
: glsl_count_attribute_slots(type, false);
|
||||
|
||||
set_io_mask(shader, var, 0, slots);
|
||||
}
|
||||
|
||||
static unsigned
|
||||
get_io_offset(nir_deref_var *deref, bool is_vertex_input)
|
||||
get_io_offset(nir_deref_var *deref)
|
||||
{
|
||||
unsigned offset = 0;
|
||||
|
||||
|
|
@ -117,7 +107,7 @@ get_io_offset(nir_deref_var *deref, bool is_vertex_input)
|
|||
return -1;
|
||||
}
|
||||
|
||||
offset += glsl_count_attribute_slots(tail->type, is_vertex_input) *
|
||||
offset += glsl_count_attribute_slots(tail->type, false) *
|
||||
deref_array->base_offset;
|
||||
}
|
||||
/* TODO: we can get the offset for structs here see nir_lower_io() */
|
||||
|
|
@ -163,12 +153,7 @@ try_mask_partial_io(nir_shader *shader, nir_deref_var *deref)
|
|||
return false;
|
||||
}
|
||||
|
||||
bool is_vertex_input = false;
|
||||
if (shader->stage == MESA_SHADER_VERTEX &&
|
||||
var->data.mode == nir_var_shader_in)
|
||||
is_vertex_input = true;
|
||||
|
||||
unsigned offset = get_io_offset(deref, is_vertex_input);
|
||||
unsigned offset = get_io_offset(deref);
|
||||
if (offset == -1)
|
||||
return false;
|
||||
|
||||
|
|
@ -184,8 +169,7 @@ try_mask_partial_io(nir_shader *shader, nir_deref_var *deref)
|
|||
}
|
||||
|
||||
/* double element width for double types that takes two slots */
|
||||
if (!is_vertex_input &&
|
||||
glsl_type_is_dual_slot(glsl_without_array(type))) {
|
||||
if (glsl_type_is_dual_slot(glsl_without_array(type))) {
|
||||
elem_width *= 2;
|
||||
}
|
||||
|
||||
|
|
@ -220,13 +204,27 @@ gather_intrinsic_info(nir_intrinsic_instr *instr, nir_shader *shader)
|
|||
case nir_intrinsic_interp_var_at_sample:
|
||||
case nir_intrinsic_interp_var_at_offset:
|
||||
case nir_intrinsic_load_var:
|
||||
case nir_intrinsic_store_var:
|
||||
if (instr->variables[0]->var->data.mode == nir_var_shader_in ||
|
||||
instr->variables[0]->var->data.mode == nir_var_shader_out) {
|
||||
case nir_intrinsic_store_var: {
|
||||
nir_variable *var = instr->variables[0]->var;
|
||||
|
||||
if (var->data.mode == nir_var_shader_in ||
|
||||
var->data.mode == nir_var_shader_out) {
|
||||
if (!try_mask_partial_io(shader, instr->variables[0]))
|
||||
mark_whole_variable(shader, instr->variables[0]->var);
|
||||
mark_whole_variable(shader, var);
|
||||
|
||||
/* We need to track which input_reads bits correspond to a
|
||||
* dvec3/dvec4 input attribute */
|
||||
if (shader->stage == MESA_SHADER_VERTEX &&
|
||||
var->data.mode == nir_var_shader_in &&
|
||||
glsl_type_is_dual_slot(glsl_without_array(var->type))) {
|
||||
for (uint i = 0; i < glsl_count_attribute_slots(var->type, false); i++) {
|
||||
int idx = var->data.location + i;
|
||||
shader->info->double_inputs_read |= BITFIELD64_BIT(idx);
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case nir_intrinsic_load_draw_id:
|
||||
case nir_intrinsic_load_front_face:
|
||||
|
|
|
|||
|
|
@ -42,9 +42,35 @@ vertex_element_comp_control(enum isl_format format, unsigned comp)
|
|||
default: unreachable("Invalid component");
|
||||
}
|
||||
|
||||
/*
|
||||
* Take in account hardware restrictions when dealing with 64-bit floats.
|
||||
*
|
||||
* From Broadwell spec, command reference structures, page 586:
|
||||
* "When SourceElementFormat is set to one of the *64*_PASSTHRU formats,
|
||||
* 64-bit components are stored * in the URB without any conversion. In
|
||||
* this case, vertex elements must be written as 128 or 256 bits, with
|
||||
* VFCOMP_STORE_0 being used to pad the output as required. E.g., if
|
||||
* R64_PASSTHRU is used to copy a 64-bit Red component into the URB,
|
||||
* Component 1 must be specified as VFCOMP_STORE_0 (with Components 2,3
|
||||
* set to VFCOMP_NOSTORE) in order to output a 128-bit vertex element, or
|
||||
* Components 1-3 must be specified as VFCOMP_STORE_0 in order to output
|
||||
* a 256-bit vertex element. Likewise, use of R64G64B64_PASSTHRU requires
|
||||
* Component 3 to be specified as VFCOMP_STORE_0 in order to output a
|
||||
* 256-bit vertex element."
|
||||
*/
|
||||
if (bits) {
|
||||
return VFCOMP_STORE_SRC;
|
||||
} else if (comp < 3) {
|
||||
} else if (comp >= 2 &&
|
||||
!isl_format_layouts[format].channels.b.bits &&
|
||||
isl_format_layouts[format].channels.r.type == ISL_RAW) {
|
||||
/* When emitting 64-bit attributes, we need to write either 128 or 256
|
||||
* bit chunks, using VFCOMP_NOSTORE when not writing the chunk, and
|
||||
* VFCOMP_STORE_0 to pad the written chunk */
|
||||
return VFCOMP_NOSTORE;
|
||||
} else if (comp < 3 ||
|
||||
isl_format_layouts[format].channels.r.type == ISL_RAW) {
|
||||
/* Note we need to pad with value 0, not 1, due hardware restrictions
|
||||
* (see comment above) */
|
||||
return VFCOMP_STORE_0;
|
||||
} else if (isl_format_layouts[format].channels.r.type == ISL_UINT ||
|
||||
isl_format_layouts[format].channels.r.type == ISL_SINT) {
|
||||
|
|
@ -64,8 +90,10 @@ emit_vertex_input(struct anv_pipeline *pipeline,
|
|||
|
||||
/* Pull inputs_read out of the VS prog data */
|
||||
const uint64_t inputs_read = vs_prog_data->inputs_read;
|
||||
const uint64_t double_inputs_read = vs_prog_data->double_inputs_read;
|
||||
assert((inputs_read & ((1 << VERT_ATTRIB_GENERIC0) - 1)) == 0);
|
||||
const uint32_t elements = inputs_read >> VERT_ATTRIB_GENERIC0;
|
||||
const uint32_t elements_double = double_inputs_read >> VERT_ATTRIB_GENERIC0;
|
||||
|
||||
#if GEN_GEN >= 8
|
||||
/* On BDW+, we only need to allocate space for base ids. Setting up
|
||||
|
|
@ -83,13 +111,16 @@ emit_vertex_input(struct anv_pipeline *pipeline,
|
|||
vs_prog_data->uses_baseinstance;
|
||||
#endif
|
||||
|
||||
uint32_t elem_count = __builtin_popcount(elements) + needs_svgs_elem;
|
||||
if (elem_count == 0)
|
||||
uint32_t elem_count = __builtin_popcount(elements) -
|
||||
__builtin_popcount(elements_double) / 2;
|
||||
|
||||
uint32_t total_elems = elem_count + needs_svgs_elem;
|
||||
if (total_elems == 0)
|
||||
return;
|
||||
|
||||
uint32_t *p;
|
||||
|
||||
const uint32_t num_dwords = 1 + elem_count * 2;
|
||||
const uint32_t num_dwords = 1 + total_elems * 2;
|
||||
p = anv_batch_emitn(&pipeline->batch, num_dwords,
|
||||
GENX(3DSTATE_VERTEX_ELEMENTS));
|
||||
memset(p + 1, 0, (num_dwords - 1) * 4);
|
||||
|
|
@ -107,7 +138,10 @@ emit_vertex_input(struct anv_pipeline *pipeline,
|
|||
if ((elements & (1 << desc->location)) == 0)
|
||||
continue; /* Binding unused */
|
||||
|
||||
uint32_t slot = __builtin_popcount(elements & ((1 << desc->location) - 1));
|
||||
uint32_t slot =
|
||||
__builtin_popcount(elements & ((1 << desc->location) - 1)) -
|
||||
DIV_ROUND_UP(__builtin_popcount(elements_double &
|
||||
((1 << desc->location) -1)), 2);
|
||||
|
||||
struct GENX(VERTEX_ELEMENT_STATE) element = {
|
||||
.VertexBufferIndex = desc->binding,
|
||||
|
|
@ -137,7 +171,7 @@ emit_vertex_input(struct anv_pipeline *pipeline,
|
|||
#endif
|
||||
}
|
||||
|
||||
const uint32_t id_slot = __builtin_popcount(elements);
|
||||
const uint32_t id_slot = elem_count;
|
||||
if (needs_svgs_elem) {
|
||||
/* From the Broadwell PRM for the 3D_Vertex_Component_Control enum:
|
||||
* "Within a VERTEX_ELEMENT_STATE structure, if a Component
|
||||
|
|
|
|||
|
|
@ -481,11 +481,16 @@ brw_prepare_vertices(struct brw_context *brw)
|
|||
/* Accumulate the list of enabled arrays. */
|
||||
brw->vb.nr_enabled = 0;
|
||||
while (vs_inputs) {
|
||||
GLuint index = ffsll(vs_inputs) - 1;
|
||||
GLuint first = ffsll(vs_inputs) - 1;
|
||||
GLuint index =
|
||||
first - DIV_ROUND_UP(_mesa_bitcount_64(vs_prog_data->double_inputs_read &
|
||||
BITFIELD64_MASK(first)), 2);
|
||||
struct brw_vertex_element *input = &brw->vb.inputs[index];
|
||||
input->is_dual_slot = brw->gen >= 8 &&
|
||||
(vs_prog_data->double_inputs_read & BITFIELD64_BIT(index)) != 0;
|
||||
vs_inputs &= ~BITFIELD64_BIT(index);
|
||||
(vs_prog_data->double_inputs_read & BITFIELD64_BIT(first)) != 0;
|
||||
vs_inputs &= ~BITFIELD64_BIT(first);
|
||||
if (input->is_dual_slot)
|
||||
vs_inputs &= ~BITFIELD64_BIT(first + 1);
|
||||
brw->vb.enabled[brw->vb.nr_enabled++] = input;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -492,19 +492,6 @@ type_size_scalar(const struct glsl_type *type)
|
|||
return 0;
|
||||
}
|
||||
|
||||
/* Attribute arrays are loaded as one vec4 per element (or matrix column),
|
||||
* except for double-precision types, which are loaded as one dvec4.
|
||||
*/
|
||||
extern "C" int
|
||||
type_size_vs_input(const struct glsl_type *type)
|
||||
{
|
||||
if (type->is_double()) {
|
||||
return type_size_dvec4(type);
|
||||
} else {
|
||||
return type_size_vec4(type);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a MOV to read the timestamp register.
|
||||
*
|
||||
|
|
|
|||
|
|
@ -36,8 +36,7 @@ fs_reg *
|
|||
fs_visitor::emit_vs_system_value(int location)
|
||||
{
|
||||
fs_reg *reg = new(this->mem_ctx)
|
||||
fs_reg(ATTR, 4 * (_mesa_bitcount_64(nir->info->inputs_read) +
|
||||
_mesa_bitcount_64(nir->info->double_inputs_read)),
|
||||
fs_reg(ATTR, 4 * _mesa_bitcount_64(nir->info->inputs_read),
|
||||
BRW_REGISTER_TYPE_D);
|
||||
struct brw_vs_prog_data *vs_prog_data = brw_vs_prog_data(prog_data);
|
||||
|
||||
|
|
|
|||
|
|
@ -113,9 +113,7 @@ remap_vs_attrs(nir_block *block, shader_info *nir_info)
|
|||
int attr = intrin->const_index[0];
|
||||
int slot = _mesa_bitcount_64(nir_info->inputs_read &
|
||||
BITFIELD64_MASK(attr));
|
||||
int dslot = _mesa_bitcount_64(nir_info->double_inputs_read &
|
||||
BITFIELD64_MASK(attr));
|
||||
intrin->const_index[0] = 4 * (slot + dslot);
|
||||
intrin->const_index[0] = 4 * slot;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
|
|
@ -268,7 +266,7 @@ brw_nir_lower_vs_inputs(nir_shader *nir,
|
|||
* loaded as one vec4 or dvec4 per element (or matrix column), depending on
|
||||
* whether it is a double-precision type or not.
|
||||
*/
|
||||
nir_lower_io(nir, nir_var_shader_in, type_size_vs_input, 0);
|
||||
nir_lower_io(nir, nir_var_shader_in, type_size_vec4, 0);
|
||||
|
||||
/* This pass needs actual constants */
|
||||
nir_opt_constant_folding(nir);
|
||||
|
|
|
|||
|
|
@ -34,7 +34,6 @@ extern "C" {
|
|||
int type_size_scalar(const struct glsl_type *type);
|
||||
int type_size_vec4(const struct glsl_type *type);
|
||||
int type_size_dvec4(const struct glsl_type *type);
|
||||
int type_size_vs_input(const struct glsl_type *type);
|
||||
|
||||
static inline int
|
||||
type_size_scalar_bytes(const struct glsl_type *type)
|
||||
|
|
|
|||
|
|
@ -2737,7 +2737,7 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
|
|||
((1 << shader->info->cull_distance_array_size) - 1) <<
|
||||
shader->info->clip_distance_array_size;
|
||||
|
||||
unsigned nr_attributes = _mesa_bitcount_64(prog_data->inputs_read);
|
||||
unsigned nr_attribute_slots = _mesa_bitcount_64(prog_data->inputs_read);
|
||||
|
||||
/* gl_VertexID and gl_InstanceID are system values, but arrive via an
|
||||
* incoming vertex attribute. So, add an extra slot.
|
||||
|
|
@ -2747,18 +2747,17 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
|
|||
BITFIELD64_BIT(SYSTEM_VALUE_BASE_INSTANCE) |
|
||||
BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) |
|
||||
BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID))) {
|
||||
nr_attributes++;
|
||||
nr_attribute_slots++;
|
||||
}
|
||||
|
||||
/* gl_DrawID has its very own vec4 */
|
||||
if (shader->info->system_values_read &
|
||||
BITFIELD64_BIT(SYSTEM_VALUE_DRAW_ID)) {
|
||||
nr_attributes++;
|
||||
nr_attribute_slots++;
|
||||
}
|
||||
|
||||
unsigned nr_attribute_slots =
|
||||
nr_attributes +
|
||||
_mesa_bitcount_64(shader->info->double_inputs_read);
|
||||
unsigned nr_attributes = nr_attribute_slots -
|
||||
DIV_ROUND_UP(_mesa_bitcount_64(shader->info->double_inputs_read), 2);
|
||||
|
||||
/* The 3DSTATE_VS documentation lists the lower bound on "Vertex URB Entry
|
||||
* Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode. Empirically, in
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue