r600: implement gs indirect load_per_vertex_input

This functionality is useful with the software fp64
implementation: it allows the remaining tests to run.

Note: the same tests do not generate this indirect
access on cayman which has the hardware fp64
implementation enabled.

This change was tested on cypress, palm and barts.
Here are the tests fixed (fail -> pass):
spec/arb_gpu_shader_fp64/execution/gs-isnan-dvec: fail pass
spec/arb_gpu_shader_fp64/uniform_buffers/gs-array-copy: fail pass
spec/arb_gpu_shader_fp64/uniform_buffers/gs-dmat4: fail pass
spec/arb_gpu_shader_fp64/uniform_buffers/gs-dmat4-row-major: fail pass
spec/arb_gpu_shader_fp64/uniform_buffers/gs-double-array-const-index: fail pass
spec/arb_gpu_shader_fp64/uniform_buffers/gs-double-array-variable-index: fail pass
spec/arb_gpu_shader_fp64/uniform_buffers/gs-double-bool-double: fail pass
spec/arb_gpu_shader_fp64/uniform_buffers/gs-double-uniform-array-direct-indirect: fail pass
spec/arb_gpu_shader_fp64/uniform_buffers/gs-doubles-float-mixed: fail pass
spec/arb_gpu_shader_fp64/uniform_buffers/gs-dvec4-uniform-array-direct-indirect: fail pass
spec/arb_gpu_shader_fp64/uniform_buffers/gs-nested-struct: fail pass

Signed-off-by: Patrick Lerda <patrick9876@free.fr>
Reviewed-by: Gert Wollny <gert.wollny@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34926>
This commit is contained in:
Patrick Lerda 2025-05-09 15:13:16 +02:00 committed by Marge Bot
parent 8df9e3b2d0
commit 1186c73c6b
4 changed files with 209 additions and 13 deletions

View file

@ -1622,6 +1622,10 @@ intrinsic("load_local_shared_r600", src_comp=[0], dest_comp=0, indices = [], fla
store("local_shared_r600", [1], [WRITE_MASK])
store("tf_r600", [])
# these two definitions are aimed at r600 indirect per_vertex_input accesses
intrinsic("r600_indirect_vertex_at_index", dest_comp=1, src_comp=[1], flags=[CAN_ELIMINATE, CAN_REORDER])
load("r600_indirect_per_vertex_input", [1, 1], [BASE, RANGE, COMPONENT, DEST_TYPE, IO_SEMANTICS], [CAN_ELIMINATE, CAN_REORDER])
# AMD GCN/RDNA specific intrinsics
# This barrier is a hint that prevents moving the instruction that computes

View file

@ -697,11 +697,164 @@ r600_lower_to_scalar_instr_filter(const nir_instr *instr, const void *)
}
}
/* Per-shader bookkeeping for the indirect per-vertex-input lowering pass. */
struct indirect_per_vertex {
/* Deref of the local int array mapping vertex slot -> per-vertex offset;
 * nullptr until the first indirect access triggers its creation. */
nir_deref_instr *array_indirect_deref;
/* Bit k set => array element k may be referenced; elements whose bit
 * stays clear have their setup instructions removed after the pass. */
uint32_t mask;
/* The four setup instructions emitted per slot (index immediate, element
 * deref, vertex-at-index intrinsic, store_deref), kept for cleanup. */
nir_instr *saved_for_removal[R600_GS_VERTEX_INDIRECT_TOTAL][4];
/* Array derefs made dead by the rewrites, removed once the pass ends. */
unsigned obsolete_deref_count;
nir_instr *obsolete_deref[32];
};
/* NIR pass callback: rewrite an indirectly indexed per-vertex input load
 * (gl_in[i].gl_Position with non-constant i) into the r600-specific
 * load_r600_indirect_per_vertex_input intrinsic.  The run-time vertex
 * index is resolved through a small local int array that holds one
 * per-vertex offset per slot, filled once at the top of the shader via
 * r600_indirect_vertex_at_index.  Returns true when a rewrite happened. */
static bool
r600_nir_gs_load_deref_io_to_indirect_per_vertex_input(nir_builder *b,
nir_intrinsic_instr *intrin,
void *cb_data)
{
struct indirect_per_vertex *indirect_per_vertex =
(struct indirect_per_vertex *)cb_data;
unsigned j;
/* Only plain deref loads of per-vertex shader inputs are of interest. */
if (intrin->intrinsic != nir_intrinsic_load_deref)
return false;
nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
if (!nir_deref_mode_is_one_of(deref, nir_var_shader_in))
return false;
nir_variable *var = nir_deref_instr_get_variable(deref);
const bool is_arrayed = nir_is_arrayed_io(var, b->shader->info.stage);
/* Restrict the lowering to arrayed gl_Position inputs (gl_in[...]). */
if (!is_arrayed || var->data.location != VARYING_SLOT_POS)
return false;
/* A missing index means no indirect access; keep the regular path. */
nir_def *array_index = deref->arr.index.ssa;
if (!array_index)
return false;
assert(intrin->def.num_components == 4);
/* Walk up to the variable deref to validate the accessed type. */
nir_deref_instr *original_array = nir_instr_as_deref(
nir_instr_as_deref(intrin->src[0].ssa->parent_instr)->parent.ssa->parent_instr);
if (!original_array || original_array->deref_type != nir_deref_type_var ||
!glsl_type_is_array(original_array->type))
return false;
auto element_type = glsl_without_array(original_array->type);
if (element_type != &glsl_type_builtin_vec4)
return false;
const unsigned array_length = glsl_get_length(original_array->type);
assert(array_length <= R600_GS_VERTEX_INDIRECT_TOTAL);
/* Record the array deref feeding this load so it can be removed once
 * all its users are rewritten; avoid duplicate entries. */
for (j = 0; j < indirect_per_vertex->obsolete_deref_count &&
j < ARRAY_SIZE(indirect_per_vertex->obsolete_deref);
j++)
if (intrin->src[0].ssa->parent_instr == indirect_per_vertex->obsolete_deref[j])
break;
if (j == indirect_per_vertex->obsolete_deref_count &&
j != ARRAY_SIZE(indirect_per_vertex->obsolete_deref)) {
indirect_per_vertex->obsolete_deref[j] = intrin->src[0].ssa->parent_instr;
indirect_per_vertex->obsolete_deref_count++;
}
/* The next block generates a global array which is required
* for the indirect access. This array is located at the
* beginning. All the possible elements are generated. At the
* end, the elements which are not referenced are removed. */
if (!indirect_per_vertex->array_indirect_deref) {
static const char array_indirect_name[] = "r600_indirect_vertex_at_index";
b->cursor = nir_before_block(nir_start_block(b->impl));
nir_variable *array_indirect_var = nir_local_variable_create(
b->impl,
glsl_array_type(glsl_int_type(), R600_GS_VERTEX_INDIRECT_TOTAL, 0),
array_indirect_name);
indirect_per_vertex->array_indirect_deref =
nir_build_deref_var(b, array_indirect_var);
for (unsigned k = 0; k < R600_GS_VERTEX_INDIRECT_TOTAL; k++) {
nir_def *build_count = nir_imm_int(b, k);
nir_deref_instr *build_array =
nir_build_deref_array(b,
indirect_per_vertex->array_indirect_deref,
build_count);
nir_def *build_store =
nir_r600_indirect_vertex_at_index(b, intrin->def.bit_size, build_count);
nir_store_deref(b, build_array, build_store, 1);
/* Save the four instructions emitted for this slot so unused
 * slots can be stripped after the whole pass has run. */
indirect_per_vertex->saved_for_removal[k][0] = build_count->parent_instr;
indirect_per_vertex->saved_for_removal[k][1] = &build_array->instr;
indirect_per_vertex->saved_for_removal[k][2] = build_store->parent_instr;
indirect_per_vertex->saved_for_removal[k][3] =
nir_instr_next(build_store->parent_instr); // nir_store_deref
}
}
b->cursor = nir_before_instr(&intrin->instr);
/* The index is dynamic, so any element of this input array may be
 * accessed at run time: mark them all as referenced. */
for (unsigned k = 0; k < array_length; k++)
indirect_per_vertex->mask |= (1 << k);
nir_def *zero = nir_imm_int(b, 0);
/* Fetch the per-vertex offset selected by the run-time index... */
nir_def *array_indirect_def = nir_load_deref(
b,
nir_build_deref_array(b, indirect_per_vertex->array_indirect_deref, array_index));
/* ...and load the input relative to that vertex offset. */
nir_def *load = nir_load_r600_indirect_per_vertex_input(b,
intrin->def.num_components,
intrin->def.bit_size,
array_indirect_def,
zero);
nir_intrinsic_set_base(nir_instr_as_intrinsic(load->parent_instr),
var->data.driver_location);
nir_def_rewrite_uses(&intrin->def, load);
nir_instr_remove(&intrin->instr);
return true;
}
/* Run the indirect per-vertex-input lowering over the whole shader, then
 * clean up: drop the setup instructions of every vertex slot that no
 * rewritten load can reach, and the array derefs left dead by the
 * rewrites.  Returns true if any instruction was rewritten. */
static bool
r600_gs_load_deref_io_to_indirect_per_vertex_input(nir_shader *shader)
{
struct indirect_per_vertex indirect_per_vertex = {nullptr};
bool ret =
nir_shader_intrinsics_pass(shader,
r600_nir_gs_load_deref_io_to_indirect_per_vertex_input,
nir_metadata_control_flow,
&indirect_per_vertex);
if (indirect_per_vertex.array_indirect_deref) {
for (unsigned k = 0; k < R600_GS_VERTEX_INDIRECT_TOTAL; k++)
if ((indirect_per_vertex.mask & (1 << k)) == 0) {
/* Remove in reverse emission order (store, intrinsic, deref,
 * immediate) so each instruction is dropped after its users. */
nir_instr_remove(indirect_per_vertex.saved_for_removal[k][3]);
nir_instr_remove(indirect_per_vertex.saved_for_removal[k][2]);
nir_instr_remove(indirect_per_vertex.saved_for_removal[k][1]);
nir_instr_remove(indirect_per_vertex.saved_for_removal[k][0]);
}
for (unsigned k = 0; k < indirect_per_vertex.obsolete_deref_count; k++)
nir_instr_remove(indirect_per_vertex.obsolete_deref[k]);
}
return ret;
}
void
r600_finalize_nir_common(nir_shader *nir, enum amd_gfx_level gfx_level)
{
const int nir_lower_flrp_mask = 16 | 32 | 64;
if (nir->info.stage == MESA_SHADER_GEOMETRY) {
NIR_PASS(_, nir, r600_gs_load_deref_io_to_indirect_per_vertex_input);
}
NIR_PASS(_, nir, nir_lower_flrp, nir_lower_flrp_mask, false);
nir_lower_idiv_options idiv_options = {0};

View file

@ -29,7 +29,10 @@ GeometryShader::do_scan_instruction(nir_instr *instr)
case nir_intrinsic_store_output:
return process_store_output(ii);
case nir_intrinsic_load_per_vertex_input:
case nir_intrinsic_load_r600_indirect_per_vertex_input:
return process_load_input(ii);
case nir_intrinsic_r600_indirect_vertex_at_index:
return true;
default:
return false;
}
@ -165,7 +168,11 @@ GeometryShader::process_stage_intrinsic(nir_intrinsic_instr *intr)
case nir_intrinsic_load_invocation_id:
return emit_simple_mov(intr->def, 0, m_invocation_id);
case nir_intrinsic_load_per_vertex_input:
return emit_load_per_vertex_input(intr);
return emit_load_per_vertex_input_direct(intr);
case nir_intrinsic_load_r600_indirect_per_vertex_input:
return emit_load_per_vertex_input_indirect(intr);
case nir_intrinsic_r600_indirect_vertex_at_index:
return emit_indirect_vertex_at_index(intr);
default:;
}
return false;
@ -294,7 +301,43 @@ GeometryShader::store_output(nir_intrinsic_instr *instr)
}
/* Lower nir_intrinsic_r600_indirect_vertex_at_index: copy the per-vertex
 * offset register of the (compile-time constant) vertex slot into the
 * intrinsic's destination so it can later be indexed at run time. */
bool
GeometryShader::emit_indirect_vertex_at_index(nir_intrinsic_instr *instr)
{
   const nir_const_value *slot = nir_src_as_const_value(instr->src[0]);
   assert(slot);
   assert(slot->u32 < R600_GS_VERTEX_INDIRECT_TOTAL);

   auto dst = value_factory().dest(instr->def, 0, pin_free);
   emit_instruction(
      new AluInstr(op1_mov, dst, m_per_vertex_offsets[slot->u32], AluInstr::write));
   return true;
}
/* Direct (constant-index) per-vertex input load: resolve the vertex slot
 * at compile time and fetch through the shared helper. */
bool
GeometryShader::emit_load_per_vertex_input_direct(nir_intrinsic_instr *instr)
{
   const nir_const_value *slot = nir_src_as_const_value(instr->src[0]);
   assert(slot);
   assert(slot->u32 < R600_GS_VERTEX_INDIRECT_TOTAL);
   assert(nir_intrinsic_io_semantics(instr).num_slots == 1);

   auto vertex_addr = m_per_vertex_offsets[slot->u32];
   return load_per_vertex_input_at_addr(instr, vertex_addr);
}
/* Indirect per-vertex input load: the vertex offset was materialized at
 * run time (via r600_indirect_vertex_at_index) and arrives here as a
 * register source of the intrinsic. */
bool
GeometryShader::emit_load_per_vertex_input_indirect(nir_intrinsic_instr *instr)
{
   auto vertex_addr = value_factory().src(instr->src[0], 0)->as_register();
   return load_per_vertex_input_at_addr(instr, vertex_addr);
}
bool
GeometryShader::load_per_vertex_input_at_addr(nir_intrinsic_instr *instr, PRegister addr)
{
auto dest = value_factory().dest_vec4(instr->def, pin_group);
@ -303,19 +346,9 @@ GeometryShader::emit_load_per_vertex_input(nir_intrinsic_instr *instr)
dest_swz[i] = i + nir_intrinsic_component(instr);
}
auto literal_index = nir_src_as_const_value(instr->src[0]);
if (!literal_index) {
sfn_log << SfnLog::err << "GS: Indirect input addressing not (yet) supported\n";
return false;
}
assert(literal_index->u32 < R600_GS_VERTEX_INDIRECT_TOTAL);
assert(nir_intrinsic_io_semantics(instr).num_slots == 1);
EVTXDataFormat fmt =
chip_class() >= ISA_CC_EVERGREEN ? fmt_invalid : fmt_32_32_32_32_float;
auto addr = m_per_vertex_offsets[literal_index->u32];
auto fetch = new LoadFromBuffer(dest,
dest_swz,
addr,

View file

@ -34,7 +34,13 @@ private:
void emit_adj_fix();
bool emit_load_per_vertex_input(nir_intrinsic_instr *instr);
bool emit_indirect_vertex_at_index(nir_intrinsic_instr *instr);
bool emit_load_per_vertex_input_direct(nir_intrinsic_instr *instr);
bool emit_load_per_vertex_input_indirect(nir_intrinsic_instr *instr);
bool load_per_vertex_input_at_addr(nir_intrinsic_instr *instr, PRegister addr);
bool load_input(UNUSED nir_intrinsic_instr *intr) override
{