From 1186c73c6b893c51596f5ce2bd624c40132b4854 Mon Sep 17 00:00:00 2001 From: Patrick Lerda Date: Fri, 9 May 2025 15:13:16 +0200 Subject: [PATCH] r600: implement gs indirect load_per_vertex_input This functionality is useful with the software fp64 implementation. It allows running the remaining tests. Note: the same tests do not generate this indirect access on cayman which has the hardware fp64 implementation enabled. This change was tested on cypress, palm and barts. Here are the tests fixed: spec/arb_gpu_shader_fp64/execution/gs-isnan-dvec: fail pass spec/arb_gpu_shader_fp64/uniform_buffers/gs-array-copy: fail pass spec/arb_gpu_shader_fp64/uniform_buffers/gs-dmat4: fail pass spec/arb_gpu_shader_fp64/uniform_buffers/gs-dmat4-row-major: fail pass spec/arb_gpu_shader_fp64/uniform_buffers/gs-double-array-const-index: fail pass spec/arb_gpu_shader_fp64/uniform_buffers/gs-double-array-variable-index: fail pass spec/arb_gpu_shader_fp64/uniform_buffers/gs-double-bool-double: fail pass spec/arb_gpu_shader_fp64/uniform_buffers/gs-double-uniform-array-direct-indirect: fail pass spec/arb_gpu_shader_fp64/uniform_buffers/gs-doubles-float-mixed: fail pass spec/arb_gpu_shader_fp64/uniform_buffers/gs-dvec4-uniform-array-direct-indirect: fail pass spec/arb_gpu_shader_fp64/uniform_buffers/gs-nested-struct: fail pass Signed-off-by: Patrick Lerda Reviewed-by: Gert Wollny Part-of: --- src/compiler/nir/nir_intrinsics.py | 4 + src/gallium/drivers/r600/sfn/sfn_nir.cpp | 153 ++++++++++++++++++ .../drivers/r600/sfn/sfn_shader_gs.cpp | 57 +++++-- src/gallium/drivers/r600/sfn/sfn_shader_gs.h | 8 +- 4 files changed, 209 insertions(+), 13 deletions(-) diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index 0e0373b03ab..99290c0ace9 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -1622,6 +1622,10 @@ intrinsic("load_local_shared_r600", src_comp=[0], dest_comp=0, indices = [], fla store("local_shared_r600", [1], 
[WRITE_MASK]) store("tf_r600", []) +# these two definitions are aimed at r600 indirect per_vertex_input accesses +intrinsic("r600_indirect_vertex_at_index", dest_comp=1, src_comp=[1], flags=[CAN_ELIMINATE, CAN_REORDER]) +load("r600_indirect_per_vertex_input", [1, 1], [BASE, RANGE, COMPONENT, DEST_TYPE, IO_SEMANTICS], [CAN_ELIMINATE, CAN_REORDER]) + # AMD GCN/RDNA specific intrinsics # This barrier is a hint that prevents moving the instruction that computes diff --git a/src/gallium/drivers/r600/sfn/sfn_nir.cpp b/src/gallium/drivers/r600/sfn/sfn_nir.cpp index 5838f818655..4680ceea76d 100644 --- a/src/gallium/drivers/r600/sfn/sfn_nir.cpp +++ b/src/gallium/drivers/r600/sfn/sfn_nir.cpp @@ -697,11 +697,164 @@ r600_lower_to_scalar_instr_filter(const nir_instr *instr, const void *) } } +struct indirect_per_vertex { /* Scratch state handed to the per-intrinsic callback below through cb_data. */ + nir_deref_instr *array_indirect_deref; /* deref of the function-local int array; stays null until the first load is rewritten */ + uint32_t mask; /* bit k is set once a rewritten input array has more than k elements */ + nir_instr *saved_for_removal[R600_GS_VERTEX_INDIRECT_TOTAL][4]; /* per array element: the four setup instructions (index constant, array deref, r600_indirect_vertex_at_index, store) */ + unsigned obsolete_deref_count; /* number of valid entries in obsolete_deref */ + nir_instr *obsolete_deref[32]; /* source deref instructions of the rewritten loads; removed after the pass */ +}; + +static bool +r600_nir_gs_load_deref_io_to_indirect_per_vertex_input(nir_builder *b, + nir_intrinsic_instr *intrin, + void *cb_data) +{ /* Callback for nir_shader_intrinsics_pass: replaces a load_deref of an arrayed VARYING_SLOT_POS shader input whose variable is an array of vec4 with load_r600_indirect_per_vertex_input, using a function-local int array filled by r600_indirect_vertex_at_index as the first source. No distinction is made here between constant and non-constant array indices. Returns true only when the load was replaced. */ + struct indirect_per_vertex *indirect_per_vertex = + (struct indirect_per_vertex *)cb_data; + unsigned j; + + if (intrin->intrinsic != nir_intrinsic_load_deref) + return false; + + nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]); + + if (!nir_deref_mode_is_one_of(deref, nir_var_shader_in)) + return false; + + nir_variable *var = nir_deref_instr_get_variable(deref); + const bool is_arrayed = nir_is_arrayed_io(var, b->shader->info.stage); + + if (!is_arrayed || var->data.location != VARYING_SLOT_POS) + return false; + + nir_def *array_index = deref->arr.index.ssa; /* NOTE(review): arr is read without a visible check that deref is an array deref; confirm this always holds for loads that pass the filters above */ + + if (!array_index) + return false; + + assert(intrin->def.num_components == 4); + + nir_deref_instr *original_array = nir_instr_as_deref( + nir_instr_as_deref(intrin->src[0].ssa->parent_instr)->parent.ssa->parent_instr); /* parent deref of the load source; required below to be the variable deref of a vec4 array */ + + if (!original_array || 
original_array->deref_type != nir_deref_type_var || + !glsl_type_is_array(original_array->type)) + return false; + + auto element_type = glsl_without_array(original_array->type); + + if (element_type != &glsl_type_builtin_vec4) + return false; + + const unsigned array_length = glsl_get_length(original_array->type); + + assert(array_length <= R600_GS_VERTEX_INDIRECT_TOTAL); /* debug-only bound on the number of mask bits set further down */ + + for (j = 0; j < indirect_per_vertex->obsolete_deref_count && + j < ARRAY_SIZE(indirect_per_vertex->obsolete_deref); + j++) + if (intrin->src[0].ssa->parent_instr == indirect_per_vertex->obsolete_deref[j]) + break; /* j now indexes the matching entry, or equals the entry count when this deref was not recorded yet */ + + if (j == indirect_per_vertex->obsolete_deref_count && + j != ARRAY_SIZE(indirect_per_vertex->obsolete_deref)) { + indirect_per_vertex->obsolete_deref[j] = intrin->src[0].ssa->parent_instr; + indirect_per_vertex->obsolete_deref_count++; /* first sighting and room left in the table: remember the deref for removal after the pass; once the table is full further derefs are not recorded */ + } + + /* The next block generates a function-local array which is required + * for the indirect access. This array is located at the + * beginning. All the possible elements are generated. At the + * end, the elements which are not referenced are removed. 
*/ + if (!indirect_per_vertex->array_indirect_deref) { + static const char array_indirect_name[] = "r600_indirect_vertex_at_index"; + + b->cursor = nir_before_block(nir_start_block(b->impl)); /* the array setup is emitted at the very start of the function */ + + nir_variable *array_indirect_var = nir_local_variable_create( + b->impl, + glsl_array_type(glsl_int_type(), R600_GS_VERTEX_INDIRECT_TOTAL, 0), + array_indirect_name); + indirect_per_vertex->array_indirect_deref = + nir_build_deref_var(b, array_indirect_var); + + for (unsigned k = 0; k < R600_GS_VERTEX_INDIRECT_TOTAL; k++) { + nir_def *build_count = nir_imm_int(b, k); + nir_deref_instr *build_array = + nir_build_deref_array(b, + indirect_per_vertex->array_indirect_deref, + build_count); + nir_def *build_store = + nir_r600_indirect_vertex_at_index(b, intrin->def.bit_size, build_count); + nir_store_deref(b, build_array, build_store, 1); /* write mask 1: one int component per element */ + indirect_per_vertex->saved_for_removal[k][0] = build_count->parent_instr; + indirect_per_vertex->saved_for_removal[k][1] = &build_array->instr; + indirect_per_vertex->saved_for_removal[k][2] = build_store->parent_instr; + indirect_per_vertex->saved_for_removal[k][3] = + nir_instr_next(build_store->parent_instr); // nir_store_deref + } + } + + b->cursor = nir_before_instr(&intrin->instr); /* the replacement is emitted right before the original load */ + + for (unsigned k = 0; k < array_length; k++) + indirect_per_vertex->mask |= (1 << k); /* keep the setup of every element this input array can address */ + + nir_def *zero = nir_imm_int(b, 0); + nir_def *array_indirect_def = nir_load_deref( + b, + nir_build_deref_array(b, indirect_per_vertex->array_indirect_deref, array_index)); + nir_def *load = nir_load_r600_indirect_per_vertex_input(b, + intrin->def.num_components, + intrin->def.bit_size, + array_indirect_def, + zero); /* first source: value read from the local array at array_index; second source: constant zero */ + + nir_intrinsic_set_base(nir_instr_as_intrinsic(load->parent_instr), + var->data.driver_location); /* only BASE is assigned explicitly in this function */ + + nir_def_rewrite_uses(&intrin->def, load); + nir_instr_remove(&intrin->instr); + + return true; +} + +static bool +r600_gs_load_deref_io_to_indirect_per_vertex_input(nir_shader *shader) +{ /* Runs the callback above over every intrinsic of the shader, then removes the setup instructions of unreferenced array elements and the recorded source derefs. Returns the progress flag of nir_shader_intrinsics_pass. */ + struct indirect_per_vertex indirect_per_vertex = {nullptr}; + 
bool ret = + nir_shader_intrinsics_pass(shader, + r600_nir_gs_load_deref_io_to_indirect_per_vertex_input, + nir_metadata_control_flow, + &indirect_per_vertex); + + if (indirect_per_vertex.array_indirect_deref) { + for (unsigned k = 0; k < R600_GS_VERTEX_INDIRECT_TOTAL; k++) + if ((indirect_per_vertex.mask & (1 << k)) == 0) { /* element k is not needed: remove its setup, users first */ + nir_instr_remove(indirect_per_vertex.saved_for_removal[k][3]); + nir_instr_remove(indirect_per_vertex.saved_for_removal[k][2]); + nir_instr_remove(indirect_per_vertex.saved_for_removal[k][1]); + nir_instr_remove(indirect_per_vertex.saved_for_removal[k][0]); + } + + for (unsigned k = 0; k < indirect_per_vertex.obsolete_deref_count; k++) + nir_instr_remove(indirect_per_vertex.obsolete_deref[k]); /* NOTE(review): assumes each recorded deref has no users left once its loads were rewritten; confirm */ + } + + return ret; +} + void r600_finalize_nir_common(nir_shader *nir, enum amd_gfx_level gfx_level) { const int nir_lower_flrp_mask = 16 | 32 | 64; + if (nir->info.stage == MESA_SHADER_GEOMETRY) { /* geometry shaders only; scheduled ahead of the lowering passes that follow in this function */ + NIR_PASS(_, nir, r600_gs_load_deref_io_to_indirect_per_vertex_input); + } + NIR_PASS(_, nir, nir_lower_flrp, nir_lower_flrp_mask, false); nir_lower_idiv_options idiv_options = {0}; diff --git a/src/gallium/drivers/r600/sfn/sfn_shader_gs.cpp b/src/gallium/drivers/r600/sfn/sfn_shader_gs.cpp index 883261f1f21..679098fa481 100644 --- a/src/gallium/drivers/r600/sfn/sfn_shader_gs.cpp +++ b/src/gallium/drivers/r600/sfn/sfn_shader_gs.cpp @@ -29,7 +29,10 @@ GeometryShader::do_scan_instruction(nir_instr *instr) case nir_intrinsic_store_output: return process_store_output(ii); case nir_intrinsic_load_per_vertex_input: + case nir_intrinsic_load_r600_indirect_per_vertex_input: return process_load_input(ii); + case nir_intrinsic_r600_indirect_vertex_at_index: + return true; /* NOTE(review): nothing is recorded for this intrinsic during scanning; presumably true means handled, verify */ default: return false; } @@ -165,7 +168,11 @@ GeometryShader::process_stage_intrinsic(nir_intrinsic_instr *intr) case nir_intrinsic_load_invocation_id: return emit_simple_mov(intr->def, 0, m_invocation_id); case nir_intrinsic_load_per_vertex_input: - return emit_load_per_vertex_input(intr); + return 
emit_load_per_vertex_input_direct(intr); + case nir_intrinsic_load_r600_indirect_per_vertex_input: + return emit_load_per_vertex_input_indirect(intr); + case nir_intrinsic_r600_indirect_vertex_at_index: + return emit_indirect_vertex_at_index(intr); default:; } return false; @@ -294,7 +301,43 @@ GeometryShader::store_output(nir_intrinsic_instr *instr) } bool -GeometryShader::emit_load_per_vertex_input(nir_intrinsic_instr *instr) +GeometryShader::emit_indirect_vertex_at_index(nir_intrinsic_instr *instr) +{ /* Emits a mov of m_per_vertex_offsets[index] into the destination; the index source must be a constant below R600_GS_VERTEX_INDIRECT_TOTAL, which is checked by asserts only. */ + auto dest = value_factory().dest(instr->def, 0, pin_free); + auto literal_index = nir_src_as_const_value(instr->src[0]); + + assert(literal_index); + assert(literal_index->u32 < R600_GS_VERTEX_INDIRECT_TOTAL); + + auto addr = m_per_vertex_offsets[literal_index->u32]; + + auto ir = new AluInstr(op1_mov, dest, addr, AluInstr::write); + emit_instruction(ir); + + return true; +} + +bool +GeometryShader::emit_load_per_vertex_input_direct(nir_intrinsic_instr *instr) +{ /* Constant-index load: picks the address from m_per_vertex_offsets. NOTE(review): a non-constant index used to log an error and return false; it is now caught by an assert only. */ + auto literal_index = nir_src_as_const_value(instr->src[0]); + assert(literal_index); + assert(literal_index->u32 < R600_GS_VERTEX_INDIRECT_TOTAL); + assert(nir_intrinsic_io_semantics(instr).num_slots == 1); + + return load_per_vertex_input_at_addr(instr, m_per_vertex_offsets[literal_index->u32]); +} + +bool +GeometryShader::emit_load_per_vertex_input_indirect(nir_intrinsic_instr *instr) +{ /* Runtime-index load: the address is the first source taken as a register. NOTE(review): the as_register() result is forwarded unchecked; confirm the source is always a register here. */ + return load_per_vertex_input_at_addr( + instr, + value_factory().src(instr->src[0], 0)->as_register()); +} + +bool +GeometryShader::load_per_vertex_input_at_addr(nir_intrinsic_instr *instr, PRegister addr) /* Shared tail of the direct and indirect loads; addr is forwarded to LoadFromBuffer. */ { auto dest = value_factory().dest_vec4(instr->def, pin_group); @@ -303,19 +346,9 @@ GeometryShader::emit_load_per_vertex_input(nir_intrinsic_instr *instr) dest_swz[i] = i + nir_intrinsic_component(instr); } - auto literal_index = nir_src_as_const_value(instr->src[0]); - - if (!literal_index) { - sfn_log << SfnLog::err << "GS: Indirect input addressing not (yet) supported\n"; - return false; - } - 
assert(literal_index->u32 < R600_GS_VERTEX_INDIRECT_TOTAL); - assert(nir_intrinsic_io_semantics(instr).num_slots == 1); - EVTXDataFormat fmt = chip_class() >= ISA_CC_EVERGREEN ? fmt_invalid : fmt_32_32_32_32_float; - auto addr = m_per_vertex_offsets[literal_index->u32]; auto fetch = new LoadFromBuffer(dest, dest_swz, addr, diff --git a/src/gallium/drivers/r600/sfn/sfn_shader_gs.h b/src/gallium/drivers/r600/sfn/sfn_shader_gs.h index 17f0ecd2f6e..fe3d590c125 100644 --- a/src/gallium/drivers/r600/sfn/sfn_shader_gs.h +++ b/src/gallium/drivers/r600/sfn/sfn_shader_gs.h @@ -34,7 +34,13 @@ private: void emit_adj_fix(); - bool emit_load_per_vertex_input(nir_intrinsic_instr *instr); + bool emit_indirect_vertex_at_index(nir_intrinsic_instr *instr); /* handles r600_indirect_vertex_at_index */ + + bool emit_load_per_vertex_input_direct(nir_intrinsic_instr *instr); /* handles load_per_vertex_input */ + + bool emit_load_per_vertex_input_indirect(nir_intrinsic_instr *instr); /* handles load_r600_indirect_per_vertex_input */ + + bool load_per_vertex_input_at_addr(nir_intrinsic_instr *instr, PRegister addr); /* common fetch emission for both load variants */ bool load_input(UNUSED nir_intrinsic_instr *intr) override {