diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index 0e0373b03ab..99290c0ace9 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -1622,6 +1622,10 @@ intrinsic("load_local_shared_r600", src_comp=[0], dest_comp=0, indices = [], fla
 store("local_shared_r600", [1], [WRITE_MASK])
 store("tf_r600", [])
 
+# These two definitions are aimed at r600 indirect per_vertex_input accesses.
+intrinsic("r600_indirect_vertex_at_index", dest_comp=1, src_comp=[1], flags=[CAN_ELIMINATE, CAN_REORDER])
+load("r600_indirect_per_vertex_input", [1, 1], [BASE, RANGE, COMPONENT, DEST_TYPE, IO_SEMANTICS], [CAN_ELIMINATE, CAN_REORDER])
+
 # AMD GCN/RDNA specific intrinsics
 
 # This barrier is a hint that prevents moving the instruction that computes
diff --git a/src/gallium/drivers/r600/sfn/sfn_nir.cpp b/src/gallium/drivers/r600/sfn/sfn_nir.cpp
index 5838f818655..4680ceea76d 100644
--- a/src/gallium/drivers/r600/sfn/sfn_nir.cpp
+++ b/src/gallium/drivers/r600/sfn/sfn_nir.cpp
@@ -697,11 +697,164 @@ r600_lower_to_scalar_instr_filter(const nir_instr *instr, const void *)
    }
 }
 
+struct indirect_per_vertex {
+   nir_deref_instr *array_indirect_deref;
+   uint32_t mask;
+   nir_instr *saved_for_removal[R600_GS_VERTEX_INDIRECT_TOTAL][4];
+   unsigned obsolete_deref_count;
+   nir_instr *obsolete_deref[32];
+};
+
+static bool
+r600_nir_gs_load_deref_io_to_indirect_per_vertex_input(nir_builder *b,
+                                                       nir_intrinsic_instr *intrin,
+                                                       void *cb_data)
+{
+   struct indirect_per_vertex *indirect_per_vertex =
+      (struct indirect_per_vertex *)cb_data;
+   unsigned j;
+
+   if (intrin->intrinsic != nir_intrinsic_load_deref)
+      return false;
+
+   nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
+
+   if (!nir_deref_mode_is_one_of(deref, nir_var_shader_in))
+      return false;
+
+   nir_variable *var = nir_deref_instr_get_variable(deref);
+   const bool is_arrayed = nir_is_arrayed_io(var, b->shader->info.stage);
+
+   if (!is_arrayed || var->data.location != VARYING_SLOT_POS)
+      return false;
+
+   nir_def *array_index = deref->arr.index.ssa;
+
+   if (!array_index)
+      return false;
+
+   assert(intrin->def.num_components == 4);
+
+   nir_deref_instr *original_array = nir_instr_as_deref(
+      nir_instr_as_deref(intrin->src[0].ssa->parent_instr)->parent.ssa->parent_instr);
+
+   if (!original_array || original_array->deref_type != nir_deref_type_var ||
+       !glsl_type_is_array(original_array->type))
+      return false;
+
+   auto element_type = glsl_without_array(original_array->type);
+
+   if (element_type != &glsl_type_builtin_vec4)
+      return false;
+
+   const unsigned array_length = glsl_get_length(original_array->type);
+
+   assert(array_length <= R600_GS_VERTEX_INDIRECT_TOTAL);
+
+   for (j = 0; j < indirect_per_vertex->obsolete_deref_count &&
+               j < ARRAY_SIZE(indirect_per_vertex->obsolete_deref);
+        j++)
+      if (intrin->src[0].ssa->parent_instr == indirect_per_vertex->obsolete_deref[j])
+         break;
+
+   if (j == indirect_per_vertex->obsolete_deref_count &&
+       j != ARRAY_SIZE(indirect_per_vertex->obsolete_deref)) {
+      indirect_per_vertex->obsolete_deref[j] = intrin->src[0].ssa->parent_instr;
+      indirect_per_vertex->obsolete_deref_count++;
+   }
+
+   /* The next block generates the local array that the indirect access
+    * needs. The array is emitted once at the beginning of the shader with
+    * all possible elements; elements that end up unreferenced are removed
+    * at the end of the pass.
+    */
+   if (!indirect_per_vertex->array_indirect_deref) {
+      static const char array_indirect_name[] = "r600_indirect_vertex_at_index";
+
+      b->cursor = nir_before_block(nir_start_block(b->impl));
+
+      nir_variable *array_indirect_var = nir_local_variable_create(
+         b->impl,
+         glsl_array_type(glsl_int_type(), R600_GS_VERTEX_INDIRECT_TOTAL, 0),
+         array_indirect_name);
+      indirect_per_vertex->array_indirect_deref =
+         nir_build_deref_var(b, array_indirect_var);
+
+      for (unsigned k = 0; k < R600_GS_VERTEX_INDIRECT_TOTAL; k++) {
+         nir_def *build_count = nir_imm_int(b, k);
+         nir_deref_instr *build_array =
+            nir_build_deref_array(b,
+                                  indirect_per_vertex->array_indirect_deref,
+                                  build_count);
+         nir_def *build_store =
+            nir_r600_indirect_vertex_at_index(b, intrin->def.bit_size, build_count);
+         nir_store_deref(b, build_array, build_store, 1);
+         indirect_per_vertex->saved_for_removal[k][0] = build_count->parent_instr;
+         indirect_per_vertex->saved_for_removal[k][1] = &build_array->instr;
+         indirect_per_vertex->saved_for_removal[k][2] = build_store->parent_instr;
+         indirect_per_vertex->saved_for_removal[k][3] =
+            nir_instr_next(build_store->parent_instr); /* the nir_store_deref */
+      }
+   }
+
+   b->cursor = nir_before_instr(&intrin->instr);
+
+   for (unsigned k = 0; k < array_length; k++)
+      indirect_per_vertex->mask |= (1 << k);
+
+   nir_def *zero = nir_imm_int(b, 0);
+   nir_def *array_indirect_def = nir_load_deref(
+      b,
+      nir_build_deref_array(b, indirect_per_vertex->array_indirect_deref, array_index));
+   nir_def *load = nir_load_r600_indirect_per_vertex_input(b,
+                                                           intrin->def.num_components,
+                                                           intrin->def.bit_size,
+                                                           array_indirect_def,
+                                                           zero);
+
+   nir_intrinsic_set_base(nir_instr_as_intrinsic(load->parent_instr),
+                          var->data.driver_location);
+
+   nir_def_rewrite_uses(&intrin->def, load);
+   nir_instr_remove(&intrin->instr);
+
+   return true;
+}
+
+static bool
+r600_gs_load_deref_io_to_indirect_per_vertex_input(nir_shader *shader)
+{
+   struct indirect_per_vertex indirect_per_vertex = {nullptr};
+   bool ret =
+      nir_shader_intrinsics_pass(shader,
+                                 r600_nir_gs_load_deref_io_to_indirect_per_vertex_input,
+                                 nir_metadata_control_flow,
+                                 &indirect_per_vertex);
+
+   if (indirect_per_vertex.array_indirect_deref) {
+      for (unsigned k = 0; k < R600_GS_VERTEX_INDIRECT_TOTAL; k++)
+         if ((indirect_per_vertex.mask & (1 << k)) == 0) {
+            nir_instr_remove(indirect_per_vertex.saved_for_removal[k][3]);
+            nir_instr_remove(indirect_per_vertex.saved_for_removal[k][2]);
+            nir_instr_remove(indirect_per_vertex.saved_for_removal[k][1]);
+            nir_instr_remove(indirect_per_vertex.saved_for_removal[k][0]);
+         }
+
+      for (unsigned k = 0; k < indirect_per_vertex.obsolete_deref_count; k++)
+         nir_instr_remove(indirect_per_vertex.obsolete_deref[k]);
+   }
+
+   return ret;
+}
+
 void
 r600_finalize_nir_common(nir_shader *nir, enum amd_gfx_level gfx_level)
 {
    const int nir_lower_flrp_mask = 16 | 32 | 64;
 
+   if (nir->info.stage == MESA_SHADER_GEOMETRY) {
+      NIR_PASS(_, nir, r600_gs_load_deref_io_to_indirect_per_vertex_input);
+   }
+
    NIR_PASS(_, nir, nir_lower_flrp, nir_lower_flrp_mask, false);
 
    nir_lower_idiv_options idiv_options = {0};
diff --git a/src/gallium/drivers/r600/sfn/sfn_shader_gs.cpp b/src/gallium/drivers/r600/sfn/sfn_shader_gs.cpp
index 883261f1f21..679098fa481 100644
--- a/src/gallium/drivers/r600/sfn/sfn_shader_gs.cpp
+++ b/src/gallium/drivers/r600/sfn/sfn_shader_gs.cpp
@@ -29,7 +29,10 @@ GeometryShader::do_scan_instruction(nir_instr *instr)
    case nir_intrinsic_store_output:
       return process_store_output(ii);
    case nir_intrinsic_load_per_vertex_input:
+   case nir_intrinsic_load_r600_indirect_per_vertex_input:
       return process_load_input(ii);
+   case nir_intrinsic_r600_indirect_vertex_at_index:
+      return true;
    default:
       return false;
    }
@@ -165,7 +168,11 @@ GeometryShader::process_stage_intrinsic(nir_intrinsic_instr *intr)
    case nir_intrinsic_load_invocation_id:
      return emit_simple_mov(intr->def, 0, m_invocation_id);
    case nir_intrinsic_load_per_vertex_input:
-      return emit_load_per_vertex_input(intr);
+      return emit_load_per_vertex_input_direct(intr);
+   case nir_intrinsic_load_r600_indirect_per_vertex_input:
+      return emit_load_per_vertex_input_indirect(intr);
+   case nir_intrinsic_r600_indirect_vertex_at_index:
+      return emit_indirect_vertex_at_index(intr);
    default:;
    }
    return false;
@@ -294,7 +301,43 @@ GeometryShader::store_output(nir_intrinsic_instr *instr)
 }
 
 bool
-GeometryShader::emit_load_per_vertex_input(nir_intrinsic_instr *instr)
+GeometryShader::emit_indirect_vertex_at_index(nir_intrinsic_instr *instr)
+{
+   auto dest = value_factory().dest(instr->def, 0, pin_free);
+   auto literal_index = nir_src_as_const_value(instr->src[0]);
+
+   assert(literal_index);
+   assert(literal_index->u32 < R600_GS_VERTEX_INDIRECT_TOTAL);
+
+   auto addr = m_per_vertex_offsets[literal_index->u32];
+
+   auto ir = new AluInstr(op1_mov, dest, addr, AluInstr::write);
+   emit_instruction(ir);
+
+   return true;
+}
+
+bool
+GeometryShader::emit_load_per_vertex_input_direct(nir_intrinsic_instr *instr)
+{
+   auto literal_index = nir_src_as_const_value(instr->src[0]);
+   assert(literal_index);
+   assert(literal_index->u32 < R600_GS_VERTEX_INDIRECT_TOTAL);
+   assert(nir_intrinsic_io_semantics(instr).num_slots == 1);
+
+   return load_per_vertex_input_at_addr(instr, m_per_vertex_offsets[literal_index->u32]);
+}
+
+bool
+GeometryShader::emit_load_per_vertex_input_indirect(nir_intrinsic_instr *instr)
+{
+   return load_per_vertex_input_at_addr(
+      instr,
+      value_factory().src(instr->src[0], 0)->as_register());
+}
+
+bool
+GeometryShader::load_per_vertex_input_at_addr(nir_intrinsic_instr *instr, PRegister addr)
 {
    auto dest = value_factory().dest_vec4(instr->def, pin_group);
 
@@ -303,19 +346,9 @@
       dest_swz[i] = i + nir_intrinsic_component(instr);
    }
 
-   auto literal_index = nir_src_as_const_value(instr->src[0]);
-
-   if (!literal_index) {
-      sfn_log << SfnLog::err << "GS: Indirect input addressing not (yet) supported\n";
-      return false;
-   }
-   assert(literal_index->u32 < R600_GS_VERTEX_INDIRECT_TOTAL);
-   assert(nir_intrinsic_io_semantics(instr).num_slots == 1);
-
    EVTXDataFormat fmt =
       chip_class() >= ISA_CC_EVERGREEN ? fmt_invalid : fmt_32_32_32_32_float;
 
-   auto addr = m_per_vertex_offsets[literal_index->u32];
    auto fetch = new LoadFromBuffer(dest,
                                    dest_swz,
                                    addr,
diff --git a/src/gallium/drivers/r600/sfn/sfn_shader_gs.h b/src/gallium/drivers/r600/sfn/sfn_shader_gs.h
index 17f0ecd2f6e..fe3d590c125 100644
--- a/src/gallium/drivers/r600/sfn/sfn_shader_gs.h
+++ b/src/gallium/drivers/r600/sfn/sfn_shader_gs.h
@@ -34,7 +34,13 @@ private:
 
    void emit_adj_fix();
 
-   bool emit_load_per_vertex_input(nir_intrinsic_instr *instr);
+   bool emit_indirect_vertex_at_index(nir_intrinsic_instr *instr);
+
+   bool emit_load_per_vertex_input_direct(nir_intrinsic_instr *instr);
+
+   bool emit_load_per_vertex_input_indirect(nir_intrinsic_instr *instr);
+
+   bool load_per_vertex_input_at_addr(nir_intrinsic_instr *instr, PRegister addr);
 
    bool load_input(UNUSED nir_intrinsic_instr *intr) override {
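Note: for reference, a minimal GLSL geometry shader of the kind that exercises the
new indirect path (illustrative only, not part of the patch; the uniform name
vertex_select is made up). The non-constant index makes the gl_in[] access
indirect, which the sfn backend previously rejected with "GS: Indirect input
addressing not (yet) supported":

    #version 150
    layout(triangles) in;
    layout(points, max_vertices = 1) out;

    /* A non-constant, dynamically uniform vertex index forces an
     * indirect per-vertex input access. */
    uniform int vertex_select;

    void main()
    {
       /* gl_Position read through a variable vertex index is rewritten by
        * r600_gs_load_deref_io_to_indirect_per_vertex_input into
        * load_r600_indirect_per_vertex_input; a constant index still takes
        * the emit_load_per_vertex_input_direct path. */
       gl_Position = gl_in[vertex_select].gl_Position;
       EmitVertex();
       EndPrimitive();
    }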