From 86782156e72c1fd2e49fcbc7e301fce2b2b1f84b Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Fri, 2 Feb 2024 19:36:59 -0400 Subject: [PATCH] asahi: invert geometry shaders instead of dumping GS outputs to memory and using a GS copy vertex shader, invert the GS to get a hardware vertex shader to use directly. theoretically, reduces mem bandwidth at the cost of repeated work if we don't optimize well. perhaps more importantly, it should reduce heap usage. Signed-off-by: Alyssa Rosenzweig Part-of: --- src/asahi/lib/agx_nir_lower_gs.c | 435 +++++++++++++++++--------- src/asahi/lib/shaders/geometry.cl | 37 ++- src/asahi/lib/shaders/geometry.h | 15 +- src/gallium/drivers/asahi/agx_state.c | 4 +- 4 files changed, 328 insertions(+), 163 deletions(-) diff --git a/src/asahi/lib/agx_nir_lower_gs.c b/src/asahi/lib/agx_nir_lower_gs.c index 1189fcc9d6d..ccc8137efef 100644 --- a/src/asahi/lib/agx_nir_lower_gs.c +++ b/src/asahi/lib/agx_nir_lower_gs.c @@ -10,6 +10,7 @@ #include "gallium/include/pipe/p_defines.h" #include "shaders/geometry.h" #include "util/bitscan.h" +#include "util/list.h" #include "util/macros.h" #include "util/ralloc.h" #include "util/u_math.h" @@ -17,9 +18,16 @@ #include "nir.h" #include "nir_builder_opcodes.h" #include "nir_intrinsics.h" +#include "nir_intrinsics_indices.h" #include "nir_xfb_info.h" #include "shader_enums.h" +/* Marks a transform feedback store, which must not be stripped from the + * prepass since that's where the transform feedback happens. Chosen as a + * vendored flag not to alias other flags we'll see. 
+ */ +#define ACCESS_XFB (ACCESS_IS_SWIZZLED_AMD) + enum gs_counter { GS_COUNTER_VERTICES = 0, GS_COUNTER_PRIMITIVES, @@ -33,10 +41,6 @@ struct lower_gs_state { int static_count[GS_NUM_COUNTERS][MAX_VERTEX_STREAMS]; nir_variable *outputs[NUM_TOTAL_VARYING_SLOTS][MAX_PRIM_OUT_SIZE]; - /* For the geometry output buffer */ - unsigned stride_B; - unsigned offset_B[NUM_TOTAL_VARYING_SLOTS]; - /* The count buffer contains `count_stride_el` 32-bit words in a row for each * input primitive, for `input_primitives * count_stride_el * 4` total bytes. */ @@ -52,21 +56,6 @@ struct lower_gs_state { bool rasterizer_discard; }; -static uint64_t -outputs_rasterized(nir_shader *s) -{ - uint64_t outputs = s->info.outputs_written; - - /* Optimize out pointless gl_PointSize outputs. Bizarrely, these occur. We - * need to preserve the transform feedback portion of the write, but we don't - * bother saving for rasterization. - */ - if (s->info.gs.output_primitive != MESA_PRIM_POINTS) - outputs &= ~VARYING_BIT_PSIZ; - - return outputs; -} - /* Helpers for loading from the geometry state buffer */ static nir_def * load_geometry_param_offset(nir_builder *b, uint32_t offset, uint8_t bytes) @@ -131,18 +120,11 @@ add_counter(nir_builder *b, nir_def *counter, nir_def *increment) } /* Helpers for lowering I/O to variables */ -bool -agx_lower_output_to_var(nir_builder *b, nir_instr *instr, void *data) +static void +lower_store_to_var(nir_builder *b, nir_intrinsic_instr *intr, + struct agx_lower_output_to_var_state *state) { - struct agx_lower_output_to_var_state *state = data; - if (instr->type != nir_instr_type_intrinsic) - return false; - - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - if (intr->intrinsic != nir_intrinsic_store_output) - return false; - - b->cursor = nir_instr_remove(instr); + b->cursor = nir_instr_remove(&intr->instr); nir_io_semantics sem = nir_intrinsic_io_semantics(intr); unsigned component = nir_intrinsic_component(intr); nir_def *value = intr->src[0].ssa; 
@@ -156,7 +138,7 @@ agx_lower_output_to_var(nir_builder *b, nir_instr *instr, void *data) if (!var) { assert(sem.location == VARYING_SLOT_PSIZ && "otherwise in outputs_written"); - return true; + return; } unsigned nr_components = glsl_get_components(glsl_without_array(var->type)); @@ -167,6 +149,19 @@ agx_lower_output_to_var(nir_builder *b, nir_instr *instr, void *data) component); nir_store_var(b, var, value, BITFIELD_BIT(component)); +} + +bool +agx_lower_output_to_var(nir_builder *b, nir_instr *instr, void *data) +{ + if (instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_store_output) + return false; + + lower_store_to_var(b, intr, data); return true; } @@ -237,6 +232,34 @@ calc_unrolled_id(nir_builder *b) load_primitive_id(b)); } +static unsigned +output_vertex_id_stride(nir_shader *gs) +{ + /* round up to power of two for cheap multiply/division */ + return util_next_power_of_two(MAX2(gs->info.gs.vertices_out, 1)); +} + +/* Variant of calc_unrolled_id that uses a power-of-two stride for indices. This + * is sparser (acceptable for index buffer values, not for count buffer + * indices). It has the nice property of being cheap to invert, unlike + * calc_unrolled_id. So, we use calc_unrolled_id for count buffers and + * calc_unrolled_index_id for index values. + * + * This also multiplies by the appropriate stride to calculate the final index + * base value. 
+ */ +static nir_def * +calc_unrolled_index_id(nir_builder *b) +{ + unsigned vertex_stride = output_vertex_id_stride(b->shader); + nir_def *primitives_log2 = load_geometry_param(b, primitives_log2); + + nir_def *instance = nir_ishl(b, load_instance_id(b), primitives_log2); + nir_def *prim = nir_iadd(b, instance, load_primitive_id(b)); + + return nir_imul_imm(b, prim, vertex_stride); +} + static nir_def * load_count_address(nir_builder *b, struct lower_gs_state *state, nir_def *unrolled_id, unsigned stream, @@ -355,40 +378,153 @@ agx_nir_create_geometry_count_shader(nir_shader *gs, const nir_shader *libagx, return shader; } +struct lower_gs_rast_state { + nir_def *instance_id, *primitive_id, *output_id; + struct agx_lower_output_to_var_state outputs; + struct agx_lower_output_to_var_state selected; +}; + +static void +select_rast_output(nir_builder *b, nir_intrinsic_instr *intr, + struct lower_gs_rast_state *state) +{ + b->cursor = nir_instr_remove(&intr->instr); + + /* We only care about the rasterization stream in the rasterization + * shader, so just ignore emits from other streams. 
+ */ + if (nir_intrinsic_stream_id(intr) != 0) + return; + + u_foreach_bit64(slot, b->shader->info.outputs_written) { + nir_def *orig = nir_load_var(b, state->selected.outputs[slot]); + nir_def *data = nir_load_var(b, state->outputs.outputs[slot]); + + nir_def *value = nir_bcsel( + b, nir_ieq(b, intr->src[0].ssa, state->output_id), data, orig); + + nir_store_var(b, state->selected.outputs[slot], value, + nir_component_mask(value->num_components)); + } +} + +static bool +lower_to_gs_rast(nir_builder *b, nir_intrinsic_instr *intr, void *data) +{ + struct lower_gs_rast_state *state = data; + + switch (intr->intrinsic) { + case nir_intrinsic_store_output: + lower_store_to_var(b, intr, &state->outputs); + return true; + + case nir_intrinsic_emit_vertex_with_counter: + select_rast_output(b, intr, state); + return true; + + case nir_intrinsic_load_primitive_id: + nir_def_rewrite_uses(&intr->def, state->primitive_id); + return true; + + case nir_intrinsic_load_instance_id: + nir_def_rewrite_uses(&intr->def, state->instance_id); + return true; + + case nir_intrinsic_load_num_vertices: { + b->cursor = nir_before_instr(&intr->instr); + nir_def_rewrite_uses(&intr->def, load_geometry_param(b, gs_grid[0])); + return true; + } + + case nir_intrinsic_load_flat_mask: + case nir_intrinsic_load_provoking_last: + /* Lowering the same in both GS variants */ + return lower_id(b, intr, data); + + case nir_intrinsic_end_primitive_with_counter: + case nir_intrinsic_set_vertex_and_primitive_count: + nir_instr_remove(&intr->instr); + return true; + + default: + return false; + } +} + /* - * Create a GS copy shader. This is a hardware vertex shader that copies each - * vertex from the geometry output buffer to the Unified Vertex Store. + * Create a GS rasterization shader. This is a hardware vertex shader that + * shades each rasterized output vertex in parallel. 
*/ static nir_shader * -agx_nir_create_gs_copy_shader(struct lower_gs_state *state, - uint64_t outputs_written, - unsigned clip_distance_array_size, - unsigned cull_distance_array_size, - enum mesa_prim output_primitive) +agx_nir_create_gs_rast_shader(const nir_shader *gs, const nir_shader *libagx) { - nir_builder b_ = nir_builder_init_simple_shader(MESA_SHADER_VERTEX, - &agx_nir_options, "GS copy"); + /* Don't muck up the original shader */ + nir_shader *shader = nir_shader_clone(NULL, gs); + + unsigned max_verts = output_vertex_id_stride(shader); + + /* Turn into a vertex shader run only for rasterization. Transform feedback + * was handled in the prepass. + */ + shader->info.stage = MESA_SHADER_VERTEX; + shader->info.has_transform_feedback_varyings = false; + memset(&shader->info.vs, 0, sizeof(shader->info.vs)); + shader->xfb_info = NULL; + + if (shader->info.name) { + shader->info.name = ralloc_asprintf(shader, "%s_rast", shader->info.name); + } else { + shader->info.name = "gs rast"; + } + + nir_builder b_ = + nir_builder_at(nir_before_impl(nir_shader_get_entrypoint(shader))); nir_builder *b = &b_; - b->shader->info.clip_distance_array_size = clip_distance_array_size; - b->shader->info.cull_distance_array_size = cull_distance_array_size; + /* Optimize out pointless gl_PointSize outputs. Bizarrely, these occur. 
*/ + if (shader->info.gs.output_primitive != MESA_PRIM_POINTS) + shader->info.outputs_written &= ~VARYING_BIT_PSIZ; - /* Get the base for this vertex */ - nir_def *vert_offs = nir_imul_imm(b, nir_load_vertex_id(b), state->stride_B); + /* See calc_unrolled_index_id */ + nir_def *raw_id = nir_load_vertex_id(b); + nir_def *output_id = nir_umod_imm(b, raw_id, max_verts); + nir_def *unrolled = nir_udiv_imm(b, raw_id, max_verts); - nir_def *state_buffer = load_geometry_param(b, output_buffer); + nir_def *primitives_log2 = load_geometry_param(b, primitives_log2); + nir_def *instance_id = nir_ushr(b, unrolled, primitives_log2); + nir_def *primitive_id = nir_iand( + b, unrolled, + nir_iadd_imm(b, nir_ishl(b, nir_imm_int(b, 1), primitives_log2), -1)); - /* Each output must be copied */ - u_foreach_bit64(slot, outputs_written) { - assert(state->outputs[slot][0] != NULL); + struct lower_gs_rast_state rast_state = { + .instance_id = instance_id, + .primitive_id = primitive_id, + .output_id = output_id, + }; - nir_def *addr = nir_iadd( - b, state_buffer, - nir_u2u64(b, nir_iadd_imm(b, vert_offs, state->offset_B[slot]))); + u_foreach_bit64(slot, shader->info.outputs_written) { + const char *slot_name = + gl_varying_slot_name_for_stage(slot, MESA_SHADER_GEOMETRY); - unsigned components = glsl_get_components(state->outputs[slot][0]->type); + rast_state.outputs.outputs[slot] = nir_variable_create( + shader, nir_var_shader_temp, glsl_vector_type(GLSL_TYPE_UINT, 4), + ralloc_asprintf(shader, "%s-temp", slot_name)); - nir_def *value = nir_load_global_constant(b, addr, 4, components, 32); + rast_state.selected.outputs[slot] = nir_variable_create( + shader, nir_var_shader_temp, glsl_vector_type(GLSL_TYPE_UINT, 4), + ralloc_asprintf(shader, "%s-selected", slot_name)); + } + + nir_shader_intrinsics_pass(shader, lower_to_gs_rast, + nir_metadata_block_index | nir_metadata_dominance, + &rast_state); + + b->cursor = nir_after_impl(b->impl); + + /* Forward each selected output to the rasterizer 
*/ + u_foreach_bit64(slot, shader->info.outputs_written) { + assert(rast_state.selected.outputs[slot] != NULL); + nir_def *value = nir_load_var(b, rast_state.selected.outputs[slot]); /* We set NIR_COMPACT_ARRAYS so clip/cull distance needs to come all in * DIST0. Undo the offset if we need to. @@ -400,9 +536,7 @@ agx_nir_create_gs_copy_shader(struct lower_gs_state *state, nir_store_output(b, value, nir_imm_int(b, offset), .io_semantics.location = slot - offset, .io_semantics.num_slots = 1, - .write_mask = nir_component_mask(components)); - - b->shader->info.outputs_written |= BITFIELD64_BIT(slot); + .write_mask = nir_component_mask(value->num_components)); } /* In OpenGL ES, it is legal to omit the point size write from the geometry @@ -413,21 +547,24 @@ agx_nir_create_gs_copy_shader(struct lower_gs_state *state, * * This should not be load bearing for other APIs, but should be harmless. */ - bool is_points = output_primitive == MESA_PRIM_POINTS; + bool is_points = gs->info.gs.output_primitive == MESA_PRIM_POINTS; - if (!(outputs_written & VARYING_BIT_PSIZ) && is_points) { + if (!(shader->info.outputs_written & VARYING_BIT_PSIZ) && is_points) { nir_store_output(b, nir_imm_float(b, 1.0), nir_imm_int(b, 0), .io_semantics.location = VARYING_SLOT_PSIZ, .io_semantics.num_slots = 1, .write_mask = nir_component_mask(1)); - b->shader->info.outputs_written |= VARYING_BIT_PSIZ; + shader->info.outputs_written |= VARYING_BIT_PSIZ; } - UNUSED struct agx_uncompiled_shader_info info; - agx_preprocess_nir(b->shader, NULL, false, &info); + nir_opt_idiv_const(shader, 16); - return b->shader; + /* Preprocess it */ + UNUSED struct agx_uncompiled_shader_info info; + agx_preprocess_nir(shader, libagx, false, &info); + + return shader; } static nir_def * @@ -486,8 +623,9 @@ static void lower_end_primitive(nir_builder *b, nir_intrinsic_instr *intr, struct lower_gs_state *state) { - assert(b->shader->info.gs.output_primitive != MESA_PRIM_POINTS && - "should've been removed"); + 
assert((intr->intrinsic == nir_intrinsic_set_vertex_and_primitive_count || + b->shader->info.gs.output_primitive != MESA_PRIM_POINTS) && + "endprimitive for points should've been removed"); /* The GS is the last stage before rasterization, so if we discard the * rasterization, we don't output an index buffer, nothing will read it. @@ -497,10 +635,13 @@ lower_end_primitive(nir_builder *b, nir_intrinsic_instr *intr, if (state->rasterizer_discard || stream != 0) return; - libagx_end_primitive(b, load_geometry_param(b, output_index_buffer), - intr->src[0].ssa, intr->src[1].ssa, intr->src[2].ssa, - previous_vertices(b, state, 0, calc_unrolled_id(b)), - previous_primitives(b, state, 0, calc_unrolled_id(b))); + libagx_end_primitive( + b, load_geometry_param(b, output_index_buffer), intr->src[0].ssa, + intr->src[1].ssa, intr->src[2].ssa, + previous_vertices(b, state, 0, calc_unrolled_id(b)), + previous_primitives(b, state, 0, calc_unrolled_id(b)), + calc_unrolled_index_id(b), + nir_imm_bool(b, b->shader->info.gs.output_primitive != MESA_PRIM_POINTS)); } static unsigned @@ -582,9 +723,10 @@ write_xfb(nir_builder *b, struct lower_gs_state *state, unsigned stream, nir_imm_int(b, buffer), nir_imm_int(b, stride), nir_imm_int(b, output.offset)); - nir_store_global(b, addr, 4, - nir_channels(b, value, output.component_mask), - nir_component_mask(count)); + nir_build_store_global( + b, nir_channels(b, value, output.component_mask), addr, + .align_mul = 4, .write_mask = nir_component_mask(count), + .access = ACCESS_XFB); } } @@ -599,52 +741,6 @@ static void lower_emit_vertex(nir_builder *b, nir_intrinsic_instr *intr, struct lower_gs_state *state) { - nir_def *total_vertices = intr->src[0].ssa; - - /* All previous invocations are first in the geometry output buffer */ - unsigned stream = nir_intrinsic_stream_id(intr); - nir_def *invocation_vertex_base = - previous_vertices(b, state, stream, calc_unrolled_id(b)); - - /* Calculate the number of vertices that this invocation will 
produce. This - * is calculated by the count shader and then prefix summed, so calculate the - * difference to undo the action of the prefix sum. - */ - nir_def *next = previous_vertices(b, state, stream, - nir_iadd_imm(b, calc_unrolled_id(b), 1)); - nir_def *our_num_verts = nir_isub(b, next, invocation_vertex_base); - - /* We can only emit vertices within bounds, since other entries in the - * geometry state buffer might belong to other invocations. This is required - * to pass glsl-1.50-geometry-end-primitive (without geometry shaders racing - * each other). - * - * TODO: This could be optimized many ways. - */ - if (!state->rasterizer_discard && stream == 0) { - nir_if *nif = nir_push_if(b, nir_ult(b, total_vertices, our_num_verts)); - { - /* The index into the geometry output buffer */ - nir_def *vertex_id = - nir_iadd(b, invocation_vertex_base, total_vertices); - - nir_def *buffer = load_geometry_param(b, output_buffer); - nir_def *vertex_offset = nir_imul_imm(b, vertex_id, state->stride_B); - nir_def *vertex_addr = - nir_iadd(b, buffer, nir_u2u64(b, vertex_offset)); - - /* Copy each output where it belongs */ - u_foreach_bit64(slot, outputs_rasterized(b->shader)) { - nir_def *addr = nir_iadd_imm(b, vertex_addr, state->offset_B[slot]); - nir_def *value = nir_load_var(b, state->outputs[slot][0]); - unsigned comps = glsl_get_components(state->outputs[slot][0]->type); - - nir_store_global(b, addr, 4, value, nir_component_mask(comps)); - } - } - nir_pop_if(b, nif); - } - /* Transform feedback is written for each decomposed output primitive. Since * we're writing strips, that means we output XFB for each vertex after the * first complete primitive is formed. @@ -697,7 +793,13 @@ lower_gs_instr(nir_builder *b, nir_intrinsic_instr *intr, void *state) switch (intr->intrinsic) { case nir_intrinsic_set_vertex_and_primitive_count: - /* This instruction is only for the count shader, so just remove */ + /* This instruction is mostly for the count shader, so just remove. 
But + * for points, we write the index buffer here so the rast shader can map. + */ + if (b->shader->info.gs.output_primitive == MESA_PRIM_POINTS) { + lower_end_primitive(b, intr, state); + } + break; case nir_intrinsic_end_primitive_with_counter: { @@ -750,7 +852,7 @@ collect_components(nir_builder *b, nir_intrinsic_instr *intr, void *data) */ static nir_shader * agx_nir_create_pre_gs(struct lower_gs_state *state, const nir_shader *libagx, - bool indexed, struct nir_xfb_info *xfb, + bool indexed, bool restart, struct nir_xfb_info *xfb, unsigned vertices_per_prim, uint8_t streams, unsigned invocations) { @@ -763,11 +865,11 @@ agx_nir_create_pre_gs(struct lower_gs_state *state, const nir_shader *libagx, /* Setup the draw from the rasterization stream (0). */ if (!state->rasterizer_discard) { - libagx_build_gs_draw(b, nir_load_geometry_param_buffer_agx(b), - nir_imm_bool(b, indexed), - previous_vertices(b, state, 0, unrolled_in_prims), - previous_primitives(b, state, 0, unrolled_in_prims), - nir_imm_int(b, state->stride_B)); + libagx_build_gs_draw( + b, nir_load_geometry_param_buffer_agx(b), nir_imm_bool(b, indexed), + previous_vertices(b, state, 0, unrolled_in_prims), + restart ? previous_primitives(b, state, 0, unrolled_in_prims) + : nir_imm_int(b, 0)); } /* Determine the number of primitives generated in each stream */ @@ -984,6 +1086,39 @@ agx_nir_lower_gs_instancing(nir_shader *gs) index); } +static bool +strip_side_effects(nir_builder *b, nir_intrinsic_instr *intr, void *_) +{ + switch (intr->intrinsic) { + case nir_intrinsic_store_global: + case nir_intrinsic_global_atomic: + case nir_intrinsic_global_atomic_swap: + break; + default: + return false; + } + + /* If there's a side effect that's actually required for the prepass, we have + * to keep it in. + */ + if (nir_intrinsic_infos[intr->intrinsic].has_dest && + !list_is_empty(&intr->def.uses)) + return false; + + /* Do not strip transform feedback stores, the rasterization shader doesn't + * execute them. 
+ */ + if (intr->intrinsic == nir_intrinsic_store_global && + nir_intrinsic_access(intr) & ACCESS_XFB) + return false; + + /* Otherwise, remove the dead instruction. The rasterization shader will + * execute the side effect so the side effect still happens at least once. + */ + nir_instr_remove(&intr->instr); + return true; +} + static void link_libagx(nir_shader *nir, const nir_shader *libagx) { @@ -1068,22 +1203,10 @@ agx_nir_lower_gs(nir_shader *gs, const nir_shader *libagx, NIR_PASS(progress, gs, nir_opt_loop_unroll); } while (progress); - NIR_PASS(_, gs, nir_shader_intrinsics_pass, lower_id, - nir_metadata_block_index | nir_metadata_dominance, NULL); - - link_libagx(gs, libagx); - - NIR_PASS(_, gs, nir_lower_idiv, - &(const nir_lower_idiv_options){.allow_fp16 = true}); - - /* All those variables we created should've gone away by now */ - NIR_PASS(_, gs, nir_remove_dead_variables, nir_var_function_temp, NULL); - /* If we know counts at compile-time we can simplify, so try to figure out * the counts statically. 
*/ struct lower_gs_state gs_state = { - .stride_B = 0, .rasterizer_discard = rasterizer_discard, }; @@ -1102,6 +1225,19 @@ agx_nir_lower_gs(nir_shader *gs, const nir_shader *libagx, } } + *gs_copy = agx_nir_create_gs_rast_shader(gs, libagx); + + NIR_PASS(_, gs, nir_shader_intrinsics_pass, lower_id, + nir_metadata_block_index | nir_metadata_dominance, NULL); + + link_libagx(gs, libagx); + + NIR_PASS(_, gs, nir_lower_idiv, + &(const nir_lower_idiv_options){.allow_fp16 = true}); + + /* All those variables we created should've gone away by now */ + NIR_PASS(_, gs, nir_remove_dead_variables, nir_var_function_temp, NULL); + /* If there is any unknown count, we need a geometry count shader */ if (gs_state.count_stride_el > 0) *gs_count = agx_nir_create_geometry_count_shader(gs, libagx, &gs_state); @@ -1123,11 +1259,6 @@ agx_nir_lower_gs(nir_shader *gs, const nir_shader *libagx, } state.outputs[slot] = gs_state.outputs[slot][0]; - - /* Assume fp32 output */ - unsigned size_B = 4 * component_counts[slot]; - gs_state.offset_B[slot] = gs_state.stride_B; - gs_state.stride_B += size_B; } NIR_PASS(_, gs, nir_shader_instructions_pass, agx_lower_output_to_var, @@ -1136,6 +1267,13 @@ agx_nir_lower_gs(nir_shader *gs, const nir_shader *libagx, NIR_PASS(_, gs, nir_shader_intrinsics_pass, lower_gs_instr, nir_metadata_none, &gs_state); + /* Determine if we are guaranteed to rasterize at least one vertex, so that + * we can strip the prepass of side effects knowing they will execute in the + * rasterization shader. 
+ */ + bool rasterizes_at_least_one_vertex = + !rasterizer_discard && gs_state.static_count[0][0] > 0; + /* Clean up after all that lowering we did */ nir_lower_global_vars_to_local(gs); do { @@ -1151,6 +1289,15 @@ agx_nir_lower_gs(nir_shader *gs, const nir_shader *libagx, NIR_PASS(progress, gs, nir_opt_dead_cf); NIR_PASS(progress, gs, nir_opt_dce); NIR_PASS(progress, gs, nir_opt_loop_unroll); + + /* When rasterizing, we try to move side effects to the rasterizer shader + * and strip the prepass of the dead side effects. Run this in the opt + * loop because it interacts with nir_opt_dce. + */ + if (rasterizes_at_least_one_vertex) { + NIR_PASS(progress, gs, nir_shader_intrinsics_pass, strip_side_effects, + nir_metadata_block_index | nir_metadata_dominance, NULL); + } } while (progress); /* All those variables we created should've gone away by now */ @@ -1162,12 +1309,8 @@ agx_nir_lower_gs(nir_shader *gs, const nir_shader *libagx, nir_metadata_block_index | nir_metadata_dominance, NULL); /* Create auxiliary programs */ - *gs_copy = agx_nir_create_gs_copy_shader( - &gs_state, outputs_rasterized(gs), gs->info.clip_distance_array_size, - gs->info.cull_distance_array_size, gs->info.gs.output_primitive); - *pre_gs = agx_nir_create_pre_gs( - &gs_state, libagx, gs->info.gs.output_primitive != MESA_PRIM_POINTS, + &gs_state, libagx, true, gs->info.gs.output_primitive != MESA_PRIM_POINTS, gs->xfb_info, verts_in_output_prim(gs), gs->info.gs.active_stream_mask, gs->info.gs.invocations); diff --git a/src/asahi/lib/shaders/geometry.cl b/src/asahi/lib/shaders/geometry.cl index 0cb638b253d..384f05ff079 100644 --- a/src/asahi/lib/shaders/geometry.cl +++ b/src/asahi/lib/shaders/geometry.cl @@ -12,6 +12,16 @@ align(uint x, uint y) return (x + y - 1) & ~(y - 1); } +/* Compatible with util/u_math.h */ +static inline uint +util_logbase2_ceil(uint n) +{ + if (n <= 1) + return 0; + else + return 32 - clz(n - 1); +} + /* Swap the two non-provoking vertices third vert in odd triangles. 
This * generates a vertex ID list with a consistent winding order. * @@ -279,41 +289,46 @@ libagx_setup_xfb_buffer(global struct agx_geometry_params *p, uint i) * b + 2, ..., b + n - 1, -1), where b (base) is the first vertex in the prim, n * (count) is the number of verts in the prims, and -1 is the prim restart index * used to signal the end of the prim. + * + * For points, we write index buffers without restart, just as a sideband to + * pass data into the vertex shader. */ void libagx_end_primitive(global int *index_buffer, uint total_verts, uint verts_in_prim, uint total_prims, - uint invocation_vertex_base, uint invocation_prim_base) + uint invocation_vertex_base, uint invocation_prim_base, + uint geometry_base, bool restart) { /* Previous verts/prims are from previous invocations plus earlier * prims in this invocation. For the intra-invocation counts, we * subtract the count for this prim from the inclusive sum NIR gives us. */ - uint previous_verts = invocation_vertex_base + (total_verts - verts_in_prim); - uint previous_prims = invocation_prim_base + (total_prims - 1); + uint previous_verts_in_invoc = (total_verts - verts_in_prim); + uint previous_verts = invocation_vertex_base + previous_verts_in_invoc; + uint previous_prims = restart ? invocation_prim_base + (total_prims - 1) : 0; + + /* The indices are encoded as: (unrolled ID * output vertices) + vertex. 
*/ + uint index_base = geometry_base + previous_verts_in_invoc; /* Index buffer contains 1 index for each vertex and 1 for each prim */ global int *out = &index_buffer[previous_verts + previous_prims]; /* Write out indices for the strip */ for (uint i = 0; i < verts_in_prim; ++i) { - out[i] = previous_verts + i; + out[i] = index_base + i; } - out[verts_in_prim] = -1; + if (restart) + out[verts_in_prim] = -1; } void libagx_build_gs_draw(global struct agx_geometry_params *p, bool indexed, - uint vertices, uint primitives, uint output_stride_B) + uint vertices, uint primitives) { global uint *descriptor = p->indirect_desc; global struct agx_geometry_state *state = p->state; - /* Allocate the output buffer (per vertex) */ - p->output_buffer = (global uint *)(state->heap + state->heap_bottom); - state->heap_bottom += align(vertices * output_stride_B, 4); - /* Setup the indirect draw descriptor */ if (indexed) { uint indices = vertices + primitives; /* includes restart indices */ @@ -367,6 +382,8 @@ libagx_gs_setup_indirect(global struct agx_geometry_params *p, p->gs_grid[1] = instance_count; p->gs_grid[2] = 1; + p->primitives_log2 = util_logbase2_ceil(prim_per_instance); + /* If indexing is enabled, the third word is the offset into the index buffer * in elements. Apply that offset now that we have it. For a hardware * indirect draw, the hardware would do this for us, but for software input diff --git a/src/asahi/lib/shaders/geometry.h b/src/asahi/lib/shaders/geometry.h index 29766c60b93..ff2a9629a76 100644 --- a/src/asahi/lib/shaders/geometry.h +++ b/src/asahi/lib/shaders/geometry.h @@ -113,9 +113,6 @@ struct agx_geometry_params { /* Pointers to transform feedback buffer offsets in bytes */ GLOBAL(uint) xfb_offs_ptrs[MAX_SO_BUFFERS]; - /* Output (vertex) buffer, allocated by pre-GS. */ - GLOBAL(uint) output_buffer; - /* Output index buffer, allocated by pre-GS. 
*/ GLOBAL(uint) output_index_buffer; @@ -149,18 +146,24 @@ struct agx_geometry_params { /* Number of input vertices, part of the stride for the vertex buffer */ uint32_t input_vertices; - /* Number of input primitives, calculated by the CPU for a direct draw or the - * GS indirect setup kernel for an indirect draw. + /* Number of input primitives across all instances, calculated by the CPU for + * a direct draw or the GS indirect setup kernel for an indirect draw. */ uint32_t input_primitives; + /* Number of input primitives per instance, rounded up to a power-of-two and + * with the base-2 log taken. This is used to partition the output vertex IDs + * efficiently. + */ + uint32_t primitives_log2; + /* Number of bytes output by the GS count shader per input primitive (may be * 0), written by CPU and consumed by indirect draw setup shader for * allocating counts. */ uint32_t count_buffer_stride; } PACKED; -AGX_STATIC_ASSERT(sizeof(struct agx_geometry_params) == 83 * 4); +AGX_STATIC_ASSERT(sizeof(struct agx_geometry_params) == 82 * 4); struct agx_tess_params { /* Persistent (cross-draw) geometry state */ diff --git a/src/gallium/drivers/asahi/agx_state.c b/src/gallium/drivers/asahi/agx_state.c index e8a195bdab9..0df59521d52 100644 --- a/src/gallium/drivers/asahi/agx_state.c +++ b/src/gallium/drivers/asahi/agx_state.c @@ -4178,6 +4178,8 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer, params.gs_grid[0] = u_decomposed_prims_for_vertices(info->mode, draw->count); + params.primitives_log2 = util_logbase2_ceil(params.gs_grid[0]); + params.input_primitives = params.gs_grid[0] * info->instance_count; params.input_vertices = draw->count; @@ -5066,7 +5068,7 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, /* Setup to rasterize the GS results */ info_gs = (struct pipe_draw_info){ .mode = ctx->gs->gs_output_mode, - .index_size = ctx->gs->gs_output_mode != MESA_PRIM_POINTS ? 
4 : 0, + .index_size = 4, .primitive_restart = ctx->gs->gs_output_mode != MESA_PRIM_POINTS, .restart_index = ~0, .index.resource = ctx->heap,