diff --git a/src/asahi/lib/agx_nir_lower_gs.c b/src/asahi/lib/agx_nir_lower_gs.c index 0202cea00a9..a3ee09f595b 100644 --- a/src/asahi/lib/agx_nir_lower_gs.c +++ b/src/asahi/lib/agx_nir_lower_gs.c @@ -1,6 +1,7 @@ /* * Copyright 2023 Alyssa Rosenzweig * Copyright 2023 Valve Corporation + * Copyright 2015 Intel Corporation * SPDX-License-Identifier: MIT */ @@ -22,11 +23,143 @@ #include "nir_xfb_info.h" #include "shader_enums.h" -#define MAX_PRIM_OUT_SIZE 3 +struct state { + nir_variable *vertices[NIR_MAX_XFB_STREAMS]; + nir_variable *first_vertex[NIR_MAX_XFB_STREAMS]; + nir_variable *xfb_count[NIR_MAX_XFB_STREAMS]; + nir_variable *indices; +}; + +static void +emit_primitive(nir_builder *b, struct state *state, unsigned stream) +{ + unsigned min_verts = nir_verts_in_output_prim(b->shader); + bool restart = min_verts > 1; + + nir_def *indices = nir_load_var(b, state->indices); + nir_def *first_vertex = nir_load_var(b, state->first_vertex[stream]); + nir_def *total_vertices = nir_load_var(b, state->vertices[stream]); + nir_def *xfb_count = nir_load_var(b, state->xfb_count[stream]); + nir_def *length = nir_isub(b, total_vertices, first_vertex); + + nir_emit_primitive_poly(b, indices, first_vertex, length, xfb_count, stream); + + /* Allocate index buffer space */ + nir_def *degenerate = nir_ult_imm(b, length, min_verts); + nir_def *added_indices = nir_iadd_imm(b, length, restart); + added_indices = nir_bcsel(b, degenerate, nir_imm_int(b, 0), added_indices); + nir_store_var(b, state->indices, nir_iadd(b, indices, added_indices), 0x1); + + /* We form a new primitive for every vertex emitted after the first + * complete primitive (since we're outputting strips). + */ + nir_def *xfb_prims = nir_iadd_imm(b, length, -(min_verts - 1)); + xfb_prims = nir_bcsel(b, degenerate, nir_imm_int(b, 0), xfb_prims); + nir_store_var(b, state->xfb_count[stream], nir_iadd(b, xfb_count, xfb_prims), + 0x1); + + nir_store_var(b, state->first_vertex[stream], total_vertices, 0x1); +} + +static bool +rewrite_intrinsics(nir_builder *b, nir_intrinsic_instr *intr, void *state_) +{ + b->cursor = nir_before_instr(&intr->instr); + struct state *state = state_; + + if (intr->intrinsic == nir_intrinsic_emit_vertex) { + unsigned stream = nir_intrinsic_stream_id(intr); + + nir_def *count = nir_load_var(b, state->vertices[stream]); + nir_select_vertex_poly(b, count, stream); + nir_store_var(b, state->vertices[stream], nir_iadd_imm(b, count, 1), 0x1); + } else if (intr->intrinsic == nir_intrinsic_end_primitive) { + /* Emit is deferred for points */ + if (b->shader->info.gs.output_primitive != MESA_PRIM_POINTS) + emit_primitive(b, state, nir_intrinsic_stream_id(intr)); + } else { + return false; + } + + nir_instr_remove(&intr->instr); + return true; +} + +static bool +agx_nir_lower_gs_intrinsics(nir_shader *shader) +{ + struct state state; + nir_function_impl *impl = nir_shader_get_entrypoint(shader); + nir_builder b = nir_builder_at(nir_before_impl(impl)); + nir_def *zero = nir_imm_int(&b, 0); + const glsl_type *T = glsl_uint_type(); + + for (unsigned i = 0; i < NIR_MAX_XFB_STREAMS; ++i) { + state.vertices[i] = nir_local_variable_create(impl, T, NULL); + state.first_vertex[i] = nir_local_variable_create(impl, T, NULL); + state.xfb_count[i] = nir_local_variable_create(impl, T, NULL); + + nir_store_var(&b, state.vertices[i], zero, 0x1); + nir_store_var(&b, state.first_vertex[i], zero, 0x1); + nir_store_var(&b, state.xfb_count[i], zero, 0x1); + } + + state.indices = nir_local_variable_create(impl, T, NULL); + nir_store_var(&b, state.indices, zero, 0x1); + + /* Make sure all the primitives are ended at the end of the shader. */ + b.cursor = nir_after_impl(impl); + + u_foreach_bit(stream, shader->info.gs.active_stream_mask) { + nir_end_primitive(&b, stream); + } + + nir_shader_intrinsics_pass(shader, rewrite_intrinsics, + nir_metadata_control_flow, &state); + + b.cursor = nir_after_impl(impl); + + if (shader->info.gs.output_primitive == MESA_PRIM_POINTS) { + u_foreach_bit(stream, shader->info.gs.active_stream_mask) { + emit_primitive(&b, &state, stream); + } + } + + /* If we have side effects, make sure we run the geometry shader at least + * once by outputting a dummy primitive if we wouldn't output anything. + */ + if (shader->info.writes_memory) { + unsigned n = nir_verts_in_output_prim(shader); + shader->info.gs.vertices_out = MAX2(shader->info.gs.vertices_out, n); + + nir_push_if(&b, nir_ieq_imm(&b, nir_load_var(&b, state.indices), 0)); + { + nir_def *zero = nir_imm_int(&b, 0); + nir_def *n_ = nir_imm_int(&b, n); + bool restart = n > 1; + + shader->info.outputs_written |= VARYING_BIT_POS; + nir_store_output(&b, nir_imm_float(&b, NAN), zero, + .io_semantics.location = VARYING_SLOT_POS); + nir_select_vertex_poly(&b, zero); + nir_emit_primitive_poly(&b, zero, zero, n_, zero); + nir_store_var(&b, state.indices, nir_iadd_imm(&b, n_, restart), 1); + } + nir_pop_if(&b, NULL); + } + + /* Report the counts */ + for (unsigned stream = 0; stream < NIR_MAX_XFB_STREAMS; ++stream) { + nir_set_vertex_and_primitive_count( + &b, nir_imm_int(&b, 0), nir_load_var(&b, state.indices), + nir_load_var(&b, state.xfb_count[stream]), stream); + } + + return nir_progress(true, impl, nir_metadata_none); +} struct lower_gs_state { int static_count[MAX_VERTEX_STREAMS]; - nir_variable *outputs[NUM_TOTAL_VARYING_SLOTS][MAX_PRIM_OUT_SIZE]; /* The index of each counter in the count buffer, or -1 if it's not in the * count buffer. @@ -35,8 +168,6 @@ struct lower_gs_state { */ int count_index[MAX_VERTEX_STREAMS]; - bool rasterizer_discard; - struct agx_gs_info *info; }; @@ -93,20 +224,6 @@ lower_store_to_var(nir_builder *b, nir_intrinsic_instr *intr, nir_store_var(b, var, value, BITFIELD_BIT(component)); } -static bool -lower_output_to_var(nir_builder *b, nir_instr *instr, void *data) -{ - if (instr->type != nir_instr_type_intrinsic) - return false; - - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - if (intr->intrinsic != nir_intrinsic_store_output) - return false; - - lower_store_to_var(b, intr, data); - return true; -} - /* * Geometry shader invocations are compute-like: * @@ -278,9 +395,9 @@ static bool lower_gs_count_instr(nir_builder *b, nir_intrinsic_instr *intr, void *data) { switch (intr->intrinsic) { - case nir_intrinsic_emit_vertex_with_counter: - case nir_intrinsic_end_primitive_with_counter: case nir_intrinsic_store_output: + case nir_intrinsic_select_vertex_poly: + case nir_intrinsic_emit_primitive_poly: /* These are for the main shader, just remove them */ nir_instr_remove(&intr->instr); return true; @@ -349,9 +466,12 @@ agx_nir_create_geometry_count_shader(nir_shader *gs, struct lower_gs_rast_state { nir_def *raw_instance_id; - nir_def *instance_id, *primitive_id, *output_id; + nir_def *instance_id, *primitive_id, *output_id, *stream; struct lower_output_to_var_state outputs; struct lower_output_to_var_state selected; + bool points; + + nir_variable *output_strip_length, *output_strip_base, *id_in_strip; }; static void @@ -359,19 +479,15 @@ select_rast_output(nir_builder *b, nir_intrinsic_instr *intr, struct lower_gs_rast_state *state) { b->cursor = nir_instr_remove(&intr->instr); - - /* We only care about the rasterization stream in the rasterization - * shader, so just ignore emits from other streams. - */ - if (nir_intrinsic_stream_id(intr) != 0) - return; + nir_def *us = nir_ieq(b, intr->src[0].ssa, state->output_id); + us = nir_iand(b, us, + nir_ieq_imm(b, state->stream, nir_intrinsic_stream_id(intr))); u_foreach_bit64(slot, b->shader->info.outputs_written) { nir_def *orig = nir_load_var(b, state->selected.outputs[slot]); nir_def *data = nir_load_var(b, state->outputs.outputs[slot]); - nir_def *value = nir_bcsel( - b, nir_ieq(b, intr->src[0].ssa, state->output_id), data, orig); + nir_def *value = nir_bcsel(b, us, data, orig); nir_store_var(b, state->selected.outputs[slot], value, nir_component_mask(value->num_components)); @@ -388,7 +504,7 @@ lower_to_gs_rast(nir_builder *b, nir_intrinsic_instr *intr, void *data) lower_store_to_var(b, intr, &state->outputs); return true; - case nir_intrinsic_emit_vertex_with_counter: + case nir_intrinsic_select_vertex_poly: select_rast_output(b, intr, state); return true; @@ -411,7 +527,37 @@ lower_to_gs_rast(nir_builder *b, nir_intrinsic_instr *intr, void *data) return lower_id(b, intr, NULL); } - case nir_intrinsic_end_primitive_with_counter: + case nir_intrinsic_emit_primitive_poly: { + b->cursor = nir_before_instr(&intr->instr); + nir_def *id = state->output_id; + + nir_def *first_id = intr->src[1].ssa; + nir_def *length = intr->src[2].ssa; + nir_def *base = intr->src[3].ssa; + nir_def *id_in_strip = nir_isub(b, id, first_id); + + nir_def *us = nir_ult(b, id, nir_iadd(b, first_id, length)); + us = nir_iand(b, us, nir_uge(b, id, first_id)); + us = nir_iand( + b, us, nir_ieq_imm(b, state->stream, nir_intrinsic_stream_id(intr))); + + nir_def *orig = nir_load_var(b, state->output_strip_length); + nir_def *value = nir_bcsel(b, us, length, orig); + nir_store_var(b, state->output_strip_length, value, + nir_component_mask(1)); + + orig = nir_load_var(b, state->output_strip_base); + value = nir_bcsel(b, us, base, orig); + nir_store_var(b, state->output_strip_base, value, nir_component_mask(1)); + + orig = nir_load_var(b, state->id_in_strip); + value = nir_bcsel(b, us, id_in_strip, orig); + nir_store_var(b, state->id_in_strip, value, nir_component_mask(1)); + + nir_instr_remove(&intr->instr); + return true; + } + case nir_intrinsic_set_vertex_and_primitive_count: nir_instr_remove(&intr->instr); return true; @@ -421,101 +567,6 @@ lower_to_gs_rast(nir_builder *b, nir_intrinsic_instr *intr, void *data) } } -/* - * Side effects in geometry shaders are problematic with our "GS rasterization - * shader" implementation. Where does the side effect happen? In the prepass? - * In the rast shader? In both? - * - * A perfect solution is impossible with rast shaders. Since the spec is loose - * here, we follow the principle of "least surprise": - * - * 1. Prefer side effects in the prepass over the rast shader. The prepass runs - * once per API GS invocation so will match the expectations of buggy apps - * not written for tilers. - * - * 2. If we must execute any side effect in the rast shader, try to execute all - * side effects only in the rast shader. If some side effects must happen in - * the rast shader and others don't, this gets consistent counts - * (i.e. if the app expects plain stores and atomics to match up). - * - * 3. If we must execute side effects in both rast and the prepass, - * execute all side effects in the rast shader and strip what we can from - * the prepass. This gets the "unsurprising" behaviour from #2 without - * falling over for ridiculous uses of atomics. - */ -static bool -strip_side_effect_from_rast(nir_builder *b, nir_intrinsic_instr *intr, - void *data) -{ - switch (intr->intrinsic) { - case nir_intrinsic_store_global: - case nir_intrinsic_global_atomic: - case nir_intrinsic_global_atomic_swap: - break; - default: - return false; - } - - /* If there's a side effect that's actually required, keep it. */ - if (nir_intrinsic_infos[intr->intrinsic].has_dest && - !list_is_empty(&intr->def.uses)) { - - bool *any = data; - *any = true; - return false; - } - - /* Otherwise, remove the dead instruction. */ - nir_instr_remove(&intr->instr); - return true; -} - -static bool -strip_side_effects_from_rast(nir_shader *s, bool *side_effects_for_rast) -{ - bool progress, any; - - /* Rather than complex analysis, clone and try to remove as many side effects - * as possible. Then we check if we removed them all. We need to loop to - * handle complex control flow with side effects, where we can strip - * everything but can't figure that out with a simple one-shot analysis. - */ - nir_shader *clone = nir_shader_clone(NULL, s); - - /* Drop as much as we can */ - do { - progress = false; - any = false; - NIR_PASS(progress, clone, nir_shader_intrinsics_pass, - strip_side_effect_from_rast, nir_metadata_control_flow, &any); - - NIR_PASS(progress, clone, nir_opt_dce); - NIR_PASS(progress, clone, nir_opt_dead_cf); - } while (progress); - - ralloc_free(clone); - - /* If we need atomics, leave them in */ - if (any) { - *side_effects_for_rast = true; - return false; - } - - /* Else strip it all */ - do { - progress = false; - any = false; - NIR_PASS(progress, s, nir_shader_intrinsics_pass, - strip_side_effect_from_rast, nir_metadata_control_flow, &any); - - NIR_PASS(progress, s, nir_opt_dce); - NIR_PASS(progress, s, nir_opt_dead_cf); - } while (progress); - - assert(!any); - return progress; -} - static bool strip_side_effect_from_main(nir_builder *b, nir_intrinsic_instr *intr, void *data) @@ -523,17 +574,42 @@ strip_side_effect_from_main(nir_builder *b, nir_intrinsic_instr *intr, switch (intr->intrinsic) { case nir_intrinsic_global_atomic: case nir_intrinsic_global_atomic_swap: - break; + case nir_intrinsic_image_atomic: + case nir_intrinsic_image_atomic_swap: + case nir_intrinsic_bindless_image_atomic: + case nir_intrinsic_bindless_image_atomic_swap: + if (list_is_empty(&intr->def.uses)) { + nir_instr_remove(&intr->instr); + return true; + } + + return false; + + case nir_intrinsic_store_global: + case nir_intrinsic_image_store: + case nir_intrinsic_bindless_image_store: + case nir_intrinsic_fence_pbe_to_tex_agx: + if (data) { + nir_instr_remove(&intr->instr); + return true; + } + + return false; + default: return false; } +} - if (list_is_empty(&intr->def.uses)) { - nir_instr_remove(&intr->instr); - return true; - } - - return false; +/* + * The stream # is encoded into the lower bits of an index. The stream + * multiplier is the factor to multiply vertex IDs before adding the stream #. + */ +static unsigned +stream_multiplier(const nir_shader *gs) +{ + unsigned nr_streams = util_last_bit(gs->info.gs.active_stream_mask); + return util_next_power_of_two(nr_streams); } /* @@ -541,7 +617,7 @@ strip_side_effect_from_main(nir_builder *b, nir_intrinsic_instr *intr, * shades each rasterized output vertex in parallel. */ static nir_shader * -agx_nir_create_gs_rast_shader(const nir_shader *gs, bool *side_effects_for_rast, +agx_nir_create_gs_rast_shader(const nir_shader *gs, const struct lower_gs_state *state) { /* Don't muck up the original shader */ @@ -561,18 +637,27 @@ agx_nir_create_gs_rast_shader(const nir_shader *gs, bool *side_effects_for_rast, shader->info.name = "gs rast"; } - nir_builder b_ = - nir_builder_at(nir_before_impl(nir_shader_get_entrypoint(shader))); - nir_builder *b = &b_; - - NIR_PASS(_, shader, strip_side_effects_from_rast, side_effects_for_rast); - /* Optimize out pointless gl_PointSize outputs. Bizarrely, these occur. */ if (shader->info.gs.output_primitive != MESA_PRIM_POINTS) shader->info.outputs_written &= ~VARYING_BIT_PSIZ; + nir_builder b_ = + nir_builder_at(nir_before_impl(nir_shader_get_entrypoint(shader))); + nir_builder *b = &b_; + + const glsl_type *T = glsl_uint_type(); nir_def *raw_vertex_id = nir_load_vertex_id(b); - struct lower_gs_rast_state rs = {.raw_instance_id = nir_load_instance_id(b)}; + + struct lower_gs_rast_state rs = { + .raw_instance_id = nir_load_instance_id(b), + .points = gs->info.gs.output_primitive == MESA_PRIM_POINTS, + .stream = nir_umod_imm(b, raw_vertex_id, stream_multiplier(gs)), + .output_strip_length = nir_local_variable_create(b->impl, T, NULL), + .output_strip_base = nir_local_variable_create(b->impl, T, NULL), + .id_in_strip = nir_local_variable_create(b->impl, T, NULL), + }; + + raw_vertex_id = nir_udiv_imm(b, raw_vertex_id, stream_multiplier(gs)); switch (state->info->shape) { case AGX_GS_SHAPE_DYNAMIC_INDEXED: { @@ -633,6 +718,83 @@ agx_nir_create_gs_rast_shader(const nir_shader *gs, bool *side_effects_for_rast, nir_metadata_control_flow, &rs); b->cursor = nir_after_impl(b->impl); + if (gs->xfb_info) { + unsigned n_ = mesa_vertices_per_prim(gs->info.gs.output_primitive); + nir_def *zero = nir_imm_int(b, 0); + nir_def *strip_length = + rs.points ? zero : nir_load_var(b, rs.output_strip_length); + nir_def *id_in_strip = rs.points ? zero : nir_load_var(b, rs.id_in_strip); + nir_def *base = + rs.points ? rs.output_id : nir_load_var(b, rs.output_strip_base); + + struct nir_xfb_info *xfb = gs->xfb_info; + + nir_def *unrolled = nir_iadd( + b, nir_imul(b, rs.instance_id, load_geometry_param(b, gs_grid[0])), + rs.primitive_id); + + nir_def *n = nir_imm_int(b, n_); + + for (unsigned p_ = 0; p_ < n_; ++p_) { + nir_def *p = nir_imm_int(b, p_); + nir_push_if(b, libagx_xfb_vertex_copy_in_strip(b, n, id_in_strip, + strip_length, p)); + + /* Write XFB for each output */ + for (unsigned i = 0; i < xfb->output_count; ++i) { + nir_xfb_output_info output = xfb->outputs[i]; + unsigned stream = xfb->buffer_to_stream[output.buffer]; + nir_push_if(b, nir_ieq_imm(b, rs.stream, stream)); + + /* Get the index of this primitive in the XFB buffer. That is, the + * base for this invocation for the stream plus the offset within + * this invocation. + */ + nir_def *invocation_base = libagx_previous_xfb_primitives( + b, nir_load_geometry_param_buffer_agx(b), + nir_imm_int(b, state->static_count[stream]), + nir_imm_int(b, state->count_index[stream]), + nir_imm_int(b, state->info->count_words), + nir_imm_bool(b, state->info->prefix_sum), unrolled); + + nir_def *index = libagx_xfb_vertex_offset( + b, n, invocation_base, base, id_in_strip, p, + nir_inot(b, nir_i2b(b, nir_load_provoking_last(b)))); + + nir_def *xfb_verts = load_geometry_param(b, xfb_verts[stream]); + nir_push_if(b, nir_ult(b, index, xfb_verts)); + { + unsigned buffer = output.buffer; + unsigned stride = xfb->buffers[buffer].stride; + unsigned count = util_bitcount(output.component_mask); + + nir_variable *var = rs.selected.outputs[output.location]; + nir_def *value = + var ? nir_load_var(b, var) : nir_undef(b, 4, 32); + + /* In case output.component_mask contains invalid components, + * write out zeroes instead of blowing up validation. + * + * KHR-Single-GL44.enhanced_layouts.xfb_capture_inactive_output_component + * hits this. + */ + value = nir_pad_vector_imm_int(b, value, 0, 4); + + nir_def *addr = libagx_xfb_vertex_address( + b, nir_load_geometry_param_buffer_agx(b), index, + nir_imm_int(b, buffer), nir_imm_int(b, stride), + nir_imm_int(b, output.offset)); + + nir_store_global(b, addr, 4, + nir_channels(b, value, output.component_mask), + nir_component_mask(count)); + } + nir_pop_if(b, NULL); + nir_pop_if(b, NULL); + } + nir_pop_if(b, NULL); + } + } /* Forward each selected output to the rasterizer */ u_foreach_bit64(slot, shader->info.outputs_written) { @@ -647,15 +809,29 @@ agx_nir_create_gs_rast_shader(const nir_shader *gs, bool *side_effects_for_rast, if (slot == VARYING_SLOT_CLIP_DIST1) offset = 1; + /* We must only rasterize vertices from the rasterization stream. Since we + * shade vertices across all streams, we do this by throwing away vertices + * from non-rasterization streams (by setting a component to NaN). + */ + if (slot == VARYING_SLOT_POS && state->info->multistream) { + nir_def *rast_stream = nir_load_rasterization_stream(b); + nir_def *nan = nir_imm_float(b, NAN); + nir_def *killed = nir_vector_insert_imm(b, value, nan, 3); + + value = + nir_bcsel(b, nir_ieq(b, rs.stream, rast_stream), value, killed); + } + nir_store_output(b, value, nir_imm_int(b, offset), - .io_semantics.location = slot - offset, - .io_semantics.num_slots = 1, - .write_mask = nir_component_mask(value->num_components), - .src_type = nir_type_uint32); + .io_semantics.location = slot - offset); } - /* The geometry shader might not write point size - ensure it does. */ - if (gs->info.gs.output_primitive == MESA_PRIM_POINTS) { + /* The geometry shader might not write point size - ensure it does, if we're + * rasterizing at all. + */ + if (gs->info.gs.output_primitive == MESA_PRIM_POINTS && + (shader->info.outputs_written & VARYING_BIT_POS)) { + nir_lower_default_point_size(shader); } @@ -663,206 +839,46 @@ agx_nir_create_gs_rast_shader(const nir_shader *gs, bool *side_effects_for_rast, return shader; } -static void -lower_end_primitive(nir_builder *b, nir_intrinsic_instr *intr, - struct lower_gs_state *state) -{ - assert((intr->intrinsic == nir_intrinsic_set_vertex_and_primitive_count || - b->shader->info.gs.output_primitive != MESA_PRIM_POINTS) && - "endprimitive for points should've been removed"); - - /* The GS is the last stage before rasterization, so if we discard the - * rasterization, we don't output an index buffer, nothing will read it. - * Index buffer is only for the rasterization stream. - */ - unsigned stream = nir_intrinsic_stream_id(intr); - if (state->rasterizer_discard || stream != 0) - return; - - libagx_end_primitive( - b, load_geometry_param(b, output_index_buffer), intr->src[0].ssa, - intr->src[1].ssa, intr->src[2].ssa, - nir_imul_imm(b, calc_unrolled_id(b), state->info->max_indices), - calc_unrolled_index_id(b), - nir_imm_bool(b, b->shader->info.gs.output_primitive != MESA_PRIM_POINTS)); -} - -static void -write_xfb(nir_builder *b, struct lower_gs_state *state, unsigned stream, - nir_def *index_in_strip, nir_def *prim_id_in_invocation) -{ - struct nir_xfb_info *xfb = b->shader->xfb_info; - unsigned verts = nir_verts_in_output_prim(b->shader); - - /* Get the index of this primitive in the XFB buffer. That is, the base for - * this invocation for the stream plus the offset within this invocation. - */ - nir_def *invocation_base = libagx_previous_xfb_primitives( - b, nir_load_geometry_param_buffer_agx(b), - nir_imm_int(b, state->static_count[stream]), - nir_imm_int(b, state->count_index[stream]), - nir_imm_int(b, state->info->count_words), - nir_imm_bool(b, state->info->prefix_sum), calc_unrolled_id(b)); - - nir_def *prim_index = nir_iadd(b, invocation_base, prim_id_in_invocation); - nir_def *base_index = nir_imul_imm(b, prim_index, verts); - - nir_def *xfb_prims = load_geometry_param(b, xfb_prims[stream]); - nir_push_if(b, nir_ult(b, prim_index, xfb_prims)); - - /* Write XFB for each output */ - for (unsigned i = 0; i < xfb->output_count; ++i) { - nir_xfb_output_info output = xfb->outputs[i]; - - /* Only write to the selected stream */ - if (xfb->buffer_to_stream[output.buffer] != stream) - continue; - - unsigned buffer = output.buffer; - unsigned stride = xfb->buffers[buffer].stride; - unsigned count = util_bitcount(output.component_mask); - - for (unsigned vert = 0; vert < verts; ++vert) { - /* We write out the vertices backwards, since 0 is the current - * emitted vertex (which is actually the last vertex). - * - * We handle NULL var for - * KHR-Single-GL44.enhanced_layouts.xfb_capture_struct. - */ - unsigned v = (verts - 1) - vert; - nir_variable *var = state->outputs[output.location][v]; - nir_def *value = var ? nir_load_var(b, var) : nir_undef(b, 4, 32); - - /* In case output.component_mask contains invalid components, write - * out zeroes instead of blowing up validation. - * - * KHR-Single-GL44.enhanced_layouts.xfb_capture_inactive_output_component - * hits this. - */ - value = nir_pad_vector_imm_int(b, value, 0, 4); - - nir_def *rotated_vert = nir_imm_int(b, vert); - if (verts == 3) { - /* Map vertices for output so we get consistent winding order. For - * the primitive index, we use the index_in_strip. This is actually - * the vertex index in the strip, hence - * offset by 2 relative to the true primitive index (#2 for the - * first triangle in the strip, #3 for the second). That's ok - * because only the parity matters. - */ - rotated_vert = libagx_map_vertex_in_tri_strip( - b, index_in_strip, rotated_vert, - nir_inot(b, nir_i2b(b, nir_load_provoking_last(b)))); - } - - nir_def *addr = libagx_xfb_vertex_address( - b, nir_load_geometry_param_buffer_agx(b), base_index, rotated_vert, - nir_imm_int(b, buffer), nir_imm_int(b, stride), - nir_imm_int(b, output.offset)); - - nir_store_global(b, addr, 4, - nir_channels(b, value, output.component_mask), - nir_component_mask(count)); - } - } - - nir_pop_if(b, NULL); -} - -/* Handle transform feedback for a given emit_vertex_with_counter */ -static void -lower_emit_vertex_xfb(nir_builder *b, nir_intrinsic_instr *intr, - struct lower_gs_state *state) -{ - /* Transform feedback is written for each decomposed output primitive. Since - * we're writing strips, that means we output XFB for each vertex after the - * first complete primitive is formed. - */ - unsigned first_prim = nir_verts_in_output_prim(b->shader) - 1; - nir_def *index_in_strip = intr->src[1].ssa; - - nir_push_if(b, nir_uge_imm(b, index_in_strip, first_prim)); - { - write_xfb(b, state, nir_intrinsic_stream_id(intr), index_in_strip, - intr->src[3].ssa); - } - nir_pop_if(b, NULL); - - /* Transform feedback writes out entire primitives during the emit_vertex. To - * do that, we store the values at all vertices in the strip in a little ring - * buffer. Index #0 is always the most recent primitive (so non-XFB code can - * just grab index #0 without any checking). Index #1 is the previous vertex, - * and index #2 is the vertex before that. Now that we've written XFB, since - * we've emitted a vertex we need to cycle the ringbuffer, freeing up index - * #0 for the next vertex that we are about to emit. We do that by copying - * the first n - 1 vertices forward one slot, which has to happen with a - * backwards copy implemented here. - * - * If we're lucky, all of these copies will be propagated away. If we're - * unlucky, this involves at most 2 copies per component per XFB output per - * vertex. - */ - u_foreach_bit64(slot, b->shader->info.outputs_written) { - /* Note: if we're outputting points, nir_verts_in_output_prim will be 1, - * so this loop will not execute. This is intended: points are - * self-contained primitives and do not need these copies. - */ - for (int v = nir_verts_in_output_prim(b->shader) - 1; v >= 1; --v) { - nir_def *value = nir_load_var(b, state->outputs[slot][v - 1]); - - nir_store_var(b, state->outputs[slot][v], value, - nir_component_mask(value->num_components)); - } - } -} - static bool -lower_gs_instr(nir_builder *b, nir_intrinsic_instr *intr, void *state) +lower_gs_instr(nir_builder *b, nir_intrinsic_instr *intr, void *state_) { b->cursor = nir_before_instr(&intr->instr); - struct lower_gs_state *state_ = state; + struct lower_gs_state *state = state_; switch (intr->intrinsic) { case nir_intrinsic_set_vertex_and_primitive_count: { - if (state_->info->shape != AGX_GS_SHAPE_DYNAMIC_INDEXED) + if (state->info->shape != AGX_GS_SHAPE_DYNAMIC_INDEXED) break; - /* Points write their index buffer here, other primitives write on end. We - * also pad the index buffer here for the rasterization stream. - */ - if (b->shader->info.gs.output_primitive == MESA_PRIM_POINTS) { - lower_end_primitive(b, intr, state); - } - - if (nir_intrinsic_stream_id(intr) == 0 && !state_->rasterizer_discard) { - libagx_pad_index_gs(b, load_geometry_param(b, output_index_buffer), - intr->src[0].ssa, intr->src[1].ssa, - calc_unrolled_id(b), - nir_imm_int(b, state_->info->max_indices)); + /* All streams are merged, just pick a single instruction */ + if (nir_intrinsic_stream_id(intr) == 0) { + libagx_pad_index_gs( + b, load_geometry_param(b, output_index_buffer), + nir_imul_imm(b, calc_unrolled_id(b), state->info->max_indices), + intr->src[1].ssa, nir_imm_int(b, state->info->max_indices)); } break; } - case nir_intrinsic_end_primitive_with_counter: { - if (state_->info->shape != AGX_GS_SHAPE_DYNAMIC_INDEXED) + case nir_intrinsic_emit_primitive_poly: { + if (state->info->shape != AGX_GS_SHAPE_DYNAMIC_INDEXED) break; - unsigned min = nir_verts_in_output_prim(b->shader); - - /* We only write out complete primitives */ - nir_push_if(b, nir_uge_imm(b, intr->src[1].ssa, min)); - { - lower_end_primitive(b, intr, state); - } - nir_pop_if(b, NULL); + libagx_write_strip( + b, load_geometry_param(b, output_index_buffer), + nir_imul_imm(b, calc_unrolled_id(b), state->info->max_indices), + intr->src[0].ssa, + nir_iadd(b, calc_unrolled_index_id(b), intr->src[1].ssa), + intr->src[2].ssa, + nir_imm_ivec3(b, nir_intrinsic_stream_id(intr), + stream_multiplier(b->shader), + nir_verts_in_output_prim(b->shader))); break; } - case nir_intrinsic_emit_vertex_with_counter: - /* emit_vertex triggers transform feedback but is otherwise a no-op. */ - if (b->shader->xfb_info) - lower_emit_vertex_xfb(b, intr, state); + case nir_intrinsic_store_output: + case nir_intrinsic_select_vertex_poly: break; default: @@ -1012,24 +1028,14 @@ agx_nir_lower_gs_instancing(nir_shader *gs) } static unsigned -calculate_max_indices(enum mesa_prim prim, unsigned verts, signed static_verts, - signed static_prims) +calculate_max_indices(enum mesa_prim prim, unsigned verts) { - /* We always have a static max_vertices, but we might have a tighter bound. - * Use it if we have one - */ - if (static_verts >= 0) { - verts = MIN2(verts, static_verts); - } - /* Points do not need primitive count added. Other topologies do. If we have * a static primitive count, we use that. Otherwise, we use a worst case * estimate that primitives are emitted one-by-one. */ if (prim == MESA_PRIM_POINTS) return verts; - else if (static_prims >= 0) - return verts + static_prims; else return verts + (verts / mesa_vertices_per_prim(prim)); } @@ -1042,27 +1048,14 @@ struct topology_ctx { static bool evaluate_topology(nir_builder *b, nir_intrinsic_instr *intr, void *data) { - bool points = b->shader->info.gs.output_primitive == MESA_PRIM_POINTS; - bool end_prim = intr->intrinsic == nir_intrinsic_end_primitive_with_counter; - bool set_prim = - intr->intrinsic == nir_intrinsic_set_vertex_and_primitive_count; - struct topology_ctx *ctx = data; struct agx_gs_info *info = ctx->info; - if (!(set_prim && points) && !end_prim) + if (intr->intrinsic != nir_intrinsic_emit_primitive_poly) return false; - assert(!(end_prim && points) && "should have been deleted"); - - /* Only consider the rasterization stream. */ - if (nir_intrinsic_stream_id(intr) != 0) - return false; - - /* All end primitives must be executed exactly once. That happens if - * everything is in the start block. - * - * Strictly we could relax this (to handle if-statements interleaved with - * other stuff). + /* All emit-primitives must execute exactly once. That happens if everything + * is in the start block. Strictly we could relax this (to handle + * if-statements interleaved with other stuff). */ if (intr->instr.block != nir_start_block(b->impl)) { info->shape = AGX_GS_SHAPE_DYNAMIC_INDEXED; @@ -1077,30 +1070,27 @@ evaluate_topology(nir_builder *b, nir_intrinsic_instr *intr, void *data) return false; } - unsigned min = nir_verts_in_output_prim(b->shader); - - if (nir_src_as_uint(intr->src[1]) >= min) { - _libagx_end_primitive(ctx->topology, nir_src_as_uint(intr->src[0]), - nir_src_as_uint(intr->src[1]), - nir_src_as_uint(intr->src[2]), 0, 0, !points); - } - + _libagx_write_strip( + ctx->topology, nir_src_as_uint(intr->src[0]), + nir_src_as_uint(intr->src[1]), nir_src_as_uint(intr->src[2]), + nir_intrinsic_stream_id(intr), stream_multiplier(b->shader), + nir_verts_in_output_prim(b->shader)); return false; } /* * Pattern match the index buffer with restart against a list topology: * - * 0, 1, 2, -1, 3, 4, 5, -1, ... + * 0, 1, 2, -1, 3, 4, 5, ... */ static bool match_list_topology(struct agx_gs_info *info, uint32_t count, - uint32_t *topology) + uint32_t *topology, bool has_restart) { - unsigned count_with_restart = count + 1; + unsigned count_with_restart = count + has_restart; - /* Must be an integer number of primitives */ - if (info->max_indices % count_with_restart) + /* Must be an integer number of primitives. Last restart is dropped. */ + if ((info->max_indices + has_restart) % count_with_restart) return false; /* Must match the list topology */ @@ -1115,7 +1105,8 @@ match_list_topology(struct agx_gs_info *info, uint32_t count, /* If we match, rewrite the topology and drop indexing */ info->shape = AGX_GS_SHAPE_STATIC_PER_INSTANCE; info->mode = u_decomposed_prim(info->mode); - info->max_indices = (info->max_indices / count_with_restart) * count; + info->max_indices = + ((info->max_indices + has_restart) / count_with_restart) * count; return true; } @@ -1151,24 +1142,20 @@ static void optimize_static_topology(struct agx_gs_info *info, nir_shader *gs) { struct topology_ctx ctx = {.info = info}; + bool has_restart = info->mode != MESA_PRIM_POINTS; nir_shader_intrinsics_pass(gs, evaluate_topology, nir_metadata_all, &ctx); if (info->shape == AGX_GS_SHAPE_DYNAMIC_INDEXED) return; - /* Points are always lists */ - if (gs->info.gs.output_primitive == MESA_PRIM_POINTS) { - info->shape = AGX_GS_SHAPE_STATIC_PER_INSTANCE; - return; - } + /* We can always drop the trailing restart index */ + if (has_restart && info->max_indices) + info->max_indices--; /* Try to pattern match a list topology */ unsigned count = nir_verts_in_output_prim(gs); - if (match_list_topology(info, count, ctx.topology)) + if (match_list_topology(info, count, ctx.topology, has_restart)) return; - /* Instancing means we can always drop the trailing restart index */ - info->max_indices--; - /* Try to pattern match a strip topology */ if (is_strip_topology(ctx.topology, info->max_indices)) { info->shape = AGX_GS_SHAPE_STATIC_PER_PRIM; @@ -1178,6 +1165,8 @@ optimize_static_topology(struct agx_gs_info *info, nir_shader *gs) /* Otherwise, use a small static index buffer. There's no theoretical reason * to bound this, but we want small serialized shader info structs. We assume * that large static index buffers are rare and hence fall back to dynamic. + * + * XXX: check if this holds with streams. */ if (info->max_indices >= ARRAY_SIZE(info->topology)) { info->shape = AGX_GS_SHAPE_DYNAMIC_INDEXED; @@ -1193,9 +1182,8 @@ optimize_static_topology(struct agx_gs_info *info, nir_shader *gs) } bool -agx_nir_lower_gs(nir_shader *gs, bool rasterizer_discard, nir_shader **gs_count, - nir_shader **gs_copy, nir_shader **pre_gs, - struct agx_gs_info *info) +agx_nir_lower_gs(nir_shader *gs, nir_shader **gs_count, nir_shader **gs_copy, + nir_shader **pre_gs, struct agx_gs_info *info) { /* Lower I/O as assumed by the rest of GS lowering */ if (gs->xfb_info != NULL) { @@ -1232,13 +1220,7 @@ agx_nir_lower_gs(nir_shader *gs, bool rasterizer_discard, nir_shader **gs_count, /* Lower geometry shader writes to contain all of the required counts, so we * know where in the various buffers we should write vertices. */ - NIR_PASS(_, gs, nir_lower_gs_intrinsics, - nir_lower_gs_intrinsics_count_primitives | - nir_lower_gs_intrinsics_per_stream | - nir_lower_gs_intrinsics_count_vertices_per_primitive | - nir_lower_gs_intrinsics_overwrite_incomplete | - nir_lower_gs_intrinsics_always_end_primitive | - nir_lower_gs_intrinsics_count_decomposed_primitives); + NIR_PASS(_, gs, agx_nir_lower_gs_intrinsics); /* Clean up after all that lowering we did */ bool progress = false; @@ -1265,19 +1247,17 @@ agx_nir_lower_gs(nir_shader *gs, bool rasterizer_discard, nir_shader **gs_count, /* If we know counts at compile-time we can simplify, so try to figure out * the counts statically. */ - struct lower_gs_state gs_state = { - .rasterizer_discard = rasterizer_discard, - .info = info, - }; + struct lower_gs_state gs_state = {.info = info}; *info = (struct agx_gs_info){ .mode = gs->info.gs.output_primitive, .xfb = gs->xfb_info != NULL, .shape = -1, + .multistream = gs->info.gs.active_stream_mask & ~1, }; - int static_vertices[4] = {0}, static_primitives[4] = {0}; - nir_gs_count_vertices_and_primitives(gs, static_vertices, static_primitives, + int static_indices[4] = {0}; + nir_gs_count_vertices_and_primitives(gs, NULL, static_indices, gs_state.static_count, 4); /* Anything we don't know statically will be tracked by the count buffer. @@ -1289,21 +1269,21 @@ agx_nir_lower_gs(nir_shader *gs, bool rasterizer_discard, nir_shader **gs_count, } /* Using the gathered static counts, choose the index buffer stride. */ - info->max_indices = calculate_max_indices( - gs->info.gs.output_primitive, gs->info.gs.vertices_out, - static_vertices[0], static_primitives[0]); + info->max_indices = static_indices[0]; + if (static_indices[0] < 0) { + info->max_indices = calculate_max_indices(gs->info.gs.output_primitive, + gs->info.gs.vertices_out); + } info->prefix_sum = info->count_words > 0 && gs->xfb_info != NULL; - if (static_vertices[0] >= 0 && static_primitives[0] >= 0) { + if (static_indices[0] >= 0) { optimize_static_topology(info, gs); } else { info->shape = AGX_GS_SHAPE_DYNAMIC_INDEXED; } - bool side_effects_for_rast = false; - *gs_copy = - agx_nir_create_gs_rast_shader(gs, &side_effects_for_rast, &gs_state); + *gs_copy = agx_nir_create_gs_rast_shader(gs, &gs_state); NIR_PASS(_, gs, nir_shader_intrinsics_pass, lower_id, nir_metadata_control_flow, NULL); @@ -1320,44 +1300,20 @@ agx_nir_lower_gs(nir_shader *gs, bool rasterizer_discard, nir_shader **gs_count, else *gs_count = NULL; - /* Geometry shader outputs are staged to temporaries */ - struct lower_output_to_var_state state = {0}; + /* Strip stores and atomics */ + do { + progress = false; + NIR_PASS(progress, gs, nir_shader_intrinsics_pass, + strip_side_effect_from_main, nir_metadata_control_flow, + (void *)true); - u_foreach_bit64(slot, gs->info.outputs_written) { - /* After enough optimizations, the shader metadata can go out of sync, fix - * with our gathered info. Otherwise glsl_vector_type will assert fail. - */ - if (component_counts[slot] == 0) { - gs->info.outputs_written &= ~BITFIELD64_BIT(slot); - continue; - } - - const char *slot_name = - gl_varying_slot_name_for_stage(slot, MESA_SHADER_GEOMETRY); - - for (unsigned i = 0; i < MAX_PRIM_OUT_SIZE; ++i) { - gs_state.outputs[slot][i] = nir_variable_create( - gs, nir_var_shader_temp, - glsl_vector_type(GLSL_TYPE_UINT, component_counts[slot]), - ralloc_asprintf(gs, "%s-%u", slot_name, i)); - } - - state.outputs[slot] = gs_state.outputs[slot][0]; - } - - NIR_PASS(_, gs, nir_shader_instructions_pass, lower_output_to_var, - nir_metadata_control_flow, &state); + NIR_PASS(progress, gs, nir_opt_dce); + NIR_PASS(progress, gs, nir_opt_dead_cf); + } while (progress); NIR_PASS(_, gs, nir_shader_intrinsics_pass, lower_gs_instr, nir_metadata_none, &gs_state); - /* Determine if we are guaranteed to rasterize at least one vertex, so that - * we can strip the prepass of side effects knowing they will execute in the - * rasterization shader. - */ - bool rasterizes_at_least_one_vertex = - !rasterizer_discard && static_vertices[0] > 0; - /* Clean up after all that lowering we did */ nir_lower_global_vars_to_local(gs); do { @@ -1376,17 +1332,16 @@ agx_nir_lower_gs(nir_shader *gs, bool rasterizer_discard, nir_shader **gs_count, } while (progress); - /* When rasterizing, we try to handle side effects sensibly. */ - if (rasterizes_at_least_one_vertex && side_effects_for_rast) { - do { - progress = false; - NIR_PASS(progress, gs, nir_shader_intrinsics_pass, - strip_side_effect_from_main, nir_metadata_control_flow, NULL); + /* Strip remaining atomics, but not stores - since those are from us */ + do { + progress = false; + NIR_PASS(progress, gs, nir_shader_intrinsics_pass, + strip_side_effect_from_main, nir_metadata_control_flow, + (void *)false); - NIR_PASS(progress, gs, nir_opt_dce); - NIR_PASS(progress, gs, nir_opt_dead_cf); - } while (progress); - } + NIR_PASS(progress, gs, nir_opt_dce); + NIR_PASS(progress, gs, nir_opt_dead_cf); + } while (progress); /* All those variables we created should've gone away by now */ NIR_PASS(_, gs, nir_remove_dead_variables, nir_var_function_temp, NULL); diff --git a/src/asahi/lib/agx_nir_lower_gs.h b/src/asahi/lib/agx_nir_lower_gs.h index fc3080f1e6d..e29705a9491 100644 --- a/src/asahi/lib/agx_nir_lower_gs.h +++ b/src/asahi/lib/agx_nir_lower_gs.h @@ -38,6 +38,9 @@ struct agx_gs_info { /* Whether a prefix sum is required on the count outputs. Implies xfb */ bool prefix_sum; + /* Whether the GS writes to a stream other than stream #0 */ + bool multistream; + /* Shape of the rasterization draw, named by the instance ID */ enum agx_gs_shape shape; @@ -45,9 +48,9 @@ struct agx_gs_info { uint8_t topology[64]; }; -bool agx_nir_lower_gs(struct nir_shader *gs, bool rasterizer_discard, - struct nir_shader **gs_count, struct nir_shader **gs_copy, - struct nir_shader **pre_gs, struct agx_gs_info *info); +bool agx_nir_lower_gs(struct nir_shader *gs, struct nir_shader **gs_count, + struct nir_shader **gs_copy, struct nir_shader **pre_gs, + struct agx_gs_info *info); bool agx_nir_lower_tcs(struct nir_shader *tcs); diff --git a/src/asahi/libagx/geometry.cl b/src/asahi/libagx/geometry.cl index f72352339ea..1798f9e1dee 100644 --- a/src/asahi/libagx/geometry.cl +++ b/src/asahi/libagx/geometry.cl @@ -13,11 +13,12 @@ #include "query.h" #include "tessellator.h" -/* Swap the two non-provoking vertices third vert in odd triangles. This - * generates a vertex ID list with a consistent winding order. +/* Swap the two non-provoking vertices in odd triangles. This generates a vertex + * ID list with a consistent winding order. * - * With prim and flatshade_first, the map : [0, 1, 2] -> [0, 1, 2] is its own - * inverse. This lets us reuse it for both vertex fetch and transform feedback. + * Holding prim and flatshade_first constant, the map : [0, 1, 2] -> [0, 1, 2] + * is its own inverse. It is hence used both vertex fetch and transform + * feedback. */ uint libagx_map_vertex_in_tri_strip(uint prim, uint vert, bool flatshade_first) @@ -30,12 +31,49 @@ libagx_map_vertex_in_tri_strip(uint prim, uint vert, bool flatshade_first) return (provoking || even) ? vert : ((3 - pv) - vert); } -uint64_t -libagx_xfb_vertex_address(global struct agx_geometry_params *p, uint base_index, - uint vert, uint buffer, uint stride, - uint output_offset) +static inline uint +xfb_prim(uint id, uint n, uint copy) +{ + return sub_sat(id, n - 1u) + copy; +} + +/* + * Determine whether an output vertex has an n'th copy in the transform feedback + * buffer. This is written weirdly to let constant folding remove unnecessary + * stores when length is known statically. + */ +bool +libagx_xfb_vertex_copy_in_strip(uint n, uint id, uint length, uint copy) +{ + uint prim = xfb_prim(id, n, copy); + + int num_prims = length - (n - 1); + return copy == 0 || (prim < num_prims && id >= copy && copy < num_prims); +} + +uint +libagx_xfb_vertex_offset(uint n, uint invocation_base_prim, + uint strip_base_prim, uint id_in_strip, uint copy, + bool flatshade_first) +{ + uint prim = xfb_prim(id_in_strip, n, copy); + uint vert_0 = min(id_in_strip, n - 1); + uint vert = vert_0 - copy; + + if (n == 3) { + vert = libagx_map_vertex_in_tri_strip(prim, vert, flatshade_first); + } + + /* Tally up in the whole buffer */ + uint base_prim = invocation_base_prim + strip_base_prim; + uint base_vertex = base_prim * n; + return base_vertex + (prim * n) + vert; +} + +uint64_t +libagx_xfb_vertex_address(constant struct agx_geometry_params *p, uint index, + uint buffer, uint stride, uint output_offset) { - uint index = base_index + vert; uint xfb_offset = (index * stride) + output_offset; return (uintptr_t)(p->xfb_base[buffer]) + xfb_offset; @@ -572,20 +610,20 @@ libagx_setup_xfb_buffer(global struct agx_geometry_params *p, uint i, } void -libagx_end_primitive(global uint32_t *index_buffer, uint total_verts, - uint verts_in_prim, uint total_prims, uint index_offs, - uint geometry_base, bool restart) +libagx_write_strip(GLOBAL uint32_t *index_buffer, uint32_t inv_index_offset, + uint32_t prim_index_offset, uint32_t vertex_offset, + uint32_t verts_in_prim, uint3 info) { - _libagx_end_primitive(index_buffer, total_verts, verts_in_prim, total_prims, - index_offs, geometry_base, restart); + _libagx_write_strip(index_buffer, inv_index_offset + prim_index_offset, + vertex_offset, verts_in_prim, info.x, info.y, info.z); } void -libagx_pad_index_gs(global int *index_buffer, uint total_verts, - uint total_prims, uint id, uint alloc) +libagx_pad_index_gs(global int *index_buffer, uint inv_index_offset, + uint nr_indices, uint alloc) { - for (uint i = total_verts + total_prims; i < alloc; ++i) { - index_buffer[(id * alloc) + i] = -1; + for (uint i = nr_indices; i < alloc; ++i) { + index_buffer[inv_index_offset + i] = -1; } } @@ -888,7 +926,7 @@ libagx_pre_gs(global struct agx_geometry_params *p, uint streams, int4 overflow = prims < in_prims; libagx_foreach_xfb(streams, i) { - p->xfb_prims[i] = prims[i]; + p->xfb_verts[i] = prims[i] * vertices_per_prim; *(p->xfb_overflow[i]) += (bool)overflow[i]; *(p->xfb_prims_generated_counter[i]) += prims[i]; diff --git a/src/asahi/libagx/geometry.h b/src/asahi/libagx/geometry.h index 54ef991396b..a992b2ee392 100644 --- a/src/asahi/libagx/geometry.h +++ b/src/asahi/libagx/geometry.h @@ -227,10 +227,10 @@ struct agx_geometry_params { uint32_t xfb_size[MAX_SO_BUFFERS]; - /* Number of primitives emitted by transform feedback per stream. Written by + /* Number of vertices emitted by transform feedback per stream. Written by * the pre-GS program. */ - uint32_t xfb_prims[MAX_VERTEX_STREAMS]; + uint32_t xfb_verts[MAX_VERTEX_STREAMS]; /* Within an indirect GS draw, the grids used to dispatch the VS/GS written * out by the GS indirect setup kernel or the CPU for a direct draw. This is @@ -381,38 +381,26 @@ libagx_uncompact_prim(uint packed) } /* - * Translate EndPrimitive for LINE_STRIP or TRIANGLE_STRIP output prims into - * writes into the 32-bit output index buffer. We write the sequence (b, b + 1, - * b + 2, ..., b + n - 1, -1), where b (base) is the first vertex in the prim, n - * (count) is the number of verts in the prims, and -1 is the prim restart index - * used to signal the end of the prim. + * Write a strip into a 32-bit index buffer. This is the sequence: * - * For points, we write index buffers without restart, just as a sideband to - * pass data into the vertex shader. + * (b, b + 1, b + 2, ..., b + n - 1, -1) where -1 is the restart index + * + * For points, we write index buffers without restart just for remapping. */ static inline void -_libagx_end_primitive(GLOBAL uint32_t *index_buffer, uint32_t total_verts, - uint32_t verts_in_prim, uint32_t total_prims, - uint32_t index_offs, uint32_t geometry_base, bool restart) +_libagx_write_strip(GLOBAL uint32_t *index_buffer, uint32_t index_offset, + uint32_t vertex_offset, uint32_t verts_in_prim, + uint32_t stream, uint32_t stream_multiplier, uint32_t n) { - /* Previous verts/prims are from previous invocations plus earlier - * prims in this invocation. For the intra-invocation counts, we - * subtract the count for this prim from the inclusive sum NIR gives us. - */ - uint32_t previous_verts_in_invoc = (total_verts - verts_in_prim); - uint32_t previous_verts = previous_verts_in_invoc; - uint32_t previous_prims = restart ? (total_prims - 1) : 0; + bool restart = n > 1; + if (verts_in_prim < n) + return; - /* The indices are encoded as: (unrolled ID * output vertices) + vertex. */ - uint32_t index_base = geometry_base + previous_verts_in_invoc; - - /* Index buffer contains 1 index for each vertex and 1 for each prim */ - GLOBAL uint32_t *out = - &index_buffer[index_offs + previous_verts + previous_prims]; + GLOBAL uint32_t *out = &index_buffer[index_offset]; /* Write out indices for the strip */ for (uint32_t i = 0; i < verts_in_prim; ++i) { - out[i] = index_base + i; + out[i] = (vertex_offset + i) * stream_multiplier + stream; } if (restart) diff --git a/src/asahi/vulkan/hk_cmd_buffer.h b/src/asahi/vulkan/hk_cmd_buffer.h index 7de4187f57d..0c058ffe6b7 100644 --- a/src/asahi/vulkan/hk_cmd_buffer.h +++ b/src/asahi/vulkan/hk_cmd_buffer.h @@ -93,6 +93,9 @@ struct hk_root_descriptor_table { uint16_t api_gs; uint16_t _pad5; + uint16_t rasterization_stream; + uint16_t _pad6; + /* Mapping from varying slots written by the last vertex stage to UVS * indices. This mapping must be compatible with the fragment shader. */ diff --git a/src/asahi/vulkan/hk_cmd_draw.c b/src/asahi/vulkan/hk_cmd_draw.c index 07d42bb8dd5..eff4394464a 100644 --- a/src/asahi/vulkan/hk_cmd_draw.c +++ b/src/asahi/vulkan/hk_cmd_draw.c @@ -139,6 +139,22 @@ vk_conv_topology(VkPrimitiveTopology topology) } } +static bool +hk_rast_discard(struct hk_cmd_buffer *cmd) +{ + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; + + /* A non-zero rasterization stream acts as a rasterizer discard unless + * there's a multistream geometry shader bound. + */ + if (dyn->rs.rasterization_stream != 0) { + struct hk_api_shader *gs = cmd->state.gfx.shaders[MESA_SHADER_GEOMETRY]; + return !gs || !gs->variants[HK_GS_VARIANT_COUNT].info.gs.multistream; + } + + return dyn->rs.rasterizer_discard_enable; +} + static void hk_cmd_buffer_dirty_render_pass(struct hk_cmd_buffer *cmd) { @@ -1111,13 +1127,10 @@ hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw) { struct hk_device *dev = hk_cmd_buffer_device(cmd); struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors; - struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; struct hk_graphics_state *gfx = &cmd->state.gfx; struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY]; struct hk_shader *fs = hk_only_variant(gfx->shaders[MESA_SHADER_FRAGMENT]); - - bool rast_disc = dyn->rs.rasterizer_discard_enable; - struct hk_shader *count = hk_count_gs_variant(gs, rast_disc); + struct hk_shader *count = hk_count_gs_variant(gs); /* XXX: We should deduplicate this logic */ bool indirect = agx_is_indirect(draw.b) || @@ -1197,6 +1210,9 @@ hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct agx_draw draw) params.vs_grid[2] = params.gs_grid[2] = 1; if (gsi->shape == AGX_GS_SHAPE_DYNAMIC_INDEXED) { + /* Need to allocate heap if we haven't yet */ + hk_heap(cmd); + cmd->geom_index_buffer = dev->heap->va->addr; cmd->geom_index_count = dev->heap->size; } else { @@ -1455,13 +1471,10 @@ hk_launch_gs_prerast(struct hk_cmd_buffer *cmd, struct hk_cs *cs, struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY]; struct agx_grid grid_vs, grid_gs; - struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; - bool rast_disc = dyn->rs.rasterizer_discard_enable; - struct hk_shader *vs = hk_bound_sw_vs_before_gs(gfx); - struct hk_shader *main = hk_main_gs_variant(gs, rast_disc); - struct hk_shader *count = hk_count_gs_variant(gs, rast_disc); - struct hk_shader *pre_gs = hk_pre_gs_variant(gs, rast_disc); + struct hk_shader *main = hk_main_gs_variant(gs); + struct hk_shader *count = hk_count_gs_variant(gs); + struct hk_shader *pre_gs = hk_pre_gs_variant(gs); uint64_t geometry_params = desc->root.draw.geometry_params; unsigned count_words = count->info.gs.count_words; @@ -1727,9 +1740,11 @@ hk_flush_shaders(struct hk_cmd_buffer *cmd) /* Geometry shading overrides the restart index, reemit on rebind */ if (IS_SHADER_DIRTY(GEOMETRY)) { + struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state; struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY]; desc->root.draw.api_gs = gs && !gs->is_passthrough; + BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE); } struct hk_shader *hw_vs = hk_bound_hw_vs(gfx); @@ -2405,7 +2420,7 @@ hk_flush_ppp_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs, uint8_t **out) agx_ppp_push_merged(&ppp, FRAGMENT_CONTROL, cfg, linked_fs->b.fragment_control) { - cfg.tag_write_disable = dyn->rs.rasterizer_discard_enable; + cfg.tag_write_disable = hk_rast_discard(cmd); } } @@ -2496,7 +2511,7 @@ hk_flush_ppp_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs, uint8_t **out) } cfg.flat_shading_vertex = translate_ppp_vertex(gfx->provoking); - cfg.rasterizer_discard = dyn->rs.rasterizer_discard_enable; + cfg.rasterizer_discard = hk_rast_discard(cmd); /* We do not support unrestricted depth, so clamping is inverted from * clipping. This implementation seems to pass CTS without unrestricted @@ -2650,6 +2665,13 @@ hk_flush_dynamic_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs, desc->root_dirty = true; } + if (IS_DIRTY(RS_RASTERIZATION_STREAM)) { + desc->root.draw.rasterization_stream = dyn->rs.rasterization_stream; + desc->root_dirty = true; + + BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE); + } + if (fs_dirty || IS_DIRTY(DS_DEPTH_TEST_ENABLE) || IS_DIRTY(DS_DEPTH_COMPARE_OP)) { @@ -3131,7 +3153,7 @@ hk_flush_dynamic_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs, agx_push(out, VDM_STATE_VERTEX_UNKNOWN, cfg) { cfg.flat_shading_control = translate_vdm_vertex(gfx->provoking); - cfg.unknown_4 = cfg.unknown_5 = dyn->rs.rasterizer_discard_enable; + cfg.unknown_4 = cfg.unknown_5 = hk_rast_discard(cmd); cfg.generate_primitive_id = gfx->generate_primitive_id; } @@ -3562,14 +3584,6 @@ hk_draw(struct hk_cmd_buffer *cmd, uint16_t draw_id, struct agx_draw draw_) if (geom) { draw = hk_launch_gs_prerast(cmd, ccs, draw); - - /* We must not draw if the app specified rasterizer discard. This is - * required for both performance (it is pointless to rasterize and - * there are no side effects), but also correctness (no indirect draw - * descriptor will be filled out). - */ - if (dyn->rs.rasterizer_discard_enable) - continue; } if (adj) { diff --git a/src/asahi/vulkan/hk_nir_lower_descriptors.c b/src/asahi/vulkan/hk_nir_lower_descriptors.c index c3d92c44f3c..e7c6b083fc4 100644 --- a/src/asahi/vulkan/hk_nir_lower_descriptors.c +++ b/src/asahi/vulkan/hk_nir_lower_descriptors.c @@ -416,6 +416,9 @@ lower_uvs_index(nir_builder *b, nir_intrinsic_instr *intrin, void *data) case nir_intrinsic_load_tess_param_buffer_agx: return lower_sysval_to_root_table(b, intrin, draw.tess_params); + case nir_intrinsic_load_rasterization_stream: + return lower_sysval_to_root_table(b, intrin, draw.rasterization_stream); + case nir_intrinsic_load_is_first_fan_agx: { unsigned offset = hk_root_descriptor_offset(draw.provoking); b->cursor = nir_instr_remove(&intrin->instr); diff --git a/src/asahi/vulkan/hk_shader.c b/src/asahi/vulkan/hk_shader.c index 0f77fc1ad32..a170d473658 100644 --- a/src/asahi/vulkan/hk_shader.c +++ b/src/asahi/vulkan/hk_shader.c @@ -1276,67 +1276,44 @@ hk_compile_shader(struct hk_device *dev, struct vk_shader_compile_info *info, /* Compile all variants up front */ if (sw_stage == MESA_SHADER_GEOMETRY) { - for (unsigned rast_disc = 0; rast_disc < 2; ++rast_disc) { - struct hk_shader *main_variant = hk_main_gs_variant(obj, rast_disc); - struct hk_shader *count_variant = hk_count_gs_variant(obj, rast_disc); - bool last = (rast_disc + 1) == 2; + struct hk_shader *main_variant = hk_main_gs_variant(obj); + struct hk_shader *count_variant = hk_count_gs_variant(obj); - /* Each variant gets its own NIR. To save an extra clone, we use the - * original NIR for the last stage. - */ - nir_shader *clone = last ? nir : nir_shader_clone(NULL, nir); - nir_shader *count = NULL, *rast = NULL, *pre_gs = NULL; + nir_shader *count = NULL, *rast = NULL, *pre_gs = NULL; - NIR_PASS(_, clone, agx_nir_lower_gs, rast_disc, &count, &rast, &pre_gs, - &count_variant->info.gs); + NIR_PASS(_, nir, agx_nir_lower_gs, &count, &rast, &pre_gs, + &count_variant->info.gs); - if (!rast_disc) { - struct hk_shader *shader = &obj->variants[HK_GS_VARIANT_RAST]; + struct hk_shader *shader = &obj->variants[HK_GS_VARIANT_RAST]; + hk_lower_hw_vs(rast, shader, features); + shader->info.gs = count_variant->info.gs; + main_variant->info.gs = count_variant->info.gs; - hk_lower_hw_vs(rast, shader, features); - shader->info.gs = count_variant->info.gs; - } + struct { + nir_shader *in; + struct hk_shader *out; + } variants[] = { + {nir, hk_main_gs_variant(obj)}, + {pre_gs, hk_pre_gs_variant(obj)}, + {count, count_variant}, + {rast, &obj->variants[HK_GS_VARIANT_RAST]}, + }; - main_variant->info.gs = count_variant->info.gs; + for (unsigned v = 0; v < ARRAY_SIZE(variants); ++v) { + if (variants[v].in) { + result = hk_compile_nir( + dev, pAllocator, variants[v].in, info->flags, info->robustness, + NULL, features, variants[v].out, sw_stage, true, NULL); - struct { - nir_shader *in; - struct hk_shader *out; - } variants[] = { - {clone, hk_main_gs_variant(obj, rast_disc)}, - {pre_gs, hk_pre_gs_variant(obj, rast_disc)}, - {count, count_variant}, - {rast_disc ? NULL : rast, &obj->variants[HK_GS_VARIANT_RAST]}, - }; - - for (unsigned v = 0; v < ARRAY_SIZE(variants); ++v) { - if (variants[v].in) { - result = - hk_compile_nir(dev, pAllocator, variants[v].in, info->flags, - info->robustness, NULL, features, - variants[v].out, sw_stage, true, NULL); - if (result != VK_SUCCESS) { - hk_api_shader_destroy(&dev->vk, &obj->vk, pAllocator); - if (clone != nir) { - ralloc_free(nir); - } - - ralloc_free(clone); - ralloc_free(pre_gs); - ralloc_free(count); - ralloc_free(rast); - return result; - } + if (result != VK_SUCCESS) { + hk_api_shader_destroy(&dev->vk, &obj->vk, pAllocator); + ralloc_free(nir); + ralloc_free(pre_gs); + ralloc_free(count); + ralloc_free(rast); + return result; } } - - /* Nothing consumes this otherwise throw it away. - * - * TODO: We should just not generate it. - */ - if (rast_disc) { - ralloc_free(rast); - } } } else if (sw_stage == MESA_SHADER_VERTEX || sw_stage == MESA_SHADER_TESS_EVAL) { diff --git a/src/asahi/vulkan/hk_shader.h b/src/asahi/vulkan/hk_shader.h index a7dac8d3f79..fcade1cba1e 100644 --- a/src/asahi/vulkan/hk_shader.h +++ b/src/asahi/vulkan/hk_shader.h @@ -183,15 +183,12 @@ enum hk_gs_variant { /* Main compute shader */ HK_GS_VARIANT_MAIN, - HK_GS_VARIANT_MAIN_NO_RAST, /* Count compute shader */ HK_GS_VARIANT_COUNT, - HK_GS_VARIANT_COUNT_NO_RAST, /* Pre-GS compute shader */ HK_GS_VARIANT_PRE, - HK_GS_VARIANT_PRE_NO_RAST, HK_GS_VARIANTS, }; @@ -200,11 +197,8 @@ enum hk_gs_variant { static const char *hk_gs_variant_name[] = { [HK_GS_VARIANT_RAST] = "Rasterization", [HK_GS_VARIANT_MAIN] = "Main", - [HK_GS_VARIANT_MAIN_NO_RAST] = "Main (rast. discard)", [HK_GS_VARIANT_COUNT] = "Count", - [HK_GS_VARIANT_COUNT_NO_RAST] = "Count (rast. discard)", [HK_GS_VARIANT_PRE] = "Pre-GS", - [HK_GS_VARIANT_PRE_NO_RAST] = "Pre-GS (rast. discard)", }; /* clang-format on */ @@ -280,21 +274,21 @@ hk_any_variant(struct hk_api_shader *obj) } static struct hk_shader * -hk_main_gs_variant(struct hk_api_shader *obj, bool rast_disc) +hk_main_gs_variant(struct hk_api_shader *obj) { - return &obj->variants[HK_GS_VARIANT_MAIN + rast_disc]; + return &obj->variants[HK_GS_VARIANT_MAIN]; } static struct hk_shader * -hk_count_gs_variant(struct hk_api_shader *obj, bool rast_disc) +hk_count_gs_variant(struct hk_api_shader *obj) { - return &obj->variants[HK_GS_VARIANT_COUNT + rast_disc]; + return &obj->variants[HK_GS_VARIANT_COUNT]; } static struct hk_shader * -hk_pre_gs_variant(struct hk_api_shader *obj, bool rast_disc) +hk_pre_gs_variant(struct hk_api_shader *obj) { - return &obj->variants[HK_GS_VARIANT_PRE + rast_disc]; + return &obj->variants[HK_GS_VARIANT_PRE]; } #define HK_MAX_LINKED_USC_SIZE \ diff --git a/src/gallium/drivers/asahi/agx_disk_cache.c b/src/gallium/drivers/asahi/agx_disk_cache.c index 864f7ed5545..f3acee12c6c 100644 --- a/src/gallium/drivers/asahi/agx_disk_cache.c +++ b/src/gallium/drivers/asahi/agx_disk_cache.c @@ -37,15 +37,10 @@ agx_disk_cache_compute_key(struct disk_cache *cache, if (uncompiled->type == PIPE_SHADER_VERTEX || uncompiled->type == PIPE_SHADER_TESS_EVAL) key_size = sizeof(shader_key->vs); - else if (uncompiled->type == PIPE_SHADER_GEOMETRY) - key_size = sizeof(shader_key->gs); else if (uncompiled->type == PIPE_SHADER_FRAGMENT) key_size = sizeof(shader_key->fs); - else if (uncompiled->type == PIPE_SHADER_COMPUTE || - uncompiled->type == PIPE_SHADER_TESS_CTRL) - key_size = 0; else - unreachable("Unsupported shader stage"); + key_size = 0; memcpy(data, uncompiled->nir_sha1, hash_size); diff --git a/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c b/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c index aaceaf7bd69..4de075a4056 100644 --- a/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c +++ b/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c @@ -199,6 +199,8 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *intr, return load_sysval_root(b, 1, 16, &u->no_epilog_discard); case nir_intrinsic_load_clip_z_coeff_agx: return nir_f2f32(b, load_sysval_root(b, 1, 16, &u->clip_z_coeff)); + case nir_intrinsic_load_rasterization_stream: + return nir_imm_int(b, 0); case nir_intrinsic_load_depth_never_agx: /* TODO: Do we need this workaround for anything in GL? */ return nir_imm_intN_t(b, 0, 16); diff --git a/src/gallium/drivers/asahi/agx_state.c b/src/gallium/drivers/asahi/agx_state.c index 910a64639c4..b8df7281fbc 100644 --- a/src/gallium/drivers/asahi/agx_state.c +++ b/src/gallium/drivers/asahi/agx_state.c @@ -1386,7 +1386,6 @@ agx_bind_vertex_elements_state(struct pipe_context *pctx, void *cso) } DERIVE_HASH_TABLE(asahi_vs_shader_key); -DERIVE_HASH_TABLE(asahi_gs_shader_key); DERIVE_HASH_TABLE(asahi_fs_shader_key); DERIVE_HASH_TABLE(agx_fast_link_key); @@ -1593,10 +1592,8 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx, } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) { NIR_PASS(_, nir, agx_nir_lower_tcs); } else if (nir->info.stage == MESA_SHADER_GEOMETRY) { - struct asahi_gs_shader_key *key = &key_->gs; - - NIR_PASS(_, nir, agx_nir_lower_gs, key->rasterizer_discard, &gs_count, - &gs_copy, &pre_gs, &gs_info); + NIR_PASS(_, nir, agx_nir_lower_gs, &gs_count, &gs_copy, &pre_gs, + &gs_info); } else if (nir->info.stage == MESA_SHADER_FRAGMENT) { struct asahi_fs_shader_key *key = &key_->fs; @@ -1724,11 +1721,7 @@ agx_get_shader_variant(struct agx_screen *screen, struct pipe_context *pctx, } else if (so->type == PIPE_SHADER_VERTEX || so->type == PIPE_SHADER_TESS_EVAL) { memcpy(cloned_key, key, sizeof(struct asahi_vs_shader_key)); - } else if (so->type == PIPE_SHADER_GEOMETRY) { - memcpy(cloned_key, key, sizeof(struct asahi_gs_shader_key)); } else { - assert(gl_shader_stage_is_compute(so->type) || - so->type == PIPE_SHADER_TESS_CTRL); /* No key */ } @@ -1918,9 +1911,8 @@ agx_create_shader_state(struct pipe_context *pctx, nir->info.stage == MESA_SHADER_TESS_EVAL) { so->variants = asahi_vs_shader_key_table_create(so); so->linked_shaders = agx_fast_link_key_table_create(so); - } else if (nir->info.stage == MESA_SHADER_GEOMETRY) { - so->variants = asahi_gs_shader_key_table_create(so); - } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) { + } else if (nir->info.stage == MESA_SHADER_TESS_CTRL || + nir->info.stage == MESA_SHADER_GEOMETRY) { /* No variants */ so->variants = _mesa_hash_table_create(NULL, asahi_cs_shader_key_hash, asahi_cs_shader_key_equal); @@ -1958,6 +1950,7 @@ agx_create_shader_state(struct pipe_context *pctx, * acceptable for now. */ if ((so->type == PIPE_SHADER_TESS_CTRL) || + (so->type == PIPE_SHADER_GEOMETRY) || (so->type == PIPE_SHADER_FRAGMENT && !so->info.uses_fbfetch)) { union asahi_shader_key key = {0}; agx_get_shader_variant(agx_screen(pctx->screen), pctx, so, &key); @@ -1975,9 +1968,6 @@ agx_create_shader_state(struct pipe_context *pctx, union asahi_shader_key key = {0}; switch (so->type) { - case PIPE_SHADER_GEOMETRY: - break; - case PIPE_SHADER_TESS_EVAL: /* TODO: Tessellation shaders with shader-db */ return so; @@ -2256,12 +2246,10 @@ agx_update_gs(struct agx_context *ctx, const struct pipe_draw_info *info, tgt->stride = gs->xfb_strides[i]; } - struct asahi_gs_shader_key key = { - .rasterizer_discard = ctx->rast->base.rasterizer_discard, - }; - - return agx_update_shader(ctx, &ctx->gs, PIPE_SHADER_GEOMETRY, - (union asahi_shader_key *)&key); + ctx->gs = _mesa_hash_table_next_entry( + ctx->stage[PIPE_SHADER_GEOMETRY].shader->variants, NULL) + ->data; + return true; } static enum pipe_blendfactor @@ -5147,9 +5135,6 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, /* Launch the pre-rasterization parts of the geometry shader */ agx_launch_gs_prerast(batch, info, draws, indirect); - if (ctx->rast->base.rasterizer_discard) - return; - /* Setup to rasterize the GS results */ struct agx_gs_info *gsi = &ctx->gs->gs; info_gs = (struct pipe_draw_info){ @@ -5271,6 +5256,14 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, out = (void *)agx_vdm_draw((uint32_t *)out, 0 /* ignored for now */, draw, agx_primitive_for_pipe(info->mode)); + /* Barrier transform feedback writes on themselves for consistency. + * This is the other half of agx_legalize_xfb. + */ + if (ctx->gs && ctx->streamout.num_targets > 0) { + struct agx_device *dev = agx_device(ctx->base.screen); + out = (void *)agx_vdm_barrier((uint32_t *)out, dev->chip); + } + batch->vdm.current = out; assert((batch->vdm.current + AGX_VDM_STREAM_LINK_LENGTH) <= batch->vdm.end && "Failed to reserve sufficient space in encoder"); diff --git a/src/gallium/drivers/asahi/agx_state.h b/src/gallium/drivers/asahi/agx_state.h index 5ae930d4af4..057d7036e2e 100644 --- a/src/gallium/drivers/asahi/agx_state.h +++ b/src/gallium/drivers/asahi/agx_state.h @@ -514,16 +514,8 @@ struct asahi_fs_shader_key { }; static_assert(sizeof(struct asahi_fs_shader_key) == 40, "no holes"); -struct asahi_gs_shader_key { - /* If true, this GS is run only for its side effects (including XFB) */ - bool rasterizer_discard; - bool padding[7]; -}; -static_assert(sizeof(struct asahi_gs_shader_key) == 8, "no holes"); - union asahi_shader_key { struct asahi_vs_shader_key vs; - struct asahi_gs_shader_key gs; struct asahi_fs_shader_key fs; };