mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-01-24 21:10:24 +01:00
asahi: invert geometry shaders
instead of dumping GS outputs to memory and using a GS copy vertex shader, invert the GS to get a hardware vertex shader to use directly. theoretically, this reduces memory bandwidth at the cost of repeated work if we don't optimize well. perhaps more importantly, it should reduce heap usage.

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27616>
This commit is contained in:
parent 1ac3e48fcc
commit 86782156e7

4 changed files with 328 additions and 163 deletions
@@ -10,6 +10,7 @@
#include "gallium/include/pipe/p_defines.h"
#include "shaders/geometry.h"
#include "util/bitscan.h"
#include "util/list.h"
#include "util/macros.h"
#include "util/ralloc.h"
#include "util/u_math.h"
@@ -17,9 +18,16 @@
#include "nir.h"
#include "nir_builder_opcodes.h"
#include "nir_intrinsics.h"
#include "nir_intrinsics_indices.h"
#include "nir_xfb_info.h"
#include "shader_enums.h"

/* Marks a transform feedback store, which must not be stripped from the
 * prepass since that's where the transform feedback happens. Chosen as a
 * vendored flag not to alias other flags we'll see.
 */
#define ACCESS_XFB (ACCESS_IS_SWIZZLED_AMD)

enum gs_counter {
   GS_COUNTER_VERTICES = 0,
   GS_COUNTER_PRIMITIVES,
@@ -33,10 +41,6 @@ struct lower_gs_state {
   int static_count[GS_NUM_COUNTERS][MAX_VERTEX_STREAMS];
   nir_variable *outputs[NUM_TOTAL_VARYING_SLOTS][MAX_PRIM_OUT_SIZE];

   /* For the geometry output buffer */
   unsigned stride_B;
   unsigned offset_B[NUM_TOTAL_VARYING_SLOTS];

   /* The count buffer contains `count_stride_el` 32-bit words in a row for each
    * input primitive, for `input_primitives * count_stride_el * 4` total bytes.
    */
@@ -52,21 +56,6 @@ struct lower_gs_state {
   bool rasterizer_discard;
};

static uint64_t
outputs_rasterized(nir_shader *s)
{
   uint64_t outputs = s->info.outputs_written;

   /* Optimize out pointless gl_PointSize outputs. Bizarrely, these occur. We
    * need to preserve the transform feedback portion of the write, but we don't
    * bother saving for rasterization.
    */
   if (s->info.gs.output_primitive != MESA_PRIM_POINTS)
      outputs &= ~VARYING_BIT_PSIZ;

   return outputs;
}

/* Helpers for loading from the geometry state buffer */
static nir_def *
load_geometry_param_offset(nir_builder *b, uint32_t offset, uint8_t bytes)
@@ -131,18 +120,11 @@ add_counter(nir_builder *b, nir_def *counter, nir_def *increment)
}

/* Helpers for lowering I/O to variables */
bool
agx_lower_output_to_var(nir_builder *b, nir_instr *instr, void *data)
static void
lower_store_to_var(nir_builder *b, nir_intrinsic_instr *intr,
                   struct agx_lower_output_to_var_state *state)
{
   struct agx_lower_output_to_var_state *state = data;
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
   if (intr->intrinsic != nir_intrinsic_store_output)
      return false;

   b->cursor = nir_instr_remove(instr);
   b->cursor = nir_instr_remove(&intr->instr);
   nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
   unsigned component = nir_intrinsic_component(intr);
   nir_def *value = intr->src[0].ssa;
@@ -156,7 +138,7 @@ agx_lower_output_to_var(nir_builder *b, nir_instr *instr, void *data)
   if (!var) {
      assert(sem.location == VARYING_SLOT_PSIZ &&
             "otherwise in outputs_written");
      return true;
      return;
   }

   unsigned nr_components = glsl_get_components(glsl_without_array(var->type));
@@ -167,6 +149,19 @@ agx_lower_output_to_var(nir_builder *b, nir_instr *instr, void *data)
                  component);

   nir_store_var(b, var, value, BITFIELD_BIT(component));
}

bool
agx_lower_output_to_var(nir_builder *b, nir_instr *instr, void *data)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
   if (intr->intrinsic != nir_intrinsic_store_output)
      return false;

   lower_store_to_var(b, intr, data);
   return true;
}

@@ -237,6 +232,34 @@ calc_unrolled_id(nir_builder *b)
                   load_primitive_id(b));
}

static unsigned
output_vertex_id_stride(nir_shader *gs)
{
   /* round up to power of two for cheap multiply/division */
   return util_next_power_of_two(MAX2(gs->info.gs.vertices_out, 1));
}

/* Variant of calc_unrolled_id that uses a power-of-two stride for indices. This
 * is sparser (acceptable for index buffer values, not for count buffer
 * indices). It has the nice property of being cheap to invert, unlike
 * calc_unrolled_id. So, we use calc_unrolled_id for count buffers and
 * calc_unrolled_index_id for index values.
 *
 * This also multiplies by the appropriate stride to calculate the final index
 * base value.
 */
static nir_def *
calc_unrolled_index_id(nir_builder *b)
{
   unsigned vertex_stride = output_vertex_id_stride(b->shader);
   nir_def *primitives_log2 = load_geometry_param(b, primitives_log2);

   nir_def *instance = nir_ishl(b, load_instance_id(b), primitives_log2);
   nir_def *prim = nir_iadd(b, instance, load_primitive_id(b));

   return nir_imul_imm(b, prim, vertex_stride);
}

static nir_def *
load_count_address(nir_builder *b, struct lower_gs_state *state,
                   nir_def *unrolled_id, unsigned stream,
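The payoff of the power-of-two stride shows up when the rasterization shader has to reverse this mapping. A minimal scalar sketch of the round trip (plain C for illustration only, not the driver's API; the NIR lowering uses nir_udiv_imm/nir_umod_imm and the shift/mask visible in the rast shader below):

#include <assert.h>

/* Pack (instance, primitive, vertex) into one index, mirroring
 * calc_unrolled_index_id: unrolled = (instance << primitives_log2) + prim,
 * index = unrolled * vertex_stride + vert, with vertex_stride a power of two.
 */
static unsigned
pack_index(unsigned instance, unsigned prim, unsigned vert,
           unsigned primitives_log2, unsigned vertex_stride)
{
   unsigned unrolled = (instance << primitives_log2) + prim;
   return unrolled * vertex_stride + vert;
}

/* Invert it with shifts and masks only, which is what makes the encoding
 * "cheap to invert" per the comment above. */
static void
unpack_index(unsigned index, unsigned primitives_log2, unsigned vertex_stride,
             unsigned *instance, unsigned *prim, unsigned *vert)
{
   unsigned unrolled = index / vertex_stride; /* power-of-two: a shift */
   *vert = index % vertex_stride;             /* power-of-two: a mask */
   *instance = unrolled >> primitives_log2;
   *prim = unrolled & ((1u << primitives_log2) - 1);
}

int
main(void)
{
   unsigned instance, prim, vert;
   unpack_index(pack_index(3, 5, 2, 4, 8), 4, 8, &instance, &prim, &vert);
   assert(instance == 3 && prim == 5 && vert == 2);
   return 0;
}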
@@ -355,40 +378,153 @@ agx_nir_create_geometry_count_shader(nir_shader *gs, const nir_shader *libagx,
   return shader;
}

struct lower_gs_rast_state {
   nir_def *instance_id, *primitive_id, *output_id;
   struct agx_lower_output_to_var_state outputs;
   struct agx_lower_output_to_var_state selected;
};

static void
select_rast_output(nir_builder *b, nir_intrinsic_instr *intr,
                   struct lower_gs_rast_state *state)
{
   b->cursor = nir_instr_remove(&intr->instr);

   /* We only care about the rasterization stream in the rasterization
    * shader, so just ignore emits from other streams.
    */
   if (nir_intrinsic_stream_id(intr) != 0)
      return;

   u_foreach_bit64(slot, b->shader->info.outputs_written) {
      nir_def *orig = nir_load_var(b, state->selected.outputs[slot]);
      nir_def *data = nir_load_var(b, state->outputs.outputs[slot]);

      nir_def *value = nir_bcsel(
         b, nir_ieq(b, intr->src[0].ssa, state->output_id), data, orig);

      nir_store_var(b, state->selected.outputs[slot], value,
                    nir_component_mask(value->num_components));
   }
}

static bool
lower_to_gs_rast(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
   struct lower_gs_rast_state *state = data;

   switch (intr->intrinsic) {
   case nir_intrinsic_store_output:
      lower_store_to_var(b, intr, &state->outputs);
      return true;

   case nir_intrinsic_emit_vertex_with_counter:
      select_rast_output(b, intr, state);
      return true;

   case nir_intrinsic_load_primitive_id:
      nir_def_rewrite_uses(&intr->def, state->primitive_id);
      return true;

   case nir_intrinsic_load_instance_id:
      nir_def_rewrite_uses(&intr->def, state->instance_id);
      return true;

   case nir_intrinsic_load_num_vertices: {
      b->cursor = nir_before_instr(&intr->instr);
      nir_def_rewrite_uses(&intr->def, load_geometry_param(b, gs_grid[0]));
      return true;
   }

   case nir_intrinsic_load_flat_mask:
   case nir_intrinsic_load_provoking_last:
      /* Lowering the same in both GS variants */
      return lower_id(b, intr, data);

   case nir_intrinsic_end_primitive_with_counter:
   case nir_intrinsic_set_vertex_and_primitive_count:
      nir_instr_remove(&intr->instr);
      return true;

   default:
      return false;
   }
}

/*
 * Create a GS copy shader. This is a hardware vertex shader that copies each
 * vertex from the geometry output buffer to the Unified Vertex Store.
 * Create a GS rasterization shader. This is a hardware vertex shader that
 * shades each rasterized output vertex in parallel.
 */
static nir_shader *
agx_nir_create_gs_copy_shader(struct lower_gs_state *state,
                              uint64_t outputs_written,
                              unsigned clip_distance_array_size,
                              unsigned cull_distance_array_size,
                              enum mesa_prim output_primitive)
agx_nir_create_gs_rast_shader(const nir_shader *gs, const nir_shader *libagx)
{
   nir_builder b_ = nir_builder_init_simple_shader(MESA_SHADER_VERTEX,
                                                   &agx_nir_options, "GS copy");
   /* Don't muck up the original shader */
   nir_shader *shader = nir_shader_clone(NULL, gs);

   unsigned max_verts = output_vertex_id_stride(shader);

   /* Turn into a vertex shader run only for rasterization. Transform feedback
    * was handled in the prepass.
    */
   shader->info.stage = MESA_SHADER_VERTEX;
   shader->info.has_transform_feedback_varyings = false;
   memset(&shader->info.vs, 0, sizeof(shader->info.vs));
   shader->xfb_info = NULL;

   if (shader->info.name) {
      shader->info.name = ralloc_asprintf(shader, "%s_rast", shader->info.name);
   } else {
      shader->info.name = "gs rast";
   }

   nir_builder b_ =
      nir_builder_at(nir_before_impl(nir_shader_get_entrypoint(shader)));
   nir_builder *b = &b_;

   b->shader->info.clip_distance_array_size = clip_distance_array_size;
   b->shader->info.cull_distance_array_size = cull_distance_array_size;
   /* Optimize out pointless gl_PointSize outputs. Bizarrely, these occur. */
   if (shader->info.gs.output_primitive != MESA_PRIM_POINTS)
      shader->info.outputs_written &= ~VARYING_BIT_PSIZ;

   /* Get the base for this vertex */
   nir_def *vert_offs = nir_imul_imm(b, nir_load_vertex_id(b), state->stride_B);
   /* See calc_unrolled_index_id */
   nir_def *raw_id = nir_load_vertex_id(b);
   nir_def *output_id = nir_umod_imm(b, raw_id, max_verts);
   nir_def *unrolled = nir_udiv_imm(b, raw_id, max_verts);

   nir_def *state_buffer = load_geometry_param(b, output_buffer);
   nir_def *primitives_log2 = load_geometry_param(b, primitives_log2);
   nir_def *instance_id = nir_ushr(b, unrolled, primitives_log2);
   nir_def *primitive_id = nir_iand(
      b, unrolled,
      nir_iadd_imm(b, nir_ishl(b, nir_imm_int(b, 1), primitives_log2), -1));

   /* Each output must be copied */
   u_foreach_bit64(slot, outputs_written) {
      assert(state->outputs[slot][0] != NULL);
   struct lower_gs_rast_state rast_state = {
      .instance_id = instance_id,
      .primitive_id = primitive_id,
      .output_id = output_id,
   };

      nir_def *addr = nir_iadd(
         b, state_buffer,
         nir_u2u64(b, nir_iadd_imm(b, vert_offs, state->offset_B[slot])));
   u_foreach_bit64(slot, shader->info.outputs_written) {
      const char *slot_name =
         gl_varying_slot_name_for_stage(slot, MESA_SHADER_GEOMETRY);

      unsigned components = glsl_get_components(state->outputs[slot][0]->type);
      rast_state.outputs.outputs[slot] = nir_variable_create(
         shader, nir_var_shader_temp, glsl_vector_type(GLSL_TYPE_UINT, 4),
         ralloc_asprintf(shader, "%s-temp", slot_name));

      nir_def *value = nir_load_global_constant(b, addr, 4, components, 32);
      rast_state.selected.outputs[slot] = nir_variable_create(
         shader, nir_var_shader_temp, glsl_vector_type(GLSL_TYPE_UINT, 4),
         ralloc_asprintf(shader, "%s-selected", slot_name));
   }

   nir_shader_intrinsics_pass(shader, lower_to_gs_rast,
                              nir_metadata_block_index | nir_metadata_dominance,
                              &rast_state);

   b->cursor = nir_after_impl(b->impl);

   /* Forward each selected output to the rasterizer */
   u_foreach_bit64(slot, shader->info.outputs_written) {
      assert(rast_state.selected.outputs[slot] != NULL);
      nir_def *value = nir_load_var(b, rast_state.selected.outputs[slot]);

      /* We set NIR_COMPACT_ARRAYS so clip/cull distance needs to come all in
       * DIST0. Undo the offset if we need to.
@@ -400,9 +536,7 @@ agx_nir_create_gs_copy_shader(struct lower_gs_state *state,
      nir_store_output(b, value, nir_imm_int(b, offset),
                       .io_semantics.location = slot - offset,
                       .io_semantics.num_slots = 1,
                       .write_mask = nir_component_mask(components));

      b->shader->info.outputs_written |= BITFIELD64_BIT(slot);
                       .write_mask = nir_component_mask(value->num_components));
   }

   /* In OpenGL ES, it is legal to omit the point size write from the geometry
@@ -413,21 +547,24 @@ agx_nir_create_gs_copy_shader(struct lower_gs_state *state,
    *
    * This should not be load bearing for other APIs, but should be harmless.
    */
   bool is_points = output_primitive == MESA_PRIM_POINTS;
   bool is_points = gs->info.gs.output_primitive == MESA_PRIM_POINTS;

   if (!(outputs_written & VARYING_BIT_PSIZ) && is_points) {
   if (!(shader->info.outputs_written & VARYING_BIT_PSIZ) && is_points) {
      nir_store_output(b, nir_imm_float(b, 1.0), nir_imm_int(b, 0),
                       .io_semantics.location = VARYING_SLOT_PSIZ,
                       .io_semantics.num_slots = 1,
                       .write_mask = nir_component_mask(1));

      b->shader->info.outputs_written |= VARYING_BIT_PSIZ;
      shader->info.outputs_written |= VARYING_BIT_PSIZ;
   }

   UNUSED struct agx_uncompiled_shader_info info;
   agx_preprocess_nir(b->shader, NULL, false, &info);
   nir_opt_idiv_const(shader, 16);

   return b->shader;
   /* Preprocess it */
   UNUSED struct agx_uncompiled_shader_info info;
   agx_preprocess_nir(shader, libagx, false, &info);

   return shader;
}

static nir_def *
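The hunks above are the heart of the inversion: rather than one GS invocation writing all of its vertices to memory, each hardware vertex invocation re-runs the cloned GS body and latches only the emit whose running counter matches its own output_id. A scalar model of that selection (illustrative C only; the real pass rewrites NIR intrinsics, and gs work here is a stand-in):

#include <stdio.h>

/* Stand-in for the cloned GS body: emits max_verts vertices, one value each.
 * The nir_bcsel in select_rast_output keeps the emit matching output_id. */
static float
run_gs_selecting(unsigned output_id, unsigned max_verts)
{
   float selected = 0.0f;
   unsigned emit_counter = 0;

   for (unsigned v = 0; v < max_verts; ++v) {
      float value = (float)v * 2.0f; /* per-vertex GS work, recomputed here */

      if (emit_counter == output_id) /* the compare against output_id */
         selected = value;

      emit_counter++; /* emit_vertex_with_counter's running count */
   }

   return selected;
}

int
main(void)
{
   /* Each "vertex shader invocation" redoes the GS work for its own vertex. */
   for (unsigned id = 0; id < 4; ++id)
      printf("output %u -> %.1f\n", id, run_gs_selecting(id, 4));
   return 0;
}

This is the bandwidth-for-compute trade the commit message describes: per-vertex work may be recomputed up to vertices_out times instead of being stored to and reloaded from the geometry output buffer.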
@@ -486,8 +623,9 @@ static void
lower_end_primitive(nir_builder *b, nir_intrinsic_instr *intr,
                    struct lower_gs_state *state)
{
   assert(b->shader->info.gs.output_primitive != MESA_PRIM_POINTS &&
          "should've been removed");
   assert((intr->intrinsic == nir_intrinsic_set_vertex_and_primitive_count ||
           b->shader->info.gs.output_primitive != MESA_PRIM_POINTS) &&
          "endprimitive for points should've been removed");

   /* The GS is the last stage before rasterization, so if we discard the
    * rasterization, we don't output an index buffer, nothing will read it.
@@ -497,10 +635,13 @@ lower_end_primitive(nir_builder *b, nir_intrinsic_instr *intr,
   if (state->rasterizer_discard || stream != 0)
      return;

   libagx_end_primitive(b, load_geometry_param(b, output_index_buffer),
                        intr->src[0].ssa, intr->src[1].ssa, intr->src[2].ssa,
                        previous_vertices(b, state, 0, calc_unrolled_id(b)),
                        previous_primitives(b, state, 0, calc_unrolled_id(b)));
   libagx_end_primitive(
      b, load_geometry_param(b, output_index_buffer), intr->src[0].ssa,
      intr->src[1].ssa, intr->src[2].ssa,
      previous_vertices(b, state, 0, calc_unrolled_id(b)),
      previous_primitives(b, state, 0, calc_unrolled_id(b)),
      calc_unrolled_index_id(b),
      nir_imm_bool(b, b->shader->info.gs.output_primitive != MESA_PRIM_POINTS));
}

static unsigned
@@ -582,9 +723,10 @@ write_xfb(nir_builder *b, struct lower_gs_state *state, unsigned stream,
                             nir_imm_int(b, buffer), nir_imm_int(b, stride),
                             nir_imm_int(b, output.offset));

      nir_store_global(b, addr, 4,
                       nir_channels(b, value, output.component_mask),
                       nir_component_mask(count));
      nir_build_store_global(
         b, nir_channels(b, value, output.component_mask), addr,
         .align_mul = 4, .write_mask = nir_component_mask(count),
         .access = ACCESS_XFB);
   }
}

@@ -599,52 +741,6 @@ static void
lower_emit_vertex(nir_builder *b, nir_intrinsic_instr *intr,
                  struct lower_gs_state *state)
{
   nir_def *total_vertices = intr->src[0].ssa;

   /* All previous invocations are first in the geometry output buffer */
   unsigned stream = nir_intrinsic_stream_id(intr);
   nir_def *invocation_vertex_base =
      previous_vertices(b, state, stream, calc_unrolled_id(b));

   /* Calculate the number of vertices that this invocation will produce. This
    * is calculated by the count shader and then prefix summed, so calculate the
    * difference to undo the action of the prefix sum.
    */
   nir_def *next = previous_vertices(b, state, stream,
                                     nir_iadd_imm(b, calc_unrolled_id(b), 1));
   nir_def *our_num_verts = nir_isub(b, next, invocation_vertex_base);

   /* We can only emit vertices within bounds, since other entries in the
    * geometry state buffer might belong to other invocations. This is required
    * to pass glsl-1.50-geometry-end-primitive (without geometry shaders racing
    * each other).
    *
    * TODO: This could be optimized many ways.
    */
   if (!state->rasterizer_discard && stream == 0) {
      nir_if *nif = nir_push_if(b, nir_ult(b, total_vertices, our_num_verts));
      {
         /* The index into the geometry output buffer */
         nir_def *vertex_id =
            nir_iadd(b, invocation_vertex_base, total_vertices);

         nir_def *buffer = load_geometry_param(b, output_buffer);
         nir_def *vertex_offset = nir_imul_imm(b, vertex_id, state->stride_B);
         nir_def *vertex_addr =
            nir_iadd(b, buffer, nir_u2u64(b, vertex_offset));

         /* Copy each output where it belongs */
         u_foreach_bit64(slot, outputs_rasterized(b->shader)) {
            nir_def *addr = nir_iadd_imm(b, vertex_addr, state->offset_B[slot]);
            nir_def *value = nir_load_var(b, state->outputs[slot][0]);
            unsigned comps = glsl_get_components(state->outputs[slot][0]->type);

            nir_store_global(b, addr, 4, value, nir_component_mask(comps));
         }
      }
      nir_pop_if(b, nif);
   }

   /* Transform feedback is written for each decomposed output primitive. Since
    * we're writing strips, that means we output XFB for each vertex after the
    * first complete primitive is formed.
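The removed code above leaned on a detail worth keeping in mind for the hunks that survive: the count shader's per-invocation totals are prefix summed, so an invocation's own count is recovered as a difference of adjacent sums. A small standalone illustration (hypothetical counts; mirrors previous_vertices(id + 1) - previous_vertices(id) above):

#include <stdio.h>

int
main(void)
{
   /* Vertices emitted by each GS invocation, as the count shader records. */
   unsigned counts[4] = {3, 0, 5, 2};

   /* Exclusive prefix sum: prefix[i] = vertices before invocation i. */
   unsigned prefix[5] = {0};
   for (unsigned i = 0; i < 4; ++i)
      prefix[i + 1] = prefix[i] + counts[i];

   for (unsigned id = 0; id < 4; ++id) {
      unsigned base = prefix[id];                  /* previous_vertices(id) */
      unsigned ours = prefix[id + 1] - prefix[id]; /* undo the prefix sum   */
      printf("invocation %u: base %u, emits %u\n", id, base, ours);
   }
   return 0;
}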
@@ -697,7 +793,13 @@ lower_gs_instr(nir_builder *b, nir_intrinsic_instr *intr, void *state)

   switch (intr->intrinsic) {
   case nir_intrinsic_set_vertex_and_primitive_count:
      /* This instruction is only for the count shader, so just remove */
      /* This instruction is mostly for the count shader, so just remove. But
       * for points, we write the index buffer here so the rast shader can map.
       */
      if (b->shader->info.gs.output_primitive == MESA_PRIM_POINTS) {
         lower_end_primitive(b, intr, state);
      }

      break;

   case nir_intrinsic_end_primitive_with_counter: {
@@ -750,7 +852,7 @@ collect_components(nir_builder *b, nir_intrinsic_instr *intr, void *data)
 */
static nir_shader *
agx_nir_create_pre_gs(struct lower_gs_state *state, const nir_shader *libagx,
                      bool indexed, struct nir_xfb_info *xfb,
                      bool indexed, bool restart, struct nir_xfb_info *xfb,
                      unsigned vertices_per_prim, uint8_t streams,
                      unsigned invocations)
{
@@ -763,11 +865,11 @@ agx_nir_create_pre_gs(struct lower_gs_state *state, const nir_shader *libagx,

   /* Setup the draw from the rasterization stream (0). */
   if (!state->rasterizer_discard) {
      libagx_build_gs_draw(b, nir_load_geometry_param_buffer_agx(b),
                           nir_imm_bool(b, indexed),
                           previous_vertices(b, state, 0, unrolled_in_prims),
                           previous_primitives(b, state, 0, unrolled_in_prims),
                           nir_imm_int(b, state->stride_B));
      libagx_build_gs_draw(
         b, nir_load_geometry_param_buffer_agx(b), nir_imm_bool(b, indexed),
         previous_vertices(b, state, 0, unrolled_in_prims),
         restart ? previous_primitives(b, state, 0, unrolled_in_prims)
                 : nir_imm_int(b, 0));
   }

   /* Determine the number of primitives generated in each stream */
@@ -984,6 +1086,39 @@ agx_nir_lower_gs_instancing(nir_shader *gs)
                       index);
}

static bool
strip_side_effects(nir_builder *b, nir_intrinsic_instr *intr, void *_)
{
   switch (intr->intrinsic) {
   case nir_intrinsic_store_global:
   case nir_intrinsic_global_atomic:
   case nir_intrinsic_global_atomic_swap:
      break;
   default:
      return false;
   }

   /* If there's a side effect that's actually required for the prepass, we have
    * to keep it in.
    */
   if (nir_intrinsic_infos[intr->intrinsic].has_dest &&
       !list_is_empty(&intr->def.uses))
      return false;

   /* Do not strip transform feedback stores, the rasterization shader doesn't
    * execute them.
    */
   if (intr->intrinsic == nir_intrinsic_store_global &&
       nir_intrinsic_access(intr) & ACCESS_XFB)
      return false;

   /* Otherwise, remove the dead instruction. The rasterization shader will
    * execute the side effect so the side effect still happens at least once.
    */
   nir_instr_remove(&intr->instr);
   return true;
}

static void
link_libagx(nir_shader *nir, const nir_shader *libagx)
{
@@ -1068,22 +1203,10 @@ agx_nir_lower_gs(nir_shader *gs, const nir_shader *libagx,
      NIR_PASS(progress, gs, nir_opt_loop_unroll);
   } while (progress);

   NIR_PASS(_, gs, nir_shader_intrinsics_pass, lower_id,
            nir_metadata_block_index | nir_metadata_dominance, NULL);

   link_libagx(gs, libagx);

   NIR_PASS(_, gs, nir_lower_idiv,
            &(const nir_lower_idiv_options){.allow_fp16 = true});

   /* All those variables we created should've gone away by now */
   NIR_PASS(_, gs, nir_remove_dead_variables, nir_var_function_temp, NULL);

   /* If we know counts at compile-time we can simplify, so try to figure out
    * the counts statically.
    */
   struct lower_gs_state gs_state = {
      .stride_B = 0,
      .rasterizer_discard = rasterizer_discard,
   };

@@ -1102,6 +1225,19 @@ agx_nir_lower_gs(nir_shader *gs, const nir_shader *libagx,
      }
   }

   *gs_copy = agx_nir_create_gs_rast_shader(gs, libagx);

   NIR_PASS(_, gs, nir_shader_intrinsics_pass, lower_id,
            nir_metadata_block_index | nir_metadata_dominance, NULL);

   link_libagx(gs, libagx);

   NIR_PASS(_, gs, nir_lower_idiv,
            &(const nir_lower_idiv_options){.allow_fp16 = true});

   /* All those variables we created should've gone away by now */
   NIR_PASS(_, gs, nir_remove_dead_variables, nir_var_function_temp, NULL);

   /* If there is any unknown count, we need a geometry count shader */
   if (gs_state.count_stride_el > 0)
      *gs_count = agx_nir_create_geometry_count_shader(gs, libagx, &gs_state);
@@ -1123,11 +1259,6 @@ agx_nir_lower_gs(nir_shader *gs, const nir_shader *libagx,
      }

      state.outputs[slot] = gs_state.outputs[slot][0];

      /* Assume fp32 output */
      unsigned size_B = 4 * component_counts[slot];
      gs_state.offset_B[slot] = gs_state.stride_B;
      gs_state.stride_B += size_B;
   }

   NIR_PASS(_, gs, nir_shader_instructions_pass, agx_lower_output_to_var,
@@ -1136,6 +1267,13 @@ agx_nir_lower_gs(nir_shader *gs, const nir_shader *libagx,
   NIR_PASS(_, gs, nir_shader_intrinsics_pass, lower_gs_instr,
            nir_metadata_none, &gs_state);

   /* Determine if we are guaranteed to rasterize at least one vertex, so that
    * we can strip the prepass of side effects knowing they will execute in the
    * rasterization shader.
    */
   bool rasterizes_at_least_one_vertex =
      !rasterizer_discard && gs_state.static_count[0][0] > 0;

   /* Clean up after all that lowering we did */
   nir_lower_global_vars_to_local(gs);
   do {
@@ -1151,6 +1289,15 @@ agx_nir_lower_gs(nir_shader *gs, const nir_shader *libagx,
      NIR_PASS(progress, gs, nir_opt_dead_cf);
      NIR_PASS(progress, gs, nir_opt_dce);
      NIR_PASS(progress, gs, nir_opt_loop_unroll);

      /* When rasterizing, we try to move side effects to the rasterizer shader
       * and strip the prepass of the dead side effects. Run this in the opt
       * loop because it interacts with nir_opt_dce.
       */
      if (rasterizes_at_least_one_vertex) {
         NIR_PASS(progress, gs, nir_shader_intrinsics_pass, strip_side_effects,
                  nir_metadata_block_index | nir_metadata_dominance, NULL);
      }
   } while (progress);

   /* All those variables we created should've gone away by now */
@@ -1162,12 +1309,8 @@ agx_nir_lower_gs(nir_shader *gs, const nir_shader *libagx,
            nir_metadata_block_index | nir_metadata_dominance, NULL);

   /* Create auxiliary programs */
   *gs_copy = agx_nir_create_gs_copy_shader(
      &gs_state, outputs_rasterized(gs), gs->info.clip_distance_array_size,
      gs->info.cull_distance_array_size, gs->info.gs.output_primitive);

   *pre_gs = agx_nir_create_pre_gs(
      &gs_state, libagx, gs->info.gs.output_primitive != MESA_PRIM_POINTS,
      &gs_state, libagx, true, gs->info.gs.output_primitive != MESA_PRIM_POINTS,
      gs->xfb_info, verts_in_output_prim(gs), gs->info.gs.active_stream_mask,
      gs->info.gs.invocations);

@@ -12,6 +12,16 @@ align(uint x, uint y)
   return (x + y - 1) & ~(y - 1);
}

/* Compatible with util/u_math.h */
static inline uint
util_logbase2_ceil(uint n)
{
   if (n <= 1)
      return 0;
   else
      return 32 - clz(n - 1);
}

/* Swap the two non-provoking vertices third vert in odd triangles. This
 * generates a vertex ID list with a consistent winding order.
 *
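The clz form above is the standard closed form for ceil(log2(n)). A host-side check of the identity it relies on (a sketch assuming GCC/Clang's __builtin_clz as the counterpart of OpenCL's clz):

#include <assert.h>

/* Reference: smallest l with (1 << l) >= n. */
static unsigned
logbase2_ceil_ref(unsigned n)
{
   unsigned l = 0;
   while ((1u << l) < n)
      l++;
   return l;
}

int
main(void)
{
   /* For n > 1, ceil(log2(n)) == 32 - clz(n - 1). */
   for (unsigned n = 2; n < 100000; ++n)
      assert(32u - (unsigned)__builtin_clz(n - 1) == logbase2_ceil_ref(n));
   return 0;
}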
@@ -279,41 +289,46 @@ libagx_setup_xfb_buffer(global struct agx_geometry_params *p, uint i)
 * b + 2, ..., b + n - 1, -1), where b (base) is the first vertex in the prim, n
 * (count) is the number of verts in the prims, and -1 is the prim restart index
 * used to signal the end of the prim.
 *
 * For points, we write index buffers without restart, just as a sideband to
 * pass data into the vertex shader.
 */
void
libagx_end_primitive(global int *index_buffer, uint total_verts,
                     uint verts_in_prim, uint total_prims,
                     uint invocation_vertex_base, uint invocation_prim_base)
                     uint invocation_vertex_base, uint invocation_prim_base,
                     uint geometry_base, bool restart)
{
   /* Previous verts/prims are from previous invocations plus earlier
    * prims in this invocation. For the intra-invocation counts, we
    * subtract the count for this prim from the inclusive sum NIR gives us.
    */
   uint previous_verts = invocation_vertex_base + (total_verts - verts_in_prim);
   uint previous_prims = invocation_prim_base + (total_prims - 1);
   uint previous_verts_in_invoc = (total_verts - verts_in_prim);
   uint previous_verts = invocation_vertex_base + previous_verts_in_invoc;
   uint previous_prims = restart ? invocation_prim_base + (total_prims - 1) : 0;

   /* The indices are encoded as: (unrolled ID * output vertices) + vertex. */
   uint index_base = geometry_base + previous_verts_in_invoc;

   /* Index buffer contains 1 index for each vertex and 1 for each prim */
   global int *out = &index_buffer[previous_verts + previous_prims];

   /* Write out indices for the strip */
   for (uint i = 0; i < verts_in_prim; ++i) {
      out[i] = previous_verts + i;
      out[i] = index_base + i;
   }

   out[verts_in_prim] = -1;
   if (restart)
      out[verts_in_prim] = -1;
}

void
libagx_build_gs_draw(global struct agx_geometry_params *p, bool indexed,
                     uint vertices, uint primitives, uint output_stride_B)
                     uint vertices, uint primitives)
{
   global uint *descriptor = p->indirect_desc;
   global struct agx_geometry_state *state = p->state;

   /* Allocate the output buffer (per vertex) */
   p->output_buffer = (global uint *)(state->heap + state->heap_bottom);
   state->heap_bottom += align(vertices * output_stride_B, 4);

   /* Setup the indirect draw descriptor */
   if (indexed) {
      uint indices = vertices + primitives; /* includes restart indices */
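Putting libagx_end_primitive together with the power-of-two stride: a host-side sketch of the index buffer this produces, under the assumption of vertices_out = 5 (stride rounds up to 8) and each input primitive of instance 0 emitting one 4-vertex strip:

#include <stdio.h>

int
main(void)
{
   int index_buffer[10];
   unsigned stride = 8; /* util_next_power_of_two(5) */

   for (unsigned prim = 0; prim < 2; ++prim) {
      unsigned index_base = prim * stride; /* geometry_base for instance 0 */
      int *out = &index_buffer[prim * 5];  /* 4 vertices + 1 restart each  */

      for (unsigned i = 0; i < 4; ++i)
         out[i] = index_base + i;
      out[4] = -1; /* restart: closes the strip (skipped for points) */
   }

   for (unsigned i = 0; i < 10; ++i)
      printf("%d ", index_buffer[i]);
   printf("\n"); /* prints: 0 1 2 3 -1 8 9 10 11 -1 */
   return 0;
}

The gap between 3 and 8 is the "sparser" encoding the lowering code accepts for index values: the rasterization shader divides each index by the stride to find its primitive and takes the remainder as its output vertex.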
@@ -367,6 +382,8 @@ libagx_gs_setup_indirect(global struct agx_geometry_params *p,
   p->gs_grid[1] = instance_count;
   p->gs_grid[2] = 1;

   p->primitives_log2 = util_logbase2_ceil(prim_per_instance);

   /* If indexing is enabled, the third word is the offset into the index buffer
    * in elements. Apply that offset now that we have it. For a hardware
    * indirect draw, the hardware would do this for us, but for software input
@@ -113,9 +113,6 @@ struct agx_geometry_params {
   /* Pointers to transform feedback buffer offsets in bytes */
   GLOBAL(uint) xfb_offs_ptrs[MAX_SO_BUFFERS];

   /* Output (vertex) buffer, allocated by pre-GS. */
   GLOBAL(uint) output_buffer;

   /* Output index buffer, allocated by pre-GS. */
   GLOBAL(uint) output_index_buffer;

@@ -149,18 +146,24 @@ struct agx_geometry_params {
   /* Number of input vertices, part of the stride for the vertex buffer */
   uint32_t input_vertices;

   /* Number of input primitives, calculated by the CPU for a direct draw or the
    * GS indirect setup kernel for an indirect draw.
   /* Number of input primitives across all instances, calculated by the CPU for
    * a direct draw or the GS indirect setup kernel for an indirect draw.
    */
   uint32_t input_primitives;

   /* Number of input primitives per instance, rounded up to a power-of-two and
    * with the base-2 log taken. This is used to partition the output vertex IDs
    * efficiently.
    */
   uint32_t primitives_log2;

   /* Number of bytes output by the GS count shader per input primitive (may be
    * 0), written by CPU and consumed by indirect draw setup shader for
    * allocating counts.
    */
   uint32_t count_buffer_stride;
} PACKED;
AGX_STATIC_ASSERT(sizeof(struct agx_geometry_params) == 83 * 4);
AGX_STATIC_ASSERT(sizeof(struct agx_geometry_params) == 82 * 4);

struct agx_tess_params {
   /* Persistent (cross-draw) geometry state */
@@ -4178,6 +4178,8 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer,
   params.gs_grid[0] =
      u_decomposed_prims_for_vertices(info->mode, draw->count);

   params.primitives_log2 = util_logbase2_ceil(params.gs_grid[0]);

   params.input_primitives = params.gs_grid[0] * info->instance_count;
   params.input_vertices = draw->count;

@@ -5066,7 +5068,7 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
   /* Setup to rasterize the GS results */
   info_gs = (struct pipe_draw_info){
      .mode = ctx->gs->gs_output_mode,
      .index_size = ctx->gs->gs_output_mode != MESA_PRIM_POINTS ? 4 : 0,
      .index_size = 4,
      .primitive_restart = ctx->gs->gs_output_mode != MESA_PRIM_POINTS,
      .restart_index = ~0,
      .index.resource = ctx->heap,