mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-05 07:28:11 +02:00
ac/nir/lower_ngg: add & use new scalar helpers for GS loads/stores
This simplifies the code and scalarizes the loads/stores. Scalar loads/stores will make it easy to forward constant output components from stores to loads.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35352>
This commit is contained in:
parent
f407129b7f
commit
4b6ae11207
3 changed files with 57 additions and 73 deletions
|
|
@ -228,13 +228,18 @@ ac_nir_ngg_build_streamout_buffer_info(nir_builder *b,
|
|||
nir_def *buffer_offsets_ret[4],
|
||||
nir_def *emit_prim_ret[4]);
|
||||
|
||||
unsigned
|
||||
ac_nir_get_lds_gs_out_slot_offset(ac_nir_prerast_out *pr_out, gl_varying_slot slot, unsigned component);
|
||||
|
||||
unsigned
|
||||
ac_nir_ngg_get_xfb_lds_offset(ac_nir_prerast_out *pr_out, gl_varying_slot slot, unsigned component,
|
||||
bool data_is_16bit);
|
||||
|
||||
/* Store a single scalar GS output component to shared memory.
 *
 * "value" must have exactly one component (the definition asserts this).
 * The store is emitted with nir_store_shared at address "vtxptr", using the
 * result of ac_nir_get_lds_gs_out_slot_offset(pr_out, slot, component) as the
 * constant base offset and an alignment of 4.
 */
void
|
||||
ac_nir_store_shared_gs_out(nir_builder *b, nir_def *value, nir_def *vtxptr, ac_nir_prerast_out *pr_out,
|
||||
gl_varying_slot slot, unsigned component);
|
||||
|
||||
/* Load a single scalar GS output component from shared memory.
 *
 * Emits a one-component nir_load_shared of "bit_size" bits at address
 * "vtxptr", using the result of
 * ac_nir_get_lds_gs_out_slot_offset(pr_out, slot, component) as the constant
 * base offset and an alignment of 4, and returns the loaded def.
 */
nir_def *
|
||||
ac_nir_load_shared_gs_out(nir_builder *b, unsigned bit_size, nir_def *vtxptr, ac_nir_prerast_out *pr_out,
|
||||
gl_varying_slot slot, unsigned component);
|
||||
|
||||
void
|
||||
ac_nir_ngg_build_streamout_vertex(nir_builder *b, nir_xfb_info *info,
|
||||
unsigned stream, nir_def *so_buffer[4],
|
||||
|
|
|
|||
|
|
@ -164,28 +164,14 @@ lower_ngg_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *intri
|
|||
*/
|
||||
u_foreach_bit64(slot, b->shader->info.outputs_written) {
|
||||
unsigned mask = gs_output_component_mask_with_stream(&s->out.infos[slot], stream);
|
||||
|
||||
nir_def **output = s->out.outputs[slot];
|
||||
nir_def *undef = nir_undef(b, 1, 32);
|
||||
|
||||
while (mask) {
|
||||
int start, count;
|
||||
u_bit_scan_consecutive_range(&mask, &start, &count);
|
||||
nir_def *values[4] = {0};
|
||||
for (int c = start; c < start + count; ++c) {
|
||||
if (!output[c]) {
|
||||
/* The shader hasn't written this output. */
|
||||
values[c - start] = undef;
|
||||
} else {
|
||||
assert(output[c]->bit_size == 32);
|
||||
values[c - start] = output[c];
|
||||
}
|
||||
}
|
||||
u_foreach_bit(c, mask) {
|
||||
/* The shader hasn't written this output yet. */
|
||||
if (!output[c])
|
||||
continue;
|
||||
|
||||
nir_def *store_val = nir_vec(b, values, (unsigned)count);
|
||||
nir_store_shared(b, store_val, gs_emit_vtx_addr,
|
||||
.base = ac_nir_get_lds_gs_out_slot_offset(&s->out, slot, start),
|
||||
.align_mul = 4);
|
||||
ac_nir_store_shared_gs_out(b, output[c], gs_emit_vtx_addr, &s->out, slot, c);
|
||||
}
|
||||
|
||||
/* Clear all outputs (they are undefined after emit_vertex) */
|
||||
|
|
@ -202,21 +188,16 @@ lower_ngg_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *intri
|
|||
nir_def **output_hi = s->out.outputs_16bit_hi[slot];
|
||||
nir_def *undef = nir_undef(b, 1, 16);
|
||||
|
||||
while (mask) {
|
||||
int start, count;
|
||||
u_bit_scan_consecutive_range(&mask, &start, &count);
|
||||
nir_def *values[4] = {0};
|
||||
for (int c = start; c < start + count; ++c) {
|
||||
nir_def *lo = output_lo[c] ? output_lo[c] : undef;
|
||||
nir_def *hi = output_hi[c] ? output_hi[c] : undef;
|
||||
u_foreach_bit(c, mask) {
|
||||
/* The shader hasn't written this output yet. */
|
||||
if (!output_lo[c] && !output_hi[c])
|
||||
continue;
|
||||
|
||||
values[c - start] = nir_pack_32_2x16_split(b, lo, hi);
|
||||
}
|
||||
nir_def *lo = output_lo[c] ? output_lo[c] : undef;
|
||||
nir_def *hi = output_hi[c] ? output_hi[c] : undef;
|
||||
nir_def *store_val = nir_pack_32_2x16_split(b, lo, hi);
|
||||
|
||||
nir_def *store_val = nir_vec(b, values, (unsigned)count);
|
||||
nir_store_shared(b, store_val, gs_emit_vtx_addr,
|
||||
.base = ac_nir_get_lds_gs_out_slot_offset(&s->out, VARYING_SLOT_VAR0_16BIT + slot, start),
|
||||
.align_mul = 4);
|
||||
ac_nir_store_shared_gs_out(b, store_val, gs_emit_vtx_addr, &s->out, VARYING_SLOT_VAR0_16BIT + slot, c);
|
||||
}
|
||||
|
||||
/* Clear all outputs (they are undefined after emit_vertex) */
|
||||
|
|
@ -383,16 +364,9 @@ ngg_gs_process_out_vertex(nir_builder *b, nir_def *out_vtx_lds_addr, lower_ngg_g
|
|||
u_foreach_bit64(slot, b->shader->info.outputs_written) {
|
||||
unsigned mask = gs_output_component_mask_with_stream(&s->out.infos[slot], 0);
|
||||
|
||||
while (mask) {
|
||||
int start, count;
|
||||
u_bit_scan_consecutive_range(&mask, &start, &count);
|
||||
nir_def *load =
|
||||
nir_load_shared(b, count, 32, exported_out_vtx_lds_addr,
|
||||
.base = ac_nir_get_lds_gs_out_slot_offset(&s->out, slot, start),
|
||||
.align_mul = 4);
|
||||
|
||||
for (int i = 0; i < count; i++)
|
||||
s->out.outputs[slot][start + i] = nir_channel(b, load, i);
|
||||
u_foreach_bit(c, mask) {
|
||||
s->out.outputs[slot][c] = ac_nir_load_shared_gs_out(b, 32, exported_out_vtx_lds_addr,
|
||||
&s->out, slot, c);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -402,24 +376,15 @@ ngg_gs_process_out_vertex(nir_builder *b, nir_def *out_vtx_lds_addr, lower_ngg_g
|
|||
const unsigned mask_hi = gs_output_component_mask_with_stream(&s->out.infos_16bit_hi[i], 0);
|
||||
unsigned mask = mask_lo | mask_hi;
|
||||
|
||||
while (mask) {
|
||||
int start, count;
|
||||
u_bit_scan_consecutive_range(&mask, &start, &count);
|
||||
nir_def *load =
|
||||
nir_load_shared(b, count, 32, exported_out_vtx_lds_addr,
|
||||
.base = ac_nir_get_lds_gs_out_slot_offset(&s->out, VARYING_SLOT_VAR0_16BIT + i, start),
|
||||
.align_mul = 4);
|
||||
u_foreach_bit(c, mask) {
|
||||
nir_def *load_val = ac_nir_load_shared_gs_out(b, 32, exported_out_vtx_lds_addr,
|
||||
&s->out, VARYING_SLOT_VAR0_16BIT + i, c);
|
||||
|
||||
for (int j = 0; j < count; j++) {
|
||||
nir_def *val = nir_channel(b, load, j);
|
||||
unsigned comp = start + j;
|
||||
if (mask_lo & BITFIELD_BIT(c))
|
||||
s->out.outputs_16bit_lo[i][c] = nir_unpack_32_2x16_split_x(b, load_val);
|
||||
|
||||
if (mask_lo & BITFIELD_BIT(comp))
|
||||
s->out.outputs_16bit_lo[i][comp] = nir_unpack_32_2x16_split_x(b, val);
|
||||
|
||||
if (mask_hi & BITFIELD_BIT(comp))
|
||||
s->out.outputs_16bit_hi[i][comp] = nir_unpack_32_2x16_split_y(b, val);
|
||||
}
|
||||
if (mask_hi & BITFIELD_BIT(c))
|
||||
s->out.outputs_16bit_hi[i][c] = nir_unpack_32_2x16_split_y(b, load_val);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -630,11 +595,8 @@ ngg_gs_cull_primitive(nir_builder *b, nir_def *tid_in_tg, nir_def *max_vtxcnt,
|
|||
|
||||
for (unsigned i = 0; i < s->num_vertices_per_primitive; i++) {
|
||||
/* Load X, Y, W position components. Z is loaded only if we clip against POS. */
|
||||
for (unsigned c = 0; c < 4; c == 1 && !clip_against_pos ? c += 2 : c++) {
|
||||
pos[i][c] = nir_load_shared(b, 1, 32, vtxptr[i],
|
||||
.base = ac_nir_get_lds_gs_out_slot_offset(&s->out, VARYING_SLOT_POS, c),
|
||||
.align_mul = 4);
|
||||
}
|
||||
for (unsigned c = 0; c < 4; c == 1 && !clip_against_pos ? c += 2 : c++)
|
||||
pos[i][c] = ac_nir_load_shared_gs_out(b, 32, vtxptr[i], &s->out, VARYING_SLOT_POS, c);
|
||||
}
|
||||
|
||||
nir_def *accepted_by_clipdist = nir_imm_true(b);
|
||||
|
|
@ -650,8 +612,8 @@ ngg_gs_cull_primitive(nir_builder *b, nir_def *tid_in_tg, nir_def *max_vtxcnt,
|
|||
if (!clip_against_pos) {
|
||||
for (unsigned i = 0; i < s->num_vertices_per_primitive; i++) {
|
||||
for (unsigned c = 0; c < 4; c++) {
|
||||
unsigned offset = ac_nir_get_lds_gs_out_slot_offset(&s->out, VARYING_SLOT_CLIP_VERTEX, c);
|
||||
clipvertex[i][c] = nir_load_shared(b, 1, 32, vtxptr[i], .base = offset, .align_mul = 4);
|
||||
clipvertex[i][c] = ac_nir_load_shared_gs_out(b, 32, vtxptr[i], &s->out,
|
||||
VARYING_SLOT_CLIP_VERTEX, c);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -668,10 +630,10 @@ ngg_gs_cull_primitive(nir_builder *b, nir_def *tid_in_tg, nir_def *max_vtxcnt,
|
|||
} else {
|
||||
/* Load clip distances. */
|
||||
u_foreach_bit(c, s->options->cull_clipdist_mask) {
|
||||
unsigned offset = ac_nir_get_lds_gs_out_slot_offset(&s->out, VARYING_SLOT_CLIP_DIST0 + c / 4, c % 4);
|
||||
|
||||
for (unsigned i = 0; i < s->num_vertices_per_primitive; i++)
|
||||
clipdist[i][c] = nir_load_shared(b, 1, 32, vtxptr[i], .base = offset, .align_mul = 4);
|
||||
for (unsigned i = 0; i < s->num_vertices_per_primitive; i++) {
|
||||
clipdist[i][c] = ac_nir_load_shared_gs_out(b, 32, vtxptr[i], &s->out,
|
||||
VARYING_SLOT_CLIP_DIST0 + c / 4, c % 4);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1310,7 +1310,7 @@ ac_nir_ngg_build_streamout_buffer_info(nir_builder *b,
|
|||
}
|
||||
}
|
||||
|
||||
unsigned
|
||||
static unsigned
|
||||
ac_nir_get_lds_gs_out_slot_offset(ac_nir_prerast_out *pr_out, gl_varying_slot slot, unsigned component)
|
||||
{
|
||||
assert(component < 4);
|
||||
|
|
@ -1357,6 +1357,23 @@ ac_nir_ngg_get_xfb_lds_offset(ac_nir_prerast_out *pr_out, gl_varying_slot slot,
|
|||
return lds_slot_offset + util_bitcount(lds_component_mask & BITFIELD_MASK(component)) * 4;
|
||||
}
|
||||
|
||||
void
|
||||
ac_nir_store_shared_gs_out(nir_builder *b, nir_def *value, nir_def *vtxptr, ac_nir_prerast_out *pr_out,
|
||||
gl_varying_slot slot, unsigned component)
|
||||
{
|
||||
assert(value->num_components == 1);
|
||||
unsigned offset = ac_nir_get_lds_gs_out_slot_offset(pr_out, slot, component);
|
||||
nir_store_shared(b, value, vtxptr, .base = offset, .align_mul = 4);
|
||||
}
|
||||
|
||||
nir_def *
|
||||
ac_nir_load_shared_gs_out(nir_builder *b, unsigned bit_size, nir_def *vtxptr, ac_nir_prerast_out *pr_out,
|
||||
gl_varying_slot slot, unsigned component)
|
||||
{
|
||||
unsigned offset = ac_nir_get_lds_gs_out_slot_offset(pr_out, slot, component);
|
||||
return nir_load_shared(b, 1, bit_size, vtxptr, .base = offset, .align_mul = 4);
|
||||
}
|
||||
|
||||
void
|
||||
ac_nir_ngg_build_streamout_vertex(nir_builder *b, nir_xfb_info *info,
|
||||
unsigned stream, nir_def *so_buffer[4],
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue