nir: Generate load_ubo_vec4 directly for !PIPE_CAP_NATIVE_INTEGERS

The prog_to_nir->NIR-to-TGSI change ended up causing regressions on r300
and on svga against r300-class hardware, because nir_lower_uniforms_to_ubo()
introduced shifts that nir_lower_ubo_vec4() then tried to reverse, but which
NIR couldn't prove to be no-ops (shifting up and back down may drop bits),
and the hardware can't do the integer ops.

Instead, make it so that nir_lower_uniforms_to_ubo can generate
nir_intrinsic_load_ubo_vec4 directly for !INTEGER hardware.

Fixes: cf3fc79cd0 ("st/mesa: Replace mesa_to_tgsi() with prog_to_nir() and nir_to_tgsi().")
Closes: #4602
Reviewed-by: Mike Blumenkrantz <michael.blumenkrantz@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10194>
Eric Anholt 2021-04-09 16:10:30 -07:00 committed by Marge Bot
parent 71d6d1b1ab
commit 5de3cbbb2e
11 changed files with 67 additions and 63 deletions
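
As a reading aid, here is a minimal sketch, not part of the commit, of how a gallium caller would invoke the pass after this change, mirroring the nir_to_tgsi hunk below; the wrapper name and the native_integers parameter are hypothetical:

#include "nir.h"
#include "pipe/p_defines.h"
#include "pipe/p_screen.h"

/* Hypothetical wrapper: dword_packed follows PIPE_CAP_PACKED_UNIFORMS, and
 * load_vec4 is requested for hardware without native integers, so the pass
 * emits nir_intrinsic_load_ubo_vec4 directly instead of byte-addressed
 * load_ubo plus shift math that such hardware cannot execute.
 */
static void
lower_uniforms_for_screen(nir_shader *s, struct pipe_screen *screen,
                          bool native_integers)
{
   NIR_PASS_V(s, nir_lower_uniforms_to_ubo,
              screen->get_param(screen, PIPE_CAP_PACKED_UNIFORMS) != 0,
              !native_integers);
}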


@@ -481,7 +481,8 @@ The integer capabilities:
those bits set, pipe_context::set_constant_buffer(.., 0, ..) is ignored
by the driver, and the driver can throw assertion failures.
* ``PIPE_CAP_PACKED_UNIFORMS``: True if the driver supports packed uniforms
as opposed to padding to vec4s.
as opposed to padding to vec4s. Requires ``PIPE_SHADER_CAP_INTEGERS`` if
``lower_uniforms_to_ubo`` is set.
* ``PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_TRIANGLES``: Whether the
``PIPE_CONSERVATIVE_RASTER_POST_SNAP`` mode is supported for triangles.
The post-snap mode means the conservative rasterization occurs after
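
To illustrate the documentation constraint added above, a hypothetical driver-side sanity check (not part of this commit) could look like the following; the function name is made up, and it assumes a screen that uses nir_lower_uniforms_to_ubo:

#include <assert.h>
#include "pipe/p_defines.h"
#include "pipe/p_screen.h"

/* Packed (dword) uniform addressing relies on integer offset math, so a
 * screen that packs uniforms and lowers them to UBO 0 must also report
 * integer support. */
static void
check_packed_uniforms_requirement(struct pipe_screen *screen)
{
   if (screen->get_param(screen, PIPE_CAP_PACKED_UNIFORMS))
      assert(screen->get_shader_param(screen, PIPE_SHADER_FRAGMENT,
                                      PIPE_SHADER_CAP_INTEGERS));
}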


@@ -3435,8 +3435,6 @@ typedef struct nir_shader_compiler_options {
unsigned max_unroll_iterations;
unsigned max_unroll_iterations_aggressive;
/* For the non-zero value of the enum corresponds multiplier when
* calling lower_uniforms_to_ubo */
bool lower_uniforms_to_ubo;
nir_lower_int64_options lower_int64_options;
@@ -4633,7 +4631,7 @@ bool nir_vectorize_tess_levels(nir_shader *shader);
bool nir_lower_fragcolor(nir_shader *shader);
bool nir_lower_fragcoord_wtrans(nir_shader *shader);
void nir_lower_viewport_transform(nir_shader *shader);
bool nir_lower_uniforms_to_ubo(nir_shader *shader, int multiplier);
bool nir_lower_uniforms_to_ubo(nir_shader *shader, bool dword_packed, bool load_vec4);
typedef struct nir_lower_subgroups_options {
uint8_t subgroup_size;


@@ -22,27 +22,24 @@
*/
/*
* Remap load_uniform intrinsics to UBO accesses of UBO binding point 0.
* Simultaneously, remap existing UBO accesses by increasing their binding
* point by 1.
* Remap load_uniform intrinsics to nir_load_ubo or nir_load_ubo_vec4 accesses
* of UBO binding point 0. Simultaneously, remap existing UBO accesses by
* increasing their binding point by 1.
*
* Note that nir_intrinsic_load_uniform base/ranges can be set in different
* units, and the multiplier argument caters to supporting these different
* units.
* For PIPE_CAP_PACKED_UNIFORMS, dword_packed should be set to indicate that
* nir_intrinsic_load_uniform is in increments of dwords instead of vec4s.
*
* For example:
* - st_glsl_to_nir for PIPE_CAP_PACKED_UNIFORMS uses dwords (4 bytes) so the
* multiplier should be 4
* - st_glsl_to_nir for !PIPE_CAP_PACKED_UNIFORMS uses vec4s so the
* multiplier should be 16
* - tgsi_to_nir uses vec4s, so the multiplier should be 16
* If load_vec4 is set, then nir_intrinsic_load_ubo_vec4 will be generated
instead of nir_intrinsic_load_ubo, saving addressing math for hardware
* needing aligned vec4 loads in increments of vec4s (such as TGSI CONST file
* loads).
*/
#include "nir.h"
#include "nir_builder.h"
static bool
lower_instr(nir_intrinsic_instr *instr, nir_builder *b, int multiplier)
lower_instr(nir_intrinsic_instr *instr, nir_builder *b, bool dword_packed, bool load_vec4)
{
b->cursor = nir_before_instr(&instr->instr);
@@ -58,43 +55,51 @@ lower_instr(nir_intrinsic_instr *instr, nir_builder *b, int multiplier)
if (instr->intrinsic == nir_intrinsic_load_uniform) {
nir_ssa_def *ubo_idx = nir_imm_int(b, 0);
nir_ssa_def *ubo_offset =
nir_iadd(b, nir_imm_int(b, multiplier * nir_intrinsic_base(instr)),
nir_imul(b, nir_imm_int(b, multiplier),
nir_ssa_for_src(b, instr->src[0], 1)));
nir_ssa_def *uniform_offset = nir_ssa_for_src(b, instr->src[0], 1);
nir_intrinsic_instr *load =
nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ubo);
load->num_components = instr->num_components;
load->src[0] = nir_src_for_ssa(ubo_idx);
load->src[1] = nir_src_for_ssa(ubo_offset);
assert(instr->dest.ssa.bit_size >= 8);
/* If it's const, set the alignment to our known constant offset. If
* not, set it to a pessimistic value based on the multiplier (or the
* scalar size, for qword loads).
*
* We could potentially set up stricter alignments for indirects by
* knowing what features are enabled in the APIs (see comment in
* nir_lower_ubo_vec4.c)
*/
if (nir_src_is_const(instr->src[0])) {
nir_intrinsic_set_align(load, NIR_ALIGN_MUL_MAX,
(nir_src_as_uint(instr->src[0]) +
nir_intrinsic_base(instr) * multiplier) %
NIR_ALIGN_MUL_MAX);
nir_ssa_def *load_result;
if (load_vec4) {
/* No asking us to generate load_vec4 when you've packed your uniforms
* as dwords instead of vec4s.
*/
assert(!dword_packed);
load_result = nir_load_ubo_vec4(b, instr->num_components, instr->dest.ssa.bit_size,
ubo_idx,
nir_iadd_imm(b, uniform_offset, nir_intrinsic_base(instr)));
} else {
nir_intrinsic_set_align(load, MAX2(multiplier,
instr->dest.ssa.bit_size / 8), 0);
}
nir_ssa_dest_init(&load->instr, &load->dest,
load->num_components, instr->dest.ssa.bit_size,
instr->dest.ssa.name);
nir_builder_instr_insert(b, &load->instr);
nir_ssa_def_rewrite_uses(&instr->dest.ssa, &load->dest.ssa);
/* For PIPE_CAP_PACKED_UNIFORMS, the uniforms are packed with the
* base/offset in dword units instead of vec4 units.
*/
int multiplier = dword_packed ? 4 : 16;
load_result = nir_load_ubo(b, instr->num_components, instr->dest.ssa.bit_size,
ubo_idx,
nir_iadd_imm(b, nir_imul_imm(b, uniform_offset, multiplier),
nir_intrinsic_base(instr) * multiplier));
nir_intrinsic_instr *load = nir_instr_as_intrinsic(load_result->parent_instr);
nir_intrinsic_set_range_base(load, nir_intrinsic_base(instr) * multiplier);
nir_intrinsic_set_range(load, nir_intrinsic_range(instr) * multiplier);
/* If it's const, set the alignment to our known constant offset. If
* not, set it to a pessimistic value based on the multiplier (or the
* scalar size, for qword loads).
*
* We could potentially set up stricter alignments for indirects by
* knowing what features are enabled in the APIs (see comment in
* nir_lower_ubo_vec4.c)
*/
if (nir_src_is_const(instr->src[0])) {
nir_intrinsic_set_align(load, NIR_ALIGN_MUL_MAX,
(nir_src_as_uint(instr->src[0]) +
nir_intrinsic_base(instr) * multiplier) %
NIR_ALIGN_MUL_MAX);
} else {
nir_intrinsic_set_align(load, MAX2(multiplier,
instr->dest.ssa.bit_size / 8), 0);
}
nir_intrinsic_set_range_base(load, nir_intrinsic_base(instr) * multiplier);
nir_intrinsic_set_range(load, nir_intrinsic_range(instr) * multiplier);
}
nir_ssa_def_rewrite_uses(&instr->dest.ssa, load_result);
nir_instr_remove(&instr->instr);
return true;
@@ -104,7 +109,7 @@ lower_instr(nir_intrinsic_instr *instr, nir_builder *b, int multiplier)
}
bool
nir_lower_uniforms_to_ubo(nir_shader *shader, int multiplier)
nir_lower_uniforms_to_ubo(nir_shader *shader, bool dword_packed, bool load_vec4)
{
bool progress = false;
@@ -117,7 +122,7 @@ nir_lower_uniforms_to_ubo(nir_shader *shader, int multiplier)
if (instr->type == nir_instr_type_intrinsic)
progress |= lower_instr(nir_instr_as_intrinsic(instr),
&builder,
multiplier);
dword_packed, load_vec4);
}
}
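
For clarity, a minimal sketch, not part of the commit, of what the new load_vec4 path produces, written in the style of the unit test further down; it assumes an existing nir_builder *b positioned inside a shader whose uniforms use vec4-unit addressing:

#include "nir_builder.h"

/* The uniform load below uses vec4 units (dword_packed = false). */
static void
example_load_vec4_lowering(nir_builder *b)
{
   /* vec4 uniform read at indirect vec4 offset 2 (plus the intrinsic's base). */
   nir_ssa_def *offset = nir_imm_int(b, 2);
   nir_load_uniform(b, 4, 32, offset);

   /* With load_vec4 = true, the pass rewrites this to
    * load_ubo_vec4(ubo = 0, offset = base + 2): no imul/ishr pairs are
    * emitted, so there is nothing for nir_lower_ubo_vec4() to undo and
    * no integer math the non-integer hardware has to execute. */
   nir_lower_uniforms_to_ubo(b->shader, false /* dword_packed */,
                             true /* load_vec4 */);
}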


@@ -100,7 +100,7 @@ draw_create_vs_llvm(struct draw_context *draw,
vs->base.state.ir.nir = state->ir.nir;
nir_shader *nir = (nir_shader *)state->ir.nir;
if (!nir->options->lower_uniforms_to_ubo)
NIR_PASS_V(state->ir.nir, nir_lower_uniforms_to_ubo, 16);
NIR_PASS_V(state->ir.nir, nir_lower_uniforms_to_ubo, false, false);
nir_tgsi_scan_shader(state->ir.nir, &vs->base.info, true);
} else {
/* we make a private copy of the tokens */


@@ -2718,8 +2718,8 @@ nir_to_tgsi(struct nir_shader *s,
if (!original_options->lower_uniforms_to_ubo) {
NIR_PASS_V(s, nir_lower_uniforms_to_ubo,
screen->get_param(screen, PIPE_CAP_PACKED_UNIFORMS) ?
4 : 16);
screen->get_param(screen, PIPE_CAP_PACKED_UNIFORMS),
!native_integers);
}
/* Do lowering so we can directly translate f64/i64 NIR ALU ops to TGSI --


@@ -2496,7 +2496,7 @@ ttn_finalize_nir(struct ttn_compile *c, struct pipe_screen *screen)
}
if (nir->options->lower_uniforms_to_ubo)
NIR_PASS_V(nir, nir_lower_uniforms_to_ubo, 16);
NIR_PASS_V(nir, nir_lower_uniforms_to_ubo, false, false);
if (!c->cap_samplers_as_deref)
NIR_PASS_V(nir, nir_lower_samplers);


@@ -147,7 +147,7 @@ compile_nir(struct d3d12_context *ctx, struct d3d12_shader_selector *sel,
uint32_t num_ubos_before_lower_to_ubo = nir->info.num_ubos;
uint32_t num_uniforms_before_lower_to_ubo = nir->num_uniforms;
NIR_PASS_V(nir, nir_lower_uniforms_to_ubo, 16);
NIR_PASS_V(nir, nir_lower_uniforms_to_ubo, false, false);
shader->has_default_ubo0 = num_uniforms_before_lower_to_ubo > 0 &&
nir->info.num_ubos > num_ubos_before_lower_to_ubo;


@@ -123,7 +123,7 @@ TEST_F(nir_lower_ubo_test, basic)
nir_ssa_def *offset = nir_imm_int(&b, 4);
nir_load_uniform(&b, 1, 32, offset);
nir_lower_uniforms_to_ubo(b.shader, 16);
nir_lower_uniforms_to_ubo(b.shader, false, false);
nir_opt_constant_folding(b.shader);
ASSERT_TRUE(etna_nir_lower_ubo_to_uniform(b.shader));


@@ -914,7 +914,7 @@ zink_shader_finalize(struct pipe_screen *pscreen, void *nirptr, bool optimize)
tex_opts.lower_tg4_offsets = true;
NIR_PASS_V(nir, nir_lower_tex, &tex_opts);
}
NIR_PASS_V(nir, nir_lower_uniforms_to_ubo, 16);
NIR_PASS_V(nir, nir_lower_uniforms_to_ubo, false, false);
if (nir->info.stage == MESA_SHADER_GEOMETRY)
NIR_PASS_V(nir, nir_lower_gs_intrinsics, nir_lower_gs_intrinsics_per_stream);
optimize_nir(nir);


@@ -1005,12 +1005,10 @@ st_unpacked_uniforms_type_size(const struct glsl_type *type, bool bindless)
void
st_nir_lower_uniforms(struct st_context *st, nir_shader *nir)
{
unsigned multiplier = 16;
if (st->ctx->Const.PackedDriverUniformStorage) {
NIR_PASS_V(nir, nir_lower_io, nir_var_uniform,
st_packed_uniforms_type_size,
(nir_lower_io_options)0);
multiplier = 4;
} else {
NIR_PASS_V(nir, nir_lower_io, nir_var_uniform,
st_unpacked_uniforms_type_size,
@@ -1018,7 +1016,9 @@ st_nir_lower_uniforms(struct st_context *st, nir_shader *nir)
}
if (nir->options->lower_uniforms_to_ubo)
NIR_PASS_V(nir, nir_lower_uniforms_to_ubo, multiplier);
NIR_PASS_V(nir, nir_lower_uniforms_to_ubo,
st->ctx->Const.PackedDriverUniformStorage,
!st->ctx->Const.NativeIntegers);
}
/* Last third of preparing nir from glsl, which happens after shader


@@ -338,7 +338,7 @@ optimise_nir(nir_shader *nir, unsigned quirks, bool is_blend)
* to UBO ordinarily, but it isn't as aggressive as we need. */
NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
NIR_PASS_V(nir, nir_lower_uniforms_to_ubo, 16);
NIR_PASS_V(nir, nir_lower_uniforms_to_ubo, false, false);
do {
progress = false;