mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-21 04:38:09 +02:00
This lets us emit the VPM_WRITEs directly from nir_intrinsic_store_output() (useful once NIR scheduling is in place so that we can reduce register pressure), and lets future NIR scheduling schedule the math to generate them. Even in the meantime, it looks like this lets NIR DCE some more code and make better decisions. total instructions in shared programs: 6429246 -> 6412976 (-0.25%) total threads in shared programs: 153924 -> 153934 (<.01%) total loops in shared programs: 486 -> 483 (-0.62%) total uniforms in shared programs: 2385436 -> 2388195 (0.12%) Acked-by: Ian Romanick <ian.d.romanick@intel.com> (nir)
350 lines
13 KiB
C
350 lines
13 KiB
C
/*
|
|
* Copyright © 2015 Broadcom
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
* to deal in the Software without restriction, including without limitation
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
*
|
|
* The above copyright notice and this permission notice (including the next
|
|
* paragraph) shall be included in all copies or substantial portions of the
|
|
* Software.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
* IN THE SOFTWARE.
|
|
*/
|
|
|
|
#include "compiler/v3d_compiler.h"
|
|
#include "compiler/nir/nir_builder.h"
|
|
|
|
/**
|
|
* Walks the NIR generated by TGSI-to-NIR or GLSL-to-NIR to lower its io
|
|
* intrinsics into something amenable to the V3D architecture.
|
|
*
|
|
* Most of the work is turning the VS's store_output intrinsics from working
|
|
* on a base representing the gallium-level vec4 driver_location to an offset
|
|
* within the VPM, and emitting the header that's read by the fixed function
|
|
* hardware between the VS and FS.
|
|
*
|
|
* We also adjust the offsets on uniform loads to be in bytes, since that's
|
|
* what we need for indirect addressing with general TMU access.
|
|
*/
|
|
|
|
struct v3d_nir_lower_io_state {
|
|
int pos_vpm_offset;
|
|
int vp_vpm_offset;
|
|
int zs_vpm_offset;
|
|
int rcp_wc_vpm_offset;
|
|
int psiz_vpm_offset;
|
|
int varyings_vpm_offset;
|
|
|
|
BITSET_WORD varyings_stored[BITSET_WORDS(V3D_MAX_FS_INPUTS)];
|
|
|
|
nir_ssa_def *pos[4];
|
|
};
|
|
|
|
static void
|
|
v3d_nir_store_output(nir_builder *b, int base, nir_ssa_def *chan)
|
|
{
|
|
nir_intrinsic_instr *intr =
|
|
nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_output);
|
|
nir_ssa_dest_init(&intr->instr, &intr->dest,
|
|
1, intr->dest.ssa.bit_size, NULL);
|
|
intr->num_components = 1;
|
|
|
|
intr->src[0] = nir_src_for_ssa(chan);
|
|
intr->src[1] = nir_src_for_ssa(nir_imm_int(b, 0));
|
|
|
|
nir_intrinsic_set_base(intr, base);
|
|
nir_intrinsic_set_write_mask(intr, 0x1);
|
|
nir_intrinsic_set_component(intr, 0);
|
|
|
|
nir_builder_instr_insert(b, &intr->instr);
|
|
}
|
|
|
|
/* Convert the uniform offset to bytes. If it happens to be a constant,
|
|
* constant-folding will clean up the shift for us.
|
|
*/
|
|
static void
|
|
v3d_nir_lower_uniform(struct v3d_compile *c, nir_builder *b,
|
|
nir_intrinsic_instr *intr)
|
|
{
|
|
b->cursor = nir_before_instr(&intr->instr);
|
|
|
|
nir_intrinsic_set_base(intr, nir_intrinsic_base(intr) * 16);
|
|
|
|
nir_instr_rewrite_src(&intr->instr,
|
|
&intr->src[0],
|
|
nir_src_for_ssa(nir_ishl(b, intr->src[0].ssa,
|
|
nir_imm_int(b, 4))));
|
|
}
|
|
|
|
static int
|
|
v3d_varying_slot_vpm_offset(struct v3d_compile *c, nir_variable *var, int chan)
|
|
{
|
|
int component = var->data.location_frac + chan;
|
|
|
|
for (int i = 0; i < c->vs_key->num_fs_inputs; i++) {
|
|
struct v3d_varying_slot slot = c->vs_key->fs_inputs[i];
|
|
|
|
if (v3d_slot_get_slot(slot) == var->data.location &&
|
|
v3d_slot_get_component(slot) == component) {
|
|
return i;
|
|
}
|
|
}
|
|
|
|
return -1;
|
|
}
|
|
|
|
/* Lowers a store_output(gallium driver location) to a series of store_outputs
|
|
* with a driver_location equal to the offset in the VPM.
|
|
*/
|
|
static void
|
|
v3d_nir_lower_vpm_output(struct v3d_compile *c, nir_builder *b,
|
|
nir_intrinsic_instr *intr,
|
|
struct v3d_nir_lower_io_state *state)
|
|
{
|
|
b->cursor = nir_before_instr(&intr->instr);
|
|
|
|
int start_comp = nir_intrinsic_component(intr);
|
|
nir_ssa_def *src = nir_ssa_for_src(b, intr->src[0],
|
|
intr->num_components);
|
|
|
|
nir_variable *var = NULL;
|
|
nir_foreach_variable(scan_var, &c->s->outputs) {
|
|
if (scan_var->data.driver_location != nir_intrinsic_base(intr) ||
|
|
start_comp < scan_var->data.location_frac ||
|
|
start_comp >= scan_var->data.location_frac +
|
|
glsl_get_components(scan_var->type)) {
|
|
continue;
|
|
}
|
|
var = scan_var;
|
|
}
|
|
|
|
/* Save off the components of the position for the setup of VPM inputs
|
|
* read by fixed function HW.
|
|
*/
|
|
if (var->data.location == VARYING_SLOT_POS) {
|
|
for (int i = 0; i < intr->num_components; i++) {
|
|
state->pos[start_comp + i] = nir_channel(b, src, i);
|
|
}
|
|
}
|
|
|
|
/* Just psiz to the position in the FF header right now. */
|
|
if (var->data.location == VARYING_SLOT_PSIZ &&
|
|
state->psiz_vpm_offset != -1) {
|
|
v3d_nir_store_output(b, state->psiz_vpm_offset, src);
|
|
}
|
|
|
|
/* Scalarize outputs if it hasn't happened already, since we want to
|
|
* schedule each VPM write individually. We can skip any outut
|
|
* components not read by the FS.
|
|
*/
|
|
for (int i = 0; i < intr->num_components; i++) {
|
|
int vpm_offset =
|
|
v3d_varying_slot_vpm_offset(c, var,
|
|
i +
|
|
start_comp -
|
|
var->data.location_frac);
|
|
|
|
if (vpm_offset == -1)
|
|
continue;
|
|
|
|
BITSET_SET(state->varyings_stored, vpm_offset);
|
|
|
|
v3d_nir_store_output(b, state->varyings_vpm_offset + vpm_offset,
|
|
nir_channel(b, src, i));
|
|
}
|
|
|
|
nir_instr_remove(&intr->instr);
|
|
}
|
|
|
|
static void
|
|
v3d_nir_lower_io_instr(struct v3d_compile *c, nir_builder *b,
|
|
struct nir_instr *instr,
|
|
struct v3d_nir_lower_io_state *state)
|
|
{
|
|
if (instr->type != nir_instr_type_intrinsic)
|
|
return;
|
|
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
|
|
|
|
switch (intr->intrinsic) {
|
|
case nir_intrinsic_load_uniform:
|
|
v3d_nir_lower_uniform(c, b, intr);
|
|
break;
|
|
|
|
case nir_intrinsic_store_output:
|
|
if (c->s->info.stage == MESA_SHADER_VERTEX)
|
|
v3d_nir_lower_vpm_output(c, b, intr, state);
|
|
break;
|
|
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* Remap the output var's .driver_location. This is purely for
|
|
* nir_print_shader() so that store_output can map back to a variable name.
|
|
*/
|
|
static void
|
|
v3d_nir_lower_io_update_output_var_base(struct v3d_compile *c,
|
|
struct v3d_nir_lower_io_state *state)
|
|
{
|
|
nir_foreach_variable_safe(var, &c->s->outputs) {
|
|
if (var->data.location == VARYING_SLOT_POS &&
|
|
state->pos_vpm_offset != -1) {
|
|
var->data.driver_location = state->pos_vpm_offset;
|
|
continue;
|
|
}
|
|
|
|
if (var->data.location == VARYING_SLOT_PSIZ &&
|
|
state->psiz_vpm_offset != -1) {
|
|
var->data.driver_location = state->psiz_vpm_offset;
|
|
continue;
|
|
}
|
|
|
|
int vpm_offset = v3d_varying_slot_vpm_offset(c, var, 0);
|
|
if (vpm_offset != -1) {
|
|
var->data.driver_location =
|
|
state->varyings_vpm_offset + vpm_offset;
|
|
} else {
|
|
/* If we couldn't find a mapping for the var, delete
|
|
* it so that its old .driver_location doesn't confuse
|
|
* nir_print_shader().
|
|
*/
|
|
exec_node_remove(&var->node);
|
|
}
|
|
}
|
|
}
|
|
|
|
static void
|
|
v3d_nir_setup_vpm_layout(struct v3d_compile *c,
|
|
struct v3d_nir_lower_io_state *state)
|
|
{
|
|
uint32_t vpm_offset = 0;
|
|
|
|
if (c->vs_key->is_coord) {
|
|
state->pos_vpm_offset = vpm_offset;
|
|
vpm_offset += 4;
|
|
} else {
|
|
state->pos_vpm_offset = -1;
|
|
}
|
|
|
|
state->vp_vpm_offset = vpm_offset;
|
|
vpm_offset += 2;
|
|
|
|
if (!c->vs_key->is_coord) {
|
|
state->zs_vpm_offset = vpm_offset++;
|
|
state->rcp_wc_vpm_offset = vpm_offset++;
|
|
} else {
|
|
state->zs_vpm_offset = -1;
|
|
state->rcp_wc_vpm_offset = -1;
|
|
}
|
|
|
|
if (c->vs_key->per_vertex_point_size)
|
|
state->psiz_vpm_offset = vpm_offset++;
|
|
else
|
|
state->psiz_vpm_offset = -1;
|
|
|
|
state->varyings_vpm_offset = vpm_offset;
|
|
|
|
c->vpm_output_size = vpm_offset + c->vs_key->num_fs_inputs;
|
|
}
|
|
|
|
static void
|
|
v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b,
|
|
struct v3d_nir_lower_io_state *state)
|
|
{
|
|
for (int i = 0; i < 4; i++) {
|
|
if (!state->pos[i])
|
|
state->pos[i] = nir_ssa_undef(b, 1, 32);
|
|
}
|
|
|
|
nir_ssa_def *rcp_wc = nir_frcp(b, state->pos[3]);
|
|
|
|
if (state->pos_vpm_offset != -1) {
|
|
for (int i = 0; i < 4; i++) {
|
|
v3d_nir_store_output(b, state->pos_vpm_offset + i,
|
|
state->pos[i]);
|
|
}
|
|
}
|
|
|
|
for (int i = 0; i < 2; i++) {
|
|
nir_ssa_def *pos;
|
|
nir_ssa_def *scale;
|
|
pos = state->pos[i];
|
|
if (i == 0)
|
|
scale = nir_load_viewport_x_scale(b);
|
|
else
|
|
scale = nir_load_viewport_y_scale(b);
|
|
pos = nir_fmul(b, pos, scale);
|
|
pos = nir_fmul(b, pos, rcp_wc);
|
|
pos = nir_f2i32(b, nir_fround_even(b, pos));
|
|
v3d_nir_store_output(b, state->vp_vpm_offset + i,
|
|
pos);
|
|
}
|
|
|
|
if (state->zs_vpm_offset != -1) {
|
|
nir_ssa_def *z = state->pos[2];
|
|
z = nir_fmul(b, z, nir_load_viewport_z_scale(b));
|
|
z = nir_fmul(b, z, rcp_wc);
|
|
z = nir_fadd(b, z, nir_load_viewport_z_offset(b));
|
|
v3d_nir_store_output(b, state->zs_vpm_offset, z);
|
|
}
|
|
|
|
if (state->rcp_wc_vpm_offset != -1)
|
|
v3d_nir_store_output(b, state->rcp_wc_vpm_offset, rcp_wc);
|
|
|
|
/* Store 0 to varyings requested by the FS but not stored in the VS.
|
|
* This should be undefined behavior, but glsl-routing seems to rely
|
|
* on it.
|
|
*/
|
|
for (int i = 0; i < c->vs_key->num_fs_inputs; i++) {
|
|
if (!BITSET_TEST(state->varyings_stored, i)) {
|
|
v3d_nir_store_output(b, state->varyings_vpm_offset + i,
|
|
nir_imm_int(b, 0));
|
|
}
|
|
}
|
|
}
|
|
|
|
void
|
|
v3d_nir_lower_io(nir_shader *s, struct v3d_compile *c)
|
|
{
|
|
struct v3d_nir_lower_io_state state = { 0 };
|
|
|
|
/* Set up the layout of the VPM outputs. */
|
|
if (s->info.stage == MESA_SHADER_VERTEX)
|
|
v3d_nir_setup_vpm_layout(c, &state);
|
|
|
|
nir_foreach_function(function, s) {
|
|
if (function->impl) {
|
|
nir_builder b;
|
|
nir_builder_init(&b, function->impl);
|
|
|
|
nir_foreach_block(block, function->impl) {
|
|
nir_foreach_instr_safe(instr, block)
|
|
v3d_nir_lower_io_instr(c, &b, instr,
|
|
&state);
|
|
}
|
|
|
|
nir_block *last = nir_impl_last_block(function->impl);
|
|
b.cursor = nir_after_block(last);
|
|
if (s->info.stage == MESA_SHADER_VERTEX)
|
|
v3d_nir_emit_ff_vpm_outputs(c, &b, &state);
|
|
|
|
nir_metadata_preserve(function->impl,
|
|
nir_metadata_block_index |
|
|
nir_metadata_dominance);
|
|
}
|
|
}
|
|
|
|
if (s->info.stage == MESA_SHADER_VERTEX)
|
|
v3d_nir_lower_io_update_output_var_base(c, &state);
|
|
}
|