ir3: Skip missing VS outputs in VS out map when linking

The hardware is capable of automatically filling in certain values in
the VPC without writing them from the last geometry stage, like
gl_PointCoord or gl_PrimitiveID when there is no GS. However, we *do*
have to enable these outputs (i.e. set the VPC_VAR_DISABLE bit to 0) as
VPC_VAR_DISABLE is really about FS inputs rather than VS outputs. To do
this, we move the computation of the enable bits to ir3_link_add(),
which is also a nice refactor anyway. In addition we detect the PrimID
case specifically so that the driver can program the location.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4704>
This commit is contained in:
Connor Abbott 2020-04-22 17:54:41 +02:00 committed by Marge Bot
parent cc530858c1
commit 1f9839907a
6 changed files with 67 additions and 48 deletions

View file

@ -699,7 +699,7 @@ ir3_find_output(const struct ir3_shader_variant *so, gl_varying_slot slot)
} else if (slot == VARYING_SLOT_COL1) {
slot = VARYING_SLOT_BFC1;
} else {
return 0;
return -1;
}
for (j = 0; j < so->outputs_count; j++)
@ -708,7 +708,7 @@ ir3_find_output(const struct ir3_shader_variant *so, gl_varying_slot slot)
debug_assert(0);
return 0;
return -1;
}
static inline int
@ -721,35 +721,71 @@ ir3_next_varying(const struct ir3_shader_variant *so, int i)
}
/* Result of linking the last geometry stage against the fragment shader.
 * Filled in incrementally by ir3_link_add(); callers start from a
 * zero-initialized struct.
 */
struct ir3_shader_linkage {
/* Maximum location either consumed by the fragment shader or produced by
* the last geometry stage, i.e. the size required for each vertex in the
* VPC in DWORDs. ir3_link_add() raises it to loc + util_last_bit(compmask)
* for every linked varying.
*/
uint8_t max_loc;
/* Number of valid entries in var. Only varyings added with a register
* other than regid(63, 0) consume an entry.
*/
uint8_t cnt;
/* Bitset of locations used, including ones which are only used by the FS.
* One bit per location (4 x 32 = 128 bits); the a5xx/a6xx/turnip callers
* write the complement of each word to VPC_VAR_DISABLE.
*/
uint32_t varmask[4];
/* Map from VS output to location. */
struct {
/* Register holding the output in the producing stage. */
uint8_t regid;
/* Component mask of the varying as consumed by the FS input. */
uint8_t compmask;
/* Location of the first component (the FS input's inloc). */
uint8_t loc;
} var[32];
/* location for fixed-function gl_PrimitiveID passthrough; reset to 0xff
* by ir3_link_shaders() and only overwritten when the FS reads
* VARYING_SLOT_PRIMITIVE_ID and the previous stage has no such output.
*/
uint8_t primid_loc;
};
static inline void
ir3_link_add(struct ir3_shader_linkage *l, uint8_t regid, uint8_t compmask, uint8_t loc)
ir3_link_add(struct ir3_shader_linkage *l, uint8_t regid_, uint8_t compmask, uint8_t loc)
{
int i = l->cnt++;
debug_assert(i < ARRAY_SIZE(l->var));
l->var[i].regid = regid;
l->var[i].compmask = compmask;
l->var[i].loc = loc;
for (int j = 0; j < util_last_bit(compmask); j++) {
uint8_t comploc = loc + j;
l->varmask[comploc / 32] |= 1 << (comploc % 32);
}
l->max_loc = MAX2(l->max_loc, loc + util_last_bit(compmask));
if (regid_ != regid(63, 0)) {
int i = l->cnt++;
debug_assert(i < ARRAY_SIZE(l->var));
l->var[i].regid = regid_;
l->var[i].compmask = compmask;
l->var[i].loc = loc;
}
}
static inline void
ir3_link_shaders(struct ir3_shader_linkage *l,
const struct ir3_shader_variant *vs,
const struct ir3_shader_variant *fs)
const struct ir3_shader_variant *fs,
bool pack_vs_out)
{
/* On older platforms, varmask isn't programmed at all, and it appears
* that the hardware generates a mask of used VPC locations using the VS
* output map, and hangs if a FS bary instruction references a location
* not in the list. This means that we need to have a dummy entry in the
* VS out map for things like gl_PointCoord which aren't written by the
* VS. Furthermore we can't use r63.x there, so just pick an arbitrary
* register (r0.x) to use if there is no VS output. When pack_vs_out is
* set, r63.x is passed instead so that ir3_link_add() records the
* location in varmask but adds no entry to the VS out map.
*/
const unsigned default_regid = pack_vs_out ? regid(63, 0) : regid(0, 0);
int j = -1, k;
l->primid_loc = 0xff;
while (l->cnt < ARRAY_SIZE(l->var)) {
j = ir3_next_varying(fs, j);
@ -761,7 +797,11 @@ ir3_link_shaders(struct ir3_shader_linkage *l,
k = ir3_find_output(vs, fs->inputs[j].slot);
ir3_link_add(l, vs->outputs[k].regid,
if (k < 0 && fs->inputs[j].slot == VARYING_SLOT_PRIMITIVE_ID) {
l->primid_loc = fs->inputs[j].inloc;
}
ir3_link_add(l, k >= 0 ? vs->outputs[k].regid : default_regid,
fs->inputs[j].compmask, fs->inputs[j].inloc);
}
}

View file

@ -929,23 +929,16 @@ tu6_emit_vpc(struct tu_cs *cs,
bool has_gs = gs->type != MESA_SHADER_NONE;
const struct ir3_shader_variant *last_shader = has_gs ? gs : vs;
struct ir3_shader_linkage linkage = { 0 };
ir3_link_shaders(&linkage, last_shader, fs);
ir3_link_shaders(&linkage, last_shader, fs, true);
if (last_shader->shader->stream_output.num_outputs)
tu6_link_streamout(&linkage, last_shader);
BITSET_DECLARE(vpc_var_enables, 128) = { 0 };
for (uint32_t i = 0; i < linkage.cnt; i++) {
const uint32_t comp_count = util_last_bit(linkage.var[i].compmask);
for (uint32_t j = 0; j < comp_count; j++)
BITSET_SET(vpc_var_enables, linkage.var[i].loc + j);
}
tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VAR_DISABLE(0), 4);
tu_cs_emit(cs, ~vpc_var_enables[0]);
tu_cs_emit(cs, ~vpc_var_enables[1]);
tu_cs_emit(cs, ~vpc_var_enables[2]);
tu_cs_emit(cs, ~vpc_var_enables[3]);
tu_cs_emit(cs, ~linkage.varmask[0]);
tu_cs_emit(cs, ~linkage.varmask[1]);
tu_cs_emit(cs, ~linkage.varmask[2]);
tu_cs_emit(cs, ~linkage.varmask[3]);
/* a6xx finds position/pointsize at the end */
const uint32_t position_regid =

View file

@ -237,7 +237,7 @@ fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit,
A3XX_SP_VS_PARAM_REG_TOTALVSOUTVAR(fp->varying_in));
struct ir3_shader_linkage l = {0};
ir3_link_shaders(&l, vp, fp);
ir3_link_shaders(&l, vp, fp, false);
for (i = 0, j = 0; (i < 16) && (j < l.cnt); i++) {
uint32_t reg = 0;

View file

@ -289,7 +289,7 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit,
A4XX_SP_VS_PARAM_REG_TOTALVSOUTVAR(s[FS].v->varying_in));
struct ir3_shader_linkage l = {0};
ir3_link_shaders(&l, s[VS].v, s[FS].v);
ir3_link_shaders(&l, s[VS].v, s[FS].v, false);
for (i = 0, j = 0; (i < 16) && (j < l.cnt); i++) {
uint32_t reg = 0;

View file

@ -410,24 +410,17 @@ fd5_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
COND(s[VS].v->num_samp > 0, A5XX_SP_VS_CTRL_REG0_PIXLODENABLE));
struct ir3_shader_linkage l = {0};
ir3_link_shaders(&l, s[VS].v, s[FS].v);
ir3_link_shaders(&l, s[VS].v, s[FS].v, true);
if ((s[VS].v->shader->stream_output.num_outputs > 0) &&
!emit->binning_pass)
link_stream_out(&l, s[VS].v);
BITSET_DECLARE(varbs, 128) = {0};
uint32_t *varmask = (uint32_t *)varbs;
for (i = 0; i < l.cnt; i++)
for (j = 0; j < util_last_bit(l.var[i].compmask); j++)
BITSET_SET(varbs, l.var[i].loc + j);
OUT_PKT4(ring, REG_A5XX_VPC_VAR_DISABLE(0), 4);
OUT_RING(ring, ~varmask[0]); /* VPC_VAR[0].DISABLE */
OUT_RING(ring, ~varmask[1]); /* VPC_VAR[1].DISABLE */
OUT_RING(ring, ~varmask[2]); /* VPC_VAR[2].DISABLE */
OUT_RING(ring, ~varmask[3]); /* VPC_VAR[3].DISABLE */
OUT_RING(ring, ~l.varmask[0]); /* VPC_VAR[0].DISABLE */
OUT_RING(ring, ~l.varmask[1]); /* VPC_VAR[1].DISABLE */
OUT_RING(ring, ~l.varmask[2]); /* VPC_VAR[2].DISABLE */
OUT_RING(ring, ~l.varmask[3]); /* VPC_VAR[3].DISABLE */
/* a5xx appends pos/psize to end of the linkage map: */
if (pos_regid != regid(63,0))

View file

@ -429,20 +429,13 @@ setup_stateobj(struct fd_ringbuffer *ring, struct fd_screen *screen,
struct ir3_shader_linkage l = {0};
const struct ir3_shader_variant *last_shader = fd6_last_shader(state);
ir3_link_shaders(&l, last_shader, fs);
BITSET_DECLARE(varbs, 128) = {0};
uint32_t *varmask = (uint32_t *)varbs;
for (i = 0; i < l.cnt; i++)
for (j = 0; j < util_last_bit(l.var[i].compmask); j++)
BITSET_SET(varbs, l.var[i].loc + j);
ir3_link_shaders(&l, last_shader, fs, true);
OUT_PKT4(ring, REG_A6XX_VPC_VAR_DISABLE(0), 4);
OUT_RING(ring, ~varmask[0]); /* VPC_VAR[0].DISABLE */
OUT_RING(ring, ~varmask[1]); /* VPC_VAR[1].DISABLE */
OUT_RING(ring, ~varmask[2]); /* VPC_VAR[2].DISABLE */
OUT_RING(ring, ~varmask[3]); /* VPC_VAR[3].DISABLE */
OUT_RING(ring, ~l.varmask[0]); /* VPC_VAR[0].DISABLE */
OUT_RING(ring, ~l.varmask[1]); /* VPC_VAR[1].DISABLE */
OUT_RING(ring, ~l.varmask[2]); /* VPC_VAR[2].DISABLE */
OUT_RING(ring, ~l.varmask[3]); /* VPC_VAR[3].DISABLE */
/* Add stream out outputs after computing the VPC_VAR_DISABLE bitmask. */
if (last_shader->shader->stream_output.num_outputs > 0)