asahi: implement VBO robustness

This implements GL robustness semantics. GLES (weaker) and VK (stronger) semantics
are left as a TODO, with explanations given in the code. Robustness is always
enabled in order to handle null VBOs; this should be optimized once we have soft
fault.

This necessitates a rework of VBO keys, but hopefully for the best.

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27616>
Authored by Alyssa Rosenzweig on 2024-01-06 09:26:03 -04:00, committed by Marge Bot
parent 4aadf67523
commit 5dc0f5ccba
6 changed files with 152 additions and 28 deletions


@@ -171,7 +171,35 @@ pass(struct nir_builder *b, nir_instr *instr, void *data)
       el = nir_load_vertex_id(b);
    }
 
-   nir_def *base = nir_load_vbo_base_agx(b, nir_imm_int(b, attrib.buf));
+   /* VBO bases are per-attribute, otherwise they're per-buffer. This allows
+    * memory sinks to work properly with robustness, allows folding
+    * the src_offset into the VBO base to save an add in the shader, and reduces
+    * the size of the vertex fetch key. That last piece allows reusing a linked
+    * VS with both separate and interleaved attributes.
+    */
+   nir_def *buf_handle = nir_imm_int(b, index);
+
+   /* Robustness is handled at the ID level */
+   nir_def *bounds = nir_load_attrib_clamp_agx(b, buf_handle);
+
+   /* For now, robustness is always applied. This gives GL robustness semantics.
+    * For robustBufferAccess2, we'll want to check for out-of-bounds access
+    * (where el > bounds), and replace base with the address of a zero sink.
+    * With soft fault and a large enough sink, we don't need to clamp the index,
+    * allowing that robustness behaviour to be implemented in 2 cmpsel
+    * before the load. That is faster than the 4 cmpsel required after the load,
+    * and it avoids waiting on the load which should help prolog performance.
+    *
+    * TODO: Plumb through soft fault information to skip this.
+    *
+    * TODO: Add a knob for robustBufferAccess2 semantics.
+    */
+   bool robust = true;
+
+   if (robust) {
+      el = nir_umin(b, el, bounds);
+   }
+
+   nir_def *base = nir_load_vbo_base_agx(b, buf_handle);
 
    assert((stride % interchange_align) == 0 && "must be aligned");
    assert((offset % interchange_align) == 0 && "must be aligned");
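The robustBufferAccess2 path described in the TODO above is not implemented by this commit. A minimal sketch of what it could look like, assuming soft fault is available and the zero-sink address is plumbed into the lowering (the helper and its parameters below are hypothetical, not part of this change):

#include "nir_builder.h"

/* Hypothetical sketch: redirect the fetch base to a zero sink when the index
 * is out of bounds, instead of clamping the index. With soft fault and a
 * large enough sink, the unclamped index can then run past the end harmlessly.
 */
static nir_def *
robust2_vbo_base_sketch(nir_builder *b, nir_def *el, nir_def *bounds,
                        nir_def *base, nir_def *zero_sink)
{
   /* Out of bounds per the comment above: el > bounds, unsigned compare */
   nir_def *oob = nir_ult(b, bounds, el);

   /* A single select before the load; the 64-bit select is presumably what
    * the "2 cmpsel" in the comment refers to on AGX.
    */
   return nir_bcsel(b, oob, zero_sink, base);
}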


@@ -1839,10 +1839,16 @@ store("agx", [1, 1], [ACCESS, BASE, FORMAT, SIGN_EXTEND])
 # Logical complement of load_front_face, mapping to an AGX system value
 system_value("back_face_agx", 1, bit_sizes=[1, 32])
 
-# Load the base address of an indexed VBO (for lowering VBOs)
+# Load the base address of an indexed vertex attribute (for lowering).
 intrinsic("load_vbo_base_agx", src_comp=[1], dest_comp=1, bit_sizes=[64],
           flags=[CAN_ELIMINATE, CAN_REORDER])
 
+# When vertex robustness is enabled, loads the maximum valid attribute index for
+# a given attribute. This is unsigned: the driver ensures that at least one
+# vertex is always valid to load, directing loads to a zero sink if necessary.
+intrinsic("load_attrib_clamp_agx", src_comp=[1], dest_comp=1,
+          bit_sizes=[32], flags=[CAN_ELIMINATE, CAN_REORDER])
+
 # Load a driver-internal system value from a given system value set at a given
 # binding within the set. This is used for correctness when lowering things like
 # UBOs with merged shaders.


@@ -137,8 +137,11 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *intr,
       return load_sysval_indirect(b, 1, 16, stage_table(b), &s->sampler_handle,
                                   intr->src[0].ssa);
    case nir_intrinsic_load_vbo_base_agx:
-      return load_sysval_indirect(b, 1, 64, AGX_SYSVAL_TABLE_ROOT, &u->vbo_base,
-                                  intr->src[0].ssa);
+      return load_sysval_indirect(b, 1, 64, AGX_SYSVAL_TABLE_ROOT,
+                                  &u->attrib_base, intr->src[0].ssa);
+   case nir_intrinsic_load_attrib_clamp_agx:
+      return load_sysval_indirect(b, 1, 32, AGX_SYSVAL_TABLE_ROOT,
+                                  &u->attrib_clamp, intr->src[0].ssa);
    case nir_intrinsic_load_blend_const_color_r_float:
       return load_sysval_root(b, 1, 32, &u->blend_constant[0]);
    case nir_intrinsic_load_blend_const_color_g_float:
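For orientation (not part of the change): in this lowering, the &u->attrib_base and &u->attrib_clamp expressions act as offset keys into struct agx_draw_uniforms, so the new case effectively makes the shader load its clamp from the root uniform table at an address along the lines of the sketch below. The helper and the exact address math are illustrative assumptions.

#include <stddef.h>
#include <stdint.h>
#include "agx_state.h" /* struct agx_draw_uniforms, see the header diff later in this commit */

/* Hypothetical sketch of the address a lowered load_attrib_clamp_agx resolves
 * to: root table base, plus the offset of the attrib_clamp array, plus the
 * attribute index scaled by the 32-bit element size.
 */
static uint64_t
attrib_clamp_addr_sketch(uint64_t root_table_base, uint32_t attrib_index)
{
   return root_table_base +
          offsetof(struct agx_draw_uniforms, attrib_clamp) +
          (uint64_t)attrib_index * sizeof(uint32_t);
}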


@@ -53,6 +53,7 @@
 #include "agx_device.h"
 #include "agx_disk_cache.h"
 #include "agx_nir_lower_gs.h"
+#include "agx_nir_lower_vbo.h"
 #include "agx_tilebuffer.h"
 #include "nir_builder.h"
 #include "nir_builder_opcodes.h"
@@ -1461,7 +1462,8 @@ agx_create_vertex_elements(struct pipe_context *ctx, unsigned count,
 {
    assert(count <= AGX_MAX_ATTRIBS);
 
-   struct agx_attribute *attribs = calloc(sizeof(*attribs), AGX_MAX_ATTRIBS);
+   struct agx_vertex_elements *so = calloc(1, sizeof(*so));
 
    for (unsigned i = 0; i < count; ++i) {
       const struct pipe_vertex_element ve = state[i];
@@ -1470,16 +1472,17 @@ agx_create_vertex_elements(struct pipe_context *ctx, unsigned count,
       unsigned chan_size = desc->channel[0].size / 8;
       assert((ve.src_offset & (chan_size - 1)) == 0);
 
-      attribs[i] = (struct agx_attribute){
-         .buf = ve.vertex_buffer_index,
-         .src_offset = ve.src_offset,
+      so->buffers[i] = ve.vertex_buffer_index;
+      so->src_offsets[i] = ve.src_offset;
+
+      so->key[i] = (struct agx_velem_key){
          .stride = ve.src_stride,
          .format = ve.src_format,
         .divisor = ve.instance_divisor,
      };
   }
 
-   return attribs;
+   return so;
 }
 
 static void
@@ -1836,6 +1839,22 @@ agx_nir_lower_poly_stipple(nir_shader *s)
    return true;
 }
 
+static bool
+lower_vbo(nir_shader *s, struct agx_velem_key *key)
+{
+   struct agx_attribute out[AGX_MAX_VBUFS];
+
+   for (unsigned i = 0; i < AGX_MAX_VBUFS; ++i) {
+      out[i] = (struct agx_attribute){
+         .divisor = key[i].divisor,
+         .stride = key[i].stride,
+         .format = key[i].format,
+      };
+   }
+
+   return agx_nir_lower_vbo(s, out);
+}
+
 /* Does not take ownership of key. Clones if necessary. */
 static struct agx_compiled_shader *
 agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx,
@@ -1864,7 +1883,7 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx,
    if (nir->info.stage == MESA_SHADER_VERTEX) {
       struct asahi_vs_shader_key *key = &key_->vs;
 
-      NIR_PASS(_, nir, agx_nir_lower_vbo, key->attribs);
+      NIR_PASS(_, nir, lower_vbo, key->attribs);
       NIR_PASS(_, nir, agx_nir_lower_point_size, key->fixed_point_size);
 
       if (should_lower_clip_m1_1(dev, key->clip_halfz)) {
@@ -1881,7 +1900,7 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx,
      nir_shader *vs = nir_deserialize(NULL, &agx_nir_options, &vs_reader);
 
      /* Apply the VS key to the VS before linking it in */
-     NIR_PASS_V(vs, agx_nir_lower_vbo, key->attribs);
+     NIR_PASS_V(vs, lower_vbo, key->attribs);
      NIR_PASS_V(vs, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
      NIR_PASS_V(vs, agx_nir_lower_sysvals, false);
@@ -1903,7 +1922,7 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx,
      nir_shader *vs = nir_deserialize(NULL, &agx_nir_options, &vs_reader);
 
      /* Apply the VS key to the VS before linking it in */
-     NIR_PASS(_, vs, agx_nir_lower_vbo, key->attribs);
+     NIR_PASS(_, vs, lower_vbo, key->attribs);
      NIR_PASS(_, vs, agx_nir_lower_ia, &key->ia);
      NIR_PASS(_, vs, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
@@ -2245,8 +2264,7 @@ agx_create_shader_state(struct pipe_context *pctx,
    switch (so->type) {
    case PIPE_SHADER_VERTEX: {
       for (unsigned i = 0; i < AGX_MAX_VBUFS; ++i) {
-         key.vs.attribs[i] = (struct agx_attribute){
-            .buf = i,
+         key.vs.attribs[i] = (struct agx_velem_key){
             .stride = 16,
             .format = PIPE_FORMAT_R32G32B32A32_FLOAT,
          };
@@ -2409,8 +2427,7 @@ agx_update_vs(struct agx_context *ctx)
         ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.inputs_linear_shaded,
    };
 
-   memcpy(key.attribs, ctx->attributes,
-          sizeof(key.attribs[0]) * AGX_MAX_ATTRIBS);
+   memcpy(key.attribs, &ctx->attributes->key, sizeof(key.attribs));
 
    return agx_update_shader(ctx, &ctx->vs, PIPE_SHADER_VERTEX,
                             (union asahi_shader_key *)&key);
@@ -2441,8 +2458,7 @@ agx_update_tcs(struct agx_context *ctx, const struct pipe_draw_info *info)
      .index_size_B = info->index_size,
    };
 
-   memcpy(key.attribs, ctx->attributes,
-          sizeof(key.attribs[0]) * AGX_MAX_ATTRIBS);
+   memcpy(key.attribs, &ctx->attributes->key, sizeof(key.attribs));
 
    static_assert(sizeof(key.input_nir_sha1) ==
                  sizeof(ctx->stage[PIPE_SHADER_VERTEX].shader->nir_sha1),
@@ -2491,8 +2507,7 @@ agx_update_gs(struct agx_context *ctx, const struct pipe_draw_info *info,
      .rasterizer_discard = ctx->rast->base.rasterizer_discard,
    };
 
-   memcpy(key.attribs, ctx->attributes,
-          sizeof(key.attribs[0]) * AGX_MAX_ATTRIBS);
+   memcpy(key.attribs, &ctx->attributes->key, sizeof(key.attribs));
 
    static_assert(sizeof(key.input_nir_sha1) ==
                  sizeof(ctx->stage[PIPE_SHADER_VERTEX].shader->nir_sha1),


@@ -100,8 +100,13 @@ struct PACKED agx_draw_uniforms {
    /* Pointers to the system value tables themselves (for indirection) */
    uint64_t tables[AGX_NUM_SYSVAL_TABLES];
 
-   /* Vertex buffer object bases, if present */
-   uint64_t vbo_base[PIPE_MAX_ATTRIBS];
+   /* Vertex buffer object bases, if present. If vertex robustness is disabled,
+    * attrib_base maps VBOs directly and attrib_clamp is undefined. If
+    * vertex robustness is enabled, attrib_base maps attributes and
+    * attrib_clamp is an inclusive clamp on vertex/divided instance indices.
+    */
+   uint64_t attrib_base[PIPE_MAX_ATTRIBS];
+   uint32_t attrib_clamp[PIPE_MAX_ATTRIBS];
 
    /* Address of input assembly buffer if geom/tess is used, else 0 */
    uint64_t input_assembly;
@@ -400,14 +405,31 @@ struct agx_blend {
    uint32_t store;
 };
 
+/* These parts of the vertex element affect the generated code */
+struct agx_velem_key {
+   uint32_t divisor;
+   uint16_t stride;
+   uint8_t format;
+   uint8_t pad;
+};
+
 struct asahi_vs_shader_key {
-   struct agx_attribute attribs[AGX_MAX_VBUFS];
+   struct agx_velem_key attribs[AGX_MAX_VBUFS];
    bool clip_halfz;
    bool fixed_point_size;
 
    uint64_t outputs_flat_shaded;
    uint64_t outputs_linear_shaded;
 };
 
+struct agx_vertex_elements {
+   unsigned num_attribs;
+   struct agx_velem_key key[PIPE_MAX_ATTRIBS];
+
+   /* These parts do not affect the generated code so are not in the key */
+   uint16_t src_offsets[PIPE_MAX_ATTRIBS];
+   uint16_t buffers[PIPE_MAX_ATTRIBS];
+};
+
 struct asahi_fs_shader_key {
    struct agx_blend_key blend;
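One payoff of the key/non-key split above (the helper below is illustrative, not driver code): vertex-elements state that differs only in src_offsets or buffer indices yields identical agx_velem_key arrays, so variant lookup can compare just the key and reuse one linked VS for both separate and interleaved layouts.

#include <string.h>
#include "agx_state.h" /* struct agx_vertex_elements, defined above */

/* Hypothetical helper: only the code-relevant part of the vertex elements
 * state needs to match for two states to share a compiled shader variant.
 */
static bool
velem_variants_compatible(const struct agx_vertex_elements *a,
                          const struct agx_vertex_elements *b)
{
   /* src_offsets[] and buffers[] are intentionally ignored: they only feed
    * the attrib_base/attrib_clamp uniforms uploaded at draw time.
    */
   return memcmp(a->key, b->key, sizeof(a->key)) == 0;
}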
@@ -429,7 +451,7 @@ struct asahi_tcs_shader_key {
    uint8_t index_size_B;
 
    /* Vertex shader key */
-   struct agx_attribute attribs[AGX_MAX_VBUFS];
+   struct agx_velem_key attribs[AGX_MAX_VBUFS];
 
    /* Tessellation control shaders must be linked with a vertex shader. */
    uint8_t input_nir_sha1[20];
@@ -440,7 +462,7 @@ struct asahi_gs_shader_key {
    struct agx_ia_key ia;
 
    /* Vertex shader key */
-   struct agx_attribute attribs[AGX_MAX_VBUFS];
+   struct agx_velem_key attribs[AGX_MAX_VBUFS];
 
    /* If true, this GS is run only for its side effects (including XFB) */
    bool rasterizer_discard;
@@ -561,7 +583,7 @@ struct agx_context {
    float default_inner_level[2];
 
    struct agx_stage stage[PIPE_SHADER_TYPES];
-   struct agx_attribute *attributes;
+   struct agx_vertex_elements *attributes;
    struct agx_rasterizer *rast;
    struct agx_zsa *zs;
    struct agx_blend *blend;


@@ -4,7 +4,9 @@
  */
 
 #include <stdio.h>
+#include "asahi/lib/agx_pack.h"
+#include "util/format/u_format.h"
 #include "agx_state.h"
+#include "pool.h"
 
 static uint64_t
 agx_const_buffer_ptr(struct agx_batch *batch, struct pipe_constant_buffer *cb)
@@ -38,7 +40,13 @@ void
 agx_upload_vbos(struct agx_batch *batch)
 {
    struct agx_context *ctx = batch->ctx;
+   struct agx_vertex_elements *attribs = ctx->attributes;
+   uint64_t buffers[PIPE_MAX_ATTRIBS] = {0};
+   size_t buf_sizes[PIPE_MAX_ATTRIBS] = {0};
 
+   /* TODO: To handle null vertex buffers, we use robustness always. Once we
+    * support soft fault in the kernel, we can optimize this.
+    */
    u_foreach_bit(vbo, ctx->vb_mask) {
       struct pipe_vertex_buffer vb = ctx->vertex_buffers[vbo];
       assert(!vb.is_user_buffer);
@@ -47,9 +55,51 @@ agx_upload_vbos(struct agx_batch *batch)
         struct agx_resource *rsrc = agx_resource(vb.buffer.resource);
         agx_batch_reads(batch, rsrc);
 
-        batch->uniforms.vbo_base[vbo] = rsrc->bo->ptr.gpu + vb.buffer_offset;
+        buffers[vbo] = rsrc->bo->ptr.gpu + vb.buffer_offset;
+        buf_sizes[vbo] = rsrc->layout.size_B - vb.buffer_offset;
+      }
+   }
+
+   for (unsigned i = 0; i < PIPE_MAX_ATTRIBS; ++i) {
+      unsigned buffer_size = buf_sizes[attribs->buffers[i]];
+
+      /* Determine the maximum vertex/divided instance index. For robustness,
+       * the index will be clamped to this before reading (if soft fault is
+       * disabled).
+       *
+       * Index i accesses up to (exclusive) offset:
+       *
+       *    src_offset + (i * stride) + elsize_B
+       *
+       * so we require
+       *
+       *    src_offset + (i * stride) + elsize_B <= size
+       *
+       * <==>
+       *
+       *    i <= floor((size - src_offset - elsize_B) / stride)
+       */
+      unsigned elsize_B = util_format_get_blocksize(attribs->key[i].format);
+      unsigned subtracted = attribs->src_offsets[i] + elsize_B;
+
+      if (buffer_size >= subtracted) {
+         /* At least one index is valid, determine the max. If this is zero,
+          * only 1 index is valid.
+          */
+         unsigned max_index =
+            (buffer_size - subtracted) / attribs->key[i].stride;
+
+         batch->uniforms.attrib_base[i] =
+            buffers[attribs->buffers[i]] + attribs->src_offsets[i];
+         batch->uniforms.attrib_clamp[i] = max_index;
      } else {
-        batch->uniforms.vbo_base[vbo] = 0;
+         /* No indices are valid. Direct reads to a single zero. */
+         uint32_t zeroes[4] = {0};
+         uint64_t sink = agx_pool_upload_aligned(&batch->pool, &zeroes, 16, 16);
+
+         batch->uniforms.attrib_base[i] = sink;
+         batch->uniforms.attrib_clamp[i] = 0;
      }
   }
 }
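To make the clamp computation above concrete, a worked example with made-up numbers (none of these values come from the commit):

#include <assert.h>

/* Hypothetical example: 1000 bytes of buffer remain after buffer_offset,
 * src_offset = 4, format R32G32B32A32_FLOAT (elsize_B = 16), stride = 32.
 */
static void
attrib_clamp_example(void)
{
   unsigned buffer_size = 1000, src_offset = 4, elsize_B = 16, stride = 32;

   unsigned subtracted = src_offset + elsize_B;              /* 20 */
   unsigned max_index = (buffer_size - subtracted) / stride; /* 980 / 32 = 30 */

   /* Index 30 reads bytes [964, 980), in bounds; index 31 would reach 1012. */
   assert(max_index == 30);
   assert(src_offset + max_index * stride + elsize_B <= buffer_size);
   assert(src_offset + (max_index + 1) * stride + elsize_B > buffer_size);
}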