diff --git a/src/asahi/lib/agx_nir_lower_vbo.c b/src/asahi/lib/agx_nir_lower_vbo.c
index b10359b2ce7..97c3e2fec57 100644
--- a/src/asahi/lib/agx_nir_lower_vbo.c
+++ b/src/asahi/lib/agx_nir_lower_vbo.c
@@ -171,7 +171,35 @@ pass(struct nir_builder *b, nir_instr *instr, void *data)
       el = nir_load_vertex_id(b);
    }
 
-   nir_def *base = nir_load_vbo_base_agx(b, nir_imm_int(b, attrib.buf));
+   /* VBO bases are now per-attribute rather than per-buffer. This allows
+    * memory sinks to work properly with robustness, lets the src_offset be
+    * folded into the VBO base to save an add in the shader, and reduces the
+    * size of the vertex fetch key. That last part allows reusing a linked VS
+    * with both separate and interleaved attributes.
+    */
+   nir_def *buf_handle = nir_imm_int(b, index);
+
+   /* Robustness is handled at the ID level */
+   nir_def *bounds = nir_load_attrib_clamp_agx(b, buf_handle);
+
+   /* For now, robustness is always applied. This gives GL robustness
+    * semantics. For robustBufferAccess2, we'll want to check for out-of-bounds
+    * access (where el > bounds) and replace base with the address of a zero
+    * sink. With soft fault and a large enough sink, we don't need to clamp the
+    * index, letting that robustness behaviour be implemented in 2 cmpsel
+    * before the load. That is faster than the 4 cmpsel required after the
+    * load, and it avoids waiting on the load, which should help prolog
+    * performance.
+    *
+    * TODO: Plumb through soft fault information to skip this.
+    *
+    * TODO: Add a knob for robustBufferAccess2 semantics.
+    */
+   bool robust = true;
+   if (robust) {
+      el = nir_umin(b, el, bounds);
+   }
+
+   nir_def *base = nir_load_vbo_base_agx(b, buf_handle);
 
    assert((stride % interchange_align) == 0 && "must be aligned");
    assert((offset % interchange_align) == 0 && "must be aligned");
diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index 4a676f06145..791982b6220 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -1839,10 +1839,16 @@ store("agx", [1, 1], [ACCESS, BASE, FORMAT, SIGN_EXTEND])
 # Logical complement of load_front_face, mapping to an AGX system value
 system_value("back_face_agx", 1, bit_sizes=[1, 32])
 
-# Load the base address of an indexed VBO (for lowering VBOs)
+# Load the base address of an indexed vertex attribute (for lowering).
 intrinsic("load_vbo_base_agx", src_comp=[1], dest_comp=1, bit_sizes=[64],
           flags=[CAN_ELIMINATE, CAN_REORDER])
 
+# When vertex robustness is enabled, loads the maximum valid attribute index
+# for a given attribute. This is unsigned: the driver ensures that at least one
+# vertex is always valid to load, directing loads to a zero sink if necessary.
+intrinsic("load_attrib_clamp_agx", src_comp=[1], dest_comp=1,
+          bit_sizes=[32], flags=[CAN_ELIMINATE, CAN_REORDER])
+
 # Load a driver-internal system value from a given system value set at a given
 # binding within the set. This is used for correctness when lowering things like
 # UBOs with merged shaders.
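As a sketch of the robustBufferAccess2 path described in the comment above (not part of this patch): with soft fault available, the out-of-bounds check would select a zero-sink base address instead of clamping the index. nir_ult and nir_bcsel are real NIR builder helpers; nir_load_robust_sink_agx is a hypothetical intrinsic standing in for however the sink address would be plumbed through.

    /* Hypothetical robustBufferAccess2 lowering (sketch, assumes soft fault).
     * The 64-bit bcsel on the base address is the "2 cmpsel before the load"
     * mentioned in the comment; nir_load_robust_sink_agx is made up here.
     */
    nir_def *oob = nir_ult(b, bounds, el);       /* true iff el > bounds */
    nir_def *sink = nir_load_robust_sink_agx(b); /* hypothetical sink address */
    nir_def *base =
       nir_bcsel(b, oob, sink, nir_load_vbo_base_agx(b, buf_handle));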
diff --git a/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c b/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c
index 53800dc6986..c1512f2df5f 100644
--- a/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c
+++ b/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c
@@ -137,8 +137,11 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *intr,
       return load_sysval_indirect(b, 1, 16, stage_table(b), &s->sampler_handle,
                                   intr->src[0].ssa);
    case nir_intrinsic_load_vbo_base_agx:
-      return load_sysval_indirect(b, 1, 64, AGX_SYSVAL_TABLE_ROOT, &u->vbo_base,
-                                  intr->src[0].ssa);
+      return load_sysval_indirect(b, 1, 64, AGX_SYSVAL_TABLE_ROOT,
+                                  &u->attrib_base, intr->src[0].ssa);
+   case nir_intrinsic_load_attrib_clamp_agx:
+      return load_sysval_indirect(b, 1, 32, AGX_SYSVAL_TABLE_ROOT,
+                                  &u->attrib_clamp, intr->src[0].ssa);
    case nir_intrinsic_load_blend_const_color_r_float:
       return load_sysval_root(b, 1, 32, &u->blend_constant[0]);
    case nir_intrinsic_load_blend_const_color_g_float:
diff --git a/src/gallium/drivers/asahi/agx_state.c b/src/gallium/drivers/asahi/agx_state.c
index 23daa24c2e2..8017e2aaad0 100644
--- a/src/gallium/drivers/asahi/agx_state.c
+++ b/src/gallium/drivers/asahi/agx_state.c
@@ -53,6 +53,7 @@
 #include "agx_device.h"
 #include "agx_disk_cache.h"
 #include "agx_nir_lower_gs.h"
+#include "agx_nir_lower_vbo.h"
 #include "agx_tilebuffer.h"
 #include "nir_builder.h"
 #include "nir_builder_opcodes.h"
@@ -1461,7 +1462,8 @@ agx_create_vertex_elements(struct pipe_context *ctx, unsigned count,
 {
    assert(count <= AGX_MAX_ATTRIBS);
 
-   struct agx_attribute *attribs = calloc(sizeof(*attribs), AGX_MAX_ATTRIBS);
+   struct agx_vertex_elements *so = calloc(1, sizeof(*so));
+
    for (unsigned i = 0; i < count; ++i) {
       const struct pipe_vertex_element ve = state[i];
 
@@ -1470,16 +1472,17 @@ agx_create_vertex_elements(struct pipe_context *ctx, unsigned count,
       unsigned chan_size = desc->channel[0].size / 8;
       assert((ve.src_offset & (chan_size - 1)) == 0);
 
-      attribs[i] = (struct agx_attribute){
-         .buf = ve.vertex_buffer_index,
-         .src_offset = ve.src_offset,
+      so->buffers[i] = ve.vertex_buffer_index;
+      so->src_offsets[i] = ve.src_offset;
+
+      so->key[i] = (struct agx_velem_key){
          .stride = ve.src_stride,
          .format = ve.src_format,
         .divisor = ve.instance_divisor,
       };
    }
 
-   return attribs;
+   return so;
 }
 
 static void
@@ -1836,6 +1839,22 @@ agx_nir_lower_poly_stipple(nir_shader *s)
    return true;
 }
 
+static bool
+lower_vbo(nir_shader *s, struct agx_velem_key *key)
+{
+   struct agx_attribute out[AGX_MAX_VBUFS];
+
+   for (unsigned i = 0; i < AGX_MAX_VBUFS; ++i) {
+      out[i] = (struct agx_attribute){
+         .divisor = key[i].divisor,
+         .stride = key[i].stride,
+         .format = key[i].format,
+      };
+   }
+
+   return agx_nir_lower_vbo(s, out);
+}
+
 /* Does not take ownership of key. Clones if necessary. */
 static struct agx_compiled_shader *
 agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx,
@@ -1864,7 +1883,7 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx,
    if (nir->info.stage == MESA_SHADER_VERTEX) {
       struct asahi_vs_shader_key *key = &key_->vs;
 
-      NIR_PASS(_, nir, agx_nir_lower_vbo, key->attribs);
+      NIR_PASS(_, nir, lower_vbo, key->attribs);
       NIR_PASS(_, nir, agx_nir_lower_point_size, key->fixed_point_size);
 
       if (should_lower_clip_m1_1(dev, key->clip_halfz)) {
@@ -1881,7 +1900,7 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx,
       nir_shader *vs = nir_deserialize(NULL, &agx_nir_options, &vs_reader);
 
       /* Apply the VS key to the VS before linking it in */
-      NIR_PASS_V(vs, agx_nir_lower_vbo, key->attribs);
+      NIR_PASS_V(vs, lower_vbo, key->attribs);
       NIR_PASS_V(vs, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
 
       NIR_PASS_V(vs, agx_nir_lower_sysvals, false);
@@ -1903,7 +1922,7 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx,
       nir_shader *vs = nir_deserialize(NULL, &agx_nir_options, &vs_reader);
 
       /* Apply the VS key to the VS before linking it in */
-      NIR_PASS(_, vs, agx_nir_lower_vbo, key->attribs);
+      NIR_PASS(_, vs, lower_vbo, key->attribs);
       NIR_PASS(_, vs, agx_nir_lower_ia, &key->ia);
 
       NIR_PASS(_, vs, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
@@ -2245,8 +2264,7 @@ agx_create_shader_state(struct pipe_context *pctx,
    switch (so->type) {
    case PIPE_SHADER_VERTEX: {
       for (unsigned i = 0; i < AGX_MAX_VBUFS; ++i) {
-         key.vs.attribs[i] = (struct agx_attribute){
-            .buf = i,
+         key.vs.attribs[i] = (struct agx_velem_key){
             .stride = 16,
             .format = PIPE_FORMAT_R32G32B32A32_FLOAT,
          };
@@ -2409,8 +2427,7 @@ agx_update_vs(struct agx_context *ctx)
       ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.inputs_linear_shaded,
    };
 
-   memcpy(key.attribs, ctx->attributes,
-          sizeof(key.attribs[0]) * AGX_MAX_ATTRIBS);
+   memcpy(key.attribs, &ctx->attributes->key, sizeof(key.attribs));
 
    return agx_update_shader(ctx, &ctx->vs, PIPE_SHADER_VERTEX,
                             (union asahi_shader_key *)&key);
@@ -2441,8 +2458,7 @@ agx_update_tcs(struct agx_context *ctx, const struct pipe_draw_info *info)
      .index_size_B = info->index_size,
    };
 
-   memcpy(key.attribs, ctx->attributes,
-          sizeof(key.attribs[0]) * AGX_MAX_ATTRIBS);
+   memcpy(key.attribs, &ctx->attributes->key, sizeof(key.attribs));
 
    static_assert(sizeof(key.input_nir_sha1) ==
                     sizeof(ctx->stage[PIPE_SHADER_VERTEX].shader->nir_sha1),
@@ -2491,8 +2507,7 @@ agx_update_gs(struct agx_context *ctx, const struct pipe_draw_info *info,
       .rasterizer_discard = ctx->rast->base.rasterizer_discard,
    };
 
-   memcpy(key.attribs, ctx->attributes,
-          sizeof(key.attribs[0]) * AGX_MAX_ATTRIBS);
+   memcpy(key.attribs, &ctx->attributes->key, sizeof(key.attribs));
 
    static_assert(sizeof(key.input_nir_sha1) ==
                     sizeof(ctx->stage[PIPE_SHADER_VERTEX].shader->nir_sha1),
diff --git a/src/gallium/drivers/asahi/agx_state.h b/src/gallium/drivers/asahi/agx_state.h
index e6a229bf836..7295c6f4fb0 100644
--- a/src/gallium/drivers/asahi/agx_state.h
+++ b/src/gallium/drivers/asahi/agx_state.h
@@ -100,8 +100,13 @@ struct PACKED agx_draw_uniforms {
    /* Pointers to the system value tables themselves (for indirection) */
    uint64_t tables[AGX_NUM_SYSVAL_TABLES];
 
-   /* Vertex buffer object bases, if present */
-   uint64_t vbo_base[PIPE_MAX_ATTRIBS];
+   /* Vertex buffer object bases, if present. If vertex robustness is
+    * disabled, attrib_base maps VBOs directly and attrib_clamp is undefined.
+    * If vertex robustness is enabled, attrib_base maps attributes and
+    * attrib_clamp is an inclusive clamp on vertex/divided instance indices.
+    */
+   uint64_t attrib_base[PIPE_MAX_ATTRIBS];
+   uint32_t attrib_clamp[PIPE_MAX_ATTRIBS];
 
    /* Address of input assembly buffer if geom/tess is used, else 0 */
    uint64_t input_assembly;
@@ -400,14 +405,31 @@ struct agx_blend {
    uint32_t store;
 };
 
+/* These parts of the vertex element affect the generated code */
+struct agx_velem_key {
+   uint32_t divisor;
+   uint16_t stride;
+   uint8_t format;
+   uint8_t pad;
+};
+
 struct asahi_vs_shader_key {
-   struct agx_attribute attribs[AGX_MAX_VBUFS];
+   struct agx_velem_key attribs[AGX_MAX_VBUFS];
    bool clip_halfz;
    bool fixed_point_size;
 
    uint64_t outputs_flat_shaded;
    uint64_t outputs_linear_shaded;
 };
 
+struct agx_vertex_elements {
+   unsigned num_attribs;
+   struct agx_velem_key key[PIPE_MAX_ATTRIBS];
+
+   /* These parts do not affect the generated code so are not in the key */
+   uint16_t src_offsets[PIPE_MAX_ATTRIBS];
+   uint16_t buffers[PIPE_MAX_ATTRIBS];
+};
+
 struct asahi_fs_shader_key {
    struct agx_blend_key blend;
 
@@ -429,7 +451,7 @@ struct asahi_tcs_shader_key {
    uint8_t index_size_B;
 
    /* Vertex shader key */
-   struct agx_attribute attribs[AGX_MAX_VBUFS];
+   struct agx_velem_key attribs[AGX_MAX_VBUFS];
 
    /* Tessellation control shaders must be linked with a vertex shader. */
    uint8_t input_nir_sha1[20];
@@ -440,7 +462,7 @@ struct asahi_gs_shader_key {
    struct agx_ia_key ia;
 
    /* Vertex shader key */
-   struct agx_attribute attribs[AGX_MAX_VBUFS];
+   struct agx_velem_key attribs[AGX_MAX_VBUFS];
 
    /* If true, this GS is run only for its side effects (including XFB) */
    bool rasterizer_discard;
@@ -561,7 +583,7 @@ struct agx_context {
    float default_inner_level[2];
 
    struct agx_stage stage[PIPE_SHADER_TYPES];
-   struct agx_attribute *attributes;
+   struct agx_vertex_elements *attributes;
    struct agx_rasterizer *rast;
    struct agx_zsa *zs;
    struct agx_blend *blend;
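A note on the agx_velem_key layout above: the explicit pad byte means the struct has no compiler-inserted padding, so shader keys containing it can be hashed and compared bytewise without touching uninitialized memory. A minimal standalone sketch of that property (not part of the patch; velem_keys_equal is a made-up helper for illustration):

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <string.h>

    struct agx_velem_key {
       uint32_t divisor;
       uint16_t stride;
       uint8_t format;
       uint8_t pad;
    };

    /* 4 + 2 + 1 + 1 = 8 bytes, no implicit padding */
    static_assert(sizeof(struct agx_velem_key) == 8, "tightly packed");

    /* Bytewise comparison is safe only because every byte, including pad, is
     * zero-initialized by the designated initializers in
     * agx_create_vertex_elements().
     */
    static bool
    velem_keys_equal(const struct agx_velem_key *a,
                     const struct agx_velem_key *b, unsigned n)
    {
       return memcmp(a, b, n * sizeof(*a)) == 0;
    }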
diff --git a/src/gallium/drivers/asahi/agx_uniforms.c b/src/gallium/drivers/asahi/agx_uniforms.c
index 9707d04b4b4..ee0c7bda668 100644
--- a/src/gallium/drivers/asahi/agx_uniforms.c
+++ b/src/gallium/drivers/asahi/agx_uniforms.c
@@ -4,7 +4,9 @@
  */
 #include <stdio.h>
 #include "asahi/lib/agx_pack.h"
+#include "util/format/u_format.h"
 #include "agx_state.h"
+#include "pool.h"
 
 static uint64_t
 agx_const_buffer_ptr(struct agx_batch *batch, struct pipe_constant_buffer *cb)
@@ -38,7 +40,13 @@ void
 agx_upload_vbos(struct agx_batch *batch)
 {
    struct agx_context *ctx = batch->ctx;
+   struct agx_vertex_elements *attribs = ctx->attributes;
+   uint64_t buffers[PIPE_MAX_ATTRIBS] = {0};
+   size_t buf_sizes[PIPE_MAX_ATTRIBS] = {0};
 
+   /* TODO: To handle null vertex buffers, we use robustness always. Once we
+    * support soft fault in the kernel, we can optimize this.
+    */
    u_foreach_bit(vbo, ctx->vb_mask) {
       struct pipe_vertex_buffer vb = ctx->vertex_buffers[vbo];
       assert(!vb.is_user_buffer);
@@ -47,9 +55,51 @@ agx_upload_vbos(struct agx_batch *batch)
          struct agx_resource *rsrc = agx_resource(vb.buffer.resource);
          agx_batch_reads(batch, rsrc);
 
-         batch->uniforms.vbo_base[vbo] = rsrc->bo->ptr.gpu + vb.buffer_offset;
+         buffers[vbo] = rsrc->bo->ptr.gpu + vb.buffer_offset;
+         buf_sizes[vbo] = rsrc->layout.size_B - vb.buffer_offset;
+      }
+   }
+
+   for (unsigned i = 0; i < PIPE_MAX_ATTRIBS; ++i) {
+      unsigned buffer_size = buf_sizes[attribs->buffers[i]];
+
+      /* Determine the maximum vertex/divided instance index. For robustness,
+       * the index will be clamped to this before reading (if soft fault is
+       * disabled).
+       *
+       * Index i accesses up to (exclusive) offset:
+       *
+       *    src_offset + (i * stride) + elsize_B
+       *
+       * so we require
+       *
+       *    src_offset + (i * stride) + elsize_B <= size
+       *
+       * <==>
+       *
+       *    i <= floor((size - src_offset - elsize_B) / stride)
+       */
+      unsigned elsize_B = util_format_get_blocksize(attribs->key[i].format);
+      unsigned subtracted = attribs->src_offsets[i] + elsize_B;
+
+      if (buffer_size >= subtracted) {
+         /* At least one index is valid; determine the max. If max_index is
+          * zero, only index 0 is valid.
+          */
+         unsigned max_index =
+            (buffer_size - subtracted) / attribs->key[i].stride;
+
+         batch->uniforms.attrib_base[i] =
+            buffers[attribs->buffers[i]] + attribs->src_offsets[i];
+
+         batch->uniforms.attrib_clamp[i] = max_index;
       } else {
-         batch->uniforms.vbo_base[vbo] = 0;
+         /* No indices are valid. Redirect reads to a zero sink. */
+         uint32_t zeroes[4] = {0};
+         uint64_t sink = agx_pool_upload_aligned(&batch->pool, &zeroes, 16, 16);
+
+         batch->uniforms.attrib_base[i] = sink;
+         batch->uniforms.attrib_clamp[i] = 0;
       }
    }
 }
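To make the clamp arithmetic in agx_upload_vbos() concrete, here is a worked example with made-up numbers (a 100-byte buffer, src_offset 4, a 16-byte R32G32B32A32_FLOAT element, stride 16). It mirrors the patch's formula but is not part of it:

    #include <stdio.h>

    int main(void)
    {
       unsigned buffer_size = 100; /* bytes remaining after buffer_offset */
       unsigned src_offset = 4, elsize_B = 16, stride = 16;
       unsigned subtracted = src_offset + elsize_B; /* 20 */

       if (buffer_size >= subtracted) {
          /* floor((100 - 20) / 16) = 5: index 5 reads bytes [84, 100), the
           * last element that still fits, so attrib_clamp = 5 (inclusive).
           * Index 6 would read [100, 116), past the end of the buffer.
           */
          unsigned max_index = (buffer_size - subtracted) / stride;
          printf("attrib_clamp = %u\n", max_index);
       } else {
          /* No index fits: attrib_base is pointed at a 16-byte zero sink and
           * attrib_clamp = 0, so every fetch reads zeroes.
           */
          printf("redirect to zero sink\n");
       }
       return 0;
    }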