mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-04-22 16:10:36 +02:00
freedreno/ir3: add transform-feedback support
Signed-off-by: Rob Clark <robclark@freedesktop.org>
This commit is contained in:
parent
96d4db683f
commit
98a4b111fb
4 changed files with 230 additions and 4 deletions
|
|
@ -227,9 +227,20 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
|
|||
|
||||
/* Stream output. */
|
||||
case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
|
||||
if (is_a3xx(screen) || is_a4xx(screen))
|
||||
return PIPE_MAX_SO_BUFFERS;
|
||||
return 0;
|
||||
case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
|
||||
if (is_a3xx(screen) || is_a4xx(screen))
|
||||
return 1;
|
||||
return 0;
|
||||
case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS:
|
||||
if (is_a3xx(screen) || is_a4xx(screen))
|
||||
return 16; /* should only be shader out limit? */
|
||||
return 0;
|
||||
case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS:
|
||||
if (is_a3xx(screen) || is_a4xx(screen))
|
||||
return 16; /* should only be shader out limit? */
|
||||
return 0;
|
||||
|
||||
/* Geometry shader output, unsupported. */
|
||||
|
|
|
|||
|
|
@ -263,6 +263,7 @@ compile_init(struct ir3_compiler *compiler,
|
|||
* 4 * vec4 - UBO addresses
|
||||
* if (vertex shader) {
|
||||
* 1 * vec4 - driver params (IR3_DP_*)
|
||||
* 1 * vec4 - stream-out addresses
|
||||
* }
|
||||
*
|
||||
* TODO this could be made more dynamic, to at least skip sections
|
||||
|
|
@ -275,6 +276,8 @@ compile_init(struct ir3_compiler *compiler,
|
|||
if (so->type == SHADER_VERTEX) {
|
||||
/* one (vec4) slot for driver params (see ir3_driver_param): */
|
||||
so->first_immediate++;
|
||||
/* one (vec4) slot for stream-output base addresses: */
|
||||
so->first_immediate++;
|
||||
}
|
||||
|
||||
return ctx;
|
||||
|
|
@ -1971,6 +1974,115 @@ emit_cf_list(struct ir3_compile *ctx, struct exec_list *list)
|
|||
}
|
||||
}
|
||||
|
||||
/* emit stream-out code. At this point, the current block is the original
 * (nir) end block, and nir ensures that all flow control paths terminate
 * into the end block. We re-purpose the original end block to generate
 * the 'if (vtxcnt < maxvtxcnt)' condition, then append the conditional
 * block holding stream-out write instructions, followed by the new end
 * block:
 *
 *   blockOrigEnd {
 *      p0.x = (vtxcnt < maxvtxcnt)
 *      // succs: blockStreamOut, blockNewEnd
 *   }
 *   blockStreamOut {
 *      ... stream-out instructions ...
 *      // succs: blockNewEnd
 *   }
 *   blockNewEnd {
 *   }
 */
static void
emit_stream_out(struct ir3_compile *ctx)
{
	struct ir3_shader_variant *v = ctx->so;
	struct ir3 *ir = ctx->ir;
	struct pipe_stream_output_info *strmout =
			&ctx->so->shader->stream_output;
	struct ir3_block *orig_end_block, *stream_out_block, *new_end_block;
	struct ir3_instruction *vtxcnt, *maxvtxcnt, *cond;
	struct ir3_instruction *bases[PIPE_MAX_SO_BUFFERS];

	/* create vtxcnt input in input block at top of shader,
	 * so that it is seen as live over the entire duration
	 * of the shader:
	 */
	vtxcnt = create_input(ctx->in_block, 0);
	add_sysval_input(ctx, IR3_SEMANTIC_VTXCNT, vtxcnt);

	/* max vertex count comes in as a driver param (uniform); the
	 * driver side computes it in max_tf_vtx():
	 */
	maxvtxcnt = create_driver_param(ctx, IR3_DP_VTXCNT_MAX);

	/* at this point, we are at the original 'end' block,
	 * re-purpose this block to stream-out condition, then
	 * append stream-out block and new-end block
	 */
	orig_end_block = ctx->block;

	stream_out_block = ir3_block_create(ir);
	list_addtail(&stream_out_block->node, &ir->block_list);

	new_end_block = ir3_block_create(ir);
	list_addtail(&new_end_block->node, &ir->block_list);

	/* wire up CFG: orig end conditionally falls into the stream-out
	 * block (succ[0], taken path) or directly to new end (succ[1]):
	 */
	orig_end_block->successors[0] = stream_out_block;
	orig_end_block->successors[1] = new_end_block;
	stream_out_block->successors[0] = new_end_block;

	/* setup 'if (vtxcnt < maxvtxcnt)' condition: */
	cond = ir3_CMPS_S(ctx->block, vtxcnt, 0, maxvtxcnt, 0);
	cond->regs[0]->num = regid(REG_P0, 0);
	cond->cat2.condition = IR3_COND_LT;

	/* condition goes on previous block to the conditional,
	 * since it is used to pick which of the two successor
	 * paths to take:
	 */
	orig_end_block->condition = cond;

	/* switch to stream_out_block to generate the stream-out
	 * instructions:
	 */
	ctx->block = stream_out_block;

	/* Calculate base addresses based on vtxcnt.  Instructions
	 * generated for bases not used in following loop will be
	 * stripped out in the backend.
	 */
	for (unsigned i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
		unsigned stride = strmout->stride[i];
		struct ir3_instruction *base, *off;

		/* NOTE(review): '+ 5' must match the const layout used by
		 * emit_tfbos() on the driver side (streamout addresses live
		 * one vec4 past the driver params) — keep the two in sync.
		 */
		base = create_uniform(ctx, regid(v->first_driver_param + 5, i));

		/* 24-bit should be enough: */
		off = ir3_MUL_U(ctx->block, vtxcnt, 0,
				create_immed(ctx->block, stride * 4), 0);

		bases[i] = ir3_ADD_S(ctx->block, off, 0, base, 0);
	}

	/* Generate the per-output store instructions: */
	for (unsigned i = 0; i < strmout->num_outputs; i++) {
		for (unsigned j = 0; j < strmout->output[i].num_components; j++) {
			unsigned c = j + strmout->output[i].start_component;
			struct ir3_instruction *base, *out, *stg;

			base = bases[strmout->output[i].output_buffer];
			out = ctx->ir->outputs[regid(strmout->output[i].register_index, c)];

			/* one component (4 bytes) per store: */
			stg = ir3_STG(ctx->block, base, 0, out, 0,
					create_immed(ctx->block, 1), 0);
			stg->cat6.type = TYPE_U32;
			stg->cat6.dst_offset = (strmout->output[i].dst_offset + j) * 4;

			/* stores have no SSA consumers, so pin them in 'keeps'
			 * so DCE does not strip them:
			 */
			array_insert(ctx->ir->keeps, stg);
		}
	}

	/* and finally switch to the new_end_block: */
	ctx->block = new_end_block;
}
|
||||
|
||||
static void
|
||||
emit_function(struct ir3_compile *ctx, nir_function_impl *impl)
|
||||
{
|
||||
|
|
@ -1981,6 +2093,24 @@ emit_function(struct ir3_compile *ctx, nir_function_impl *impl)
|
|||
* into which we emit the 'end' instruction.
|
||||
*/
|
||||
compile_assert(ctx, list_empty(&ctx->block->instr_list));
|
||||
|
||||
/* If stream-out (aka transform-feedback) enabled, emit the
|
||||
* stream-out instructions, followed by a new empty block (into
|
||||
* which the 'end' instruction lands).
|
||||
*
|
||||
* NOTE: it is done in this order, rather than inserting before
|
||||
* we emit end_block, because NIR guarantees that all blocks
|
||||
* flow into end_block, and that end_block has no successors.
|
||||
* So by re-purposing end_block as the first block of stream-
|
||||
* out, we guarantee that all exit paths flow into the stream-
|
||||
* out instructions.
|
||||
*/
|
||||
if ((ctx->so->shader->stream_output.num_outputs > 0) &&
|
||||
!ctx->so->key.binning_pass) {
|
||||
debug_assert(ctx->so->type == SHADER_VERTEX);
|
||||
emit_stream_out(ctx);
|
||||
}
|
||||
|
||||
ir3_END(ctx->block);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -466,10 +466,10 @@ static void
|
|||
emit_ubos(struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
|
||||
struct fd_constbuf_stateobj *constbuf)
|
||||
{
|
||||
if (v->constlen > v->first_driver_param) {
|
||||
uint32_t offset = v->first_driver_param; /* UBOs after user consts */
|
||||
if (v->constlen > offset) {
|
||||
struct fd_context *ctx = fd_context(v->shader->pctx);
|
||||
uint32_t offset = v->first_driver_param; /* UBOs after user consts */
|
||||
uint32_t params = MIN2(4, v->constlen - v->first_driver_param) * 4;
|
||||
uint32_t params = MIN2(4, v->constlen - offset) * 4;
|
||||
uint32_t offsets[params];
|
||||
struct fd_bo *bos[params];
|
||||
|
||||
|
|
@ -515,6 +515,83 @@ emit_immediates(struct ir3_shader_variant *v, struct fd_ringbuffer *ring)
|
|||
}
|
||||
}
|
||||
|
||||
/* emit stream-out buffers: uploads the per-buffer base addresses
 * (bo + byte offset) into the const slot that the shader's stream-out
 * code reads (first_driver_param + 5 — must match emit_stream_out()).
 * No-op if the variant's constlen does not reach that slot (i.e. the
 * stream-out consts were optimized away / not needed).
 */
static void
emit_tfbos(struct ir3_shader_variant *v, struct fd_ringbuffer *ring)
{
	uint32_t offset = v->first_driver_param + 5; /* streamout addresses after driver-params*/
	if (v->constlen > offset) {
		struct fd_context *ctx = fd_context(v->shader->pctx);
		struct fd_streamout_stateobj *so = &ctx->streamout;
		struct pipe_stream_output_info *info = &v->shader->stream_output;
		/* one address per possible SO buffer (one vec4 worth): */
		uint32_t params = 4;
		uint32_t offsets[params];
		struct fd_bo *bos[params];

		for (uint32_t i = 0; i < params; i++) {
			struct pipe_stream_output_target *target = so->targets[i];

			if (target) {
				/* resume offset: vertices already emitted times the
				 * per-vertex stride (dwords -> bytes), plus the bound
				 * target's own byte offset into the buffer:
				 */
				offsets[i] = (so->offsets[i] * info->stride[i] * 4) +
						target->buffer_offset;
				bos[i] = fd_resource(target->buffer)->bo;
			} else {
				/* unbound slot: shader side guards via maxvtxcnt, so
				 * a null address is never dereferenced:
				 */
				offsets[i] = 0;
				bos[i] = NULL;
			}
		}

		/* wait-for-idle before updating consts the GPU may be reading: */
		fd_wfi(ctx, ring);
		ctx->emit_const_bo(ring, v->type, true, offset * 4, params, bos, offsets);
	}
}
|
||||
|
||||
static uint32_t
|
||||
max_tf_vtx(struct ir3_shader_variant *v)
|
||||
{
|
||||
struct fd_context *ctx = fd_context(v->shader->pctx);
|
||||
struct fd_streamout_stateobj *so = &ctx->streamout;
|
||||
struct pipe_stream_output_info *info = &v->shader->stream_output;
|
||||
uint32_t maxvtxcnt = 0x7fffffff;
|
||||
|
||||
if (v->key.binning_pass)
|
||||
return 0;
|
||||
if (v->shader->stream_output.num_outputs == 0)
|
||||
return 0;
|
||||
if (so->num_targets == 0)
|
||||
return 0;
|
||||
|
||||
/* offset to write to is:
|
||||
*
|
||||
* total_vtxcnt = vtxcnt + offsets[i]
|
||||
* offset = total_vtxcnt * stride[i]
|
||||
*
|
||||
* offset = vtxcnt * stride[i] ; calculated in shader
|
||||
* + offsets[i] * stride[i] ; calculated at emit_tfbos()
|
||||
*
|
||||
* assuming for each vtx, each target buffer will have data written
|
||||
* up to 'offset + stride[i]', that leaves maxvtxcnt as:
|
||||
*
|
||||
* buffer_size = (maxvtxcnt * stride[i]) + stride[i]
|
||||
* maxvtxcnt = (buffer_size - stride[i]) / stride[i]
|
||||
*
|
||||
* but shader is actually doing a less-than (rather than less-than-
|
||||
* equal) check, so we can drop the -stride[i].
|
||||
*
|
||||
* TODO is assumption about `offset + stride[i]` legit?
|
||||
*/
|
||||
for (unsigned i = 0; i < so->num_targets; i++) {
|
||||
struct pipe_stream_output_target *target = so->targets[i];
|
||||
unsigned stride = info->stride[i] * 4; /* convert dwords->bytes */
|
||||
if (target) {
|
||||
uint32_t max = target->buffer_size / stride;
|
||||
maxvtxcnt = MIN2(maxvtxcnt, max);
|
||||
}
|
||||
}
|
||||
|
||||
return maxvtxcnt;
|
||||
}
|
||||
|
||||
void
|
||||
ir3_emit_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
|
||||
const struct pipe_draw_info *info, uint32_t dirty)
|
||||
|
|
@ -548,12 +625,19 @@ ir3_emit_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
|
|||
uint32_t offset = v->first_driver_param + 4; /* driver params after UBOs */
|
||||
if (v->constlen >= offset) {
|
||||
uint32_t vertex_params[4] = {
|
||||
[IR3_DP_VTXID_BASE] = info->indexed ? info->index_bias : info->start,
|
||||
[IR3_DP_VTXID_BASE] = info->indexed ?
|
||||
info->index_bias : info->start,
|
||||
[IR3_DP_VTXCNT_MAX] = max_tf_vtx(v),
|
||||
};
|
||||
|
||||
fd_wfi(ctx, ring);
|
||||
ctx->emit_const(ring, SHADER_VERTEX, offset * 4, 0,
|
||||
ARRAY_SIZE(vertex_params), vertex_params, NULL);
|
||||
|
||||
/* if needed, emit stream-out buffer addresses: */
|
||||
if (vertex_params[IR3_DP_VTXCNT_MAX] > 0) {
|
||||
emit_tfbos(v, ring);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -37,6 +37,7 @@
|
|||
/* driver param indices: dword slots within the single driver-params
 * vec4 that the driver uploads for vertex shaders (see emit_const /
 * vertex_params setup in ir3_emit_consts()):
 */
enum ir3_driver_param {
	IR3_DP_VTXID_BASE = 0,   /* index_bias for indexed draws, else draw start */
	IR3_DP_VTXCNT_MAX = 1,   /* stream-out vertex-count limit (max_tf_vtx()) */
};
|
||||
|
||||
/* internal semantic used for passing vtxcnt to vertex shader to
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue