radeonsi: move vertex buffer descriptors from IB to memory

This removes the intermediate storage (pm4 state) and generates descriptors
directly in a staging buffer.

It also reduces the number of flushes, because the descriptors no longer
take CS space.

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
This commit is contained in:
Marek Olšák 2014-07-09 04:00:53 +02:00
parent 1635ded828
commit 0ed0bf0696
7 changed files with 133 additions and 106 deletions

View file

@ -166,11 +166,13 @@ static void si_update_descriptors(struct si_context *sctx,
}
static void si_emit_shader_pointer(struct si_context *sctx,
struct si_descriptors *desc)
struct r600_atom *atom)
{
struct si_descriptors *desc = (struct si_descriptors*)atom;
struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
uint64_t va = r600_resource_va(sctx->b.b.screen, &desc->buffer->b.b) +
desc->current_context_id * desc->context_size;
desc->current_context_id * desc->context_size +
desc->buffer_offset;
radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
radeon_emit(cs, (desc->shader_userdata_reg - SI_SH_REG_OFFSET) >> 2);
@ -253,7 +255,7 @@ static void si_emit_descriptors(struct si_context *sctx,
desc->current_context_id = new_context_id;
/* Now update the shader userdata pointer. */
si_emit_shader_pointer(sctx, desc);
si_emit_shader_pointer(sctx, &desc->atom);
}
static unsigned si_get_shader_user_data_base(unsigned shader)
@ -330,7 +332,7 @@ static void si_sampler_views_begin_new_cs(struct si_context *sctx,
r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, views->desc.buffer,
RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);
si_emit_shader_pointer(sctx, &views->desc);
si_emit_shader_pointer(sctx, &views->desc.atom);
}
static void si_set_sampler_view(struct si_context *sctx, unsigned shader,
@ -432,7 +434,7 @@ static void si_sampler_states_begin_new_cs(struct si_context *sctx,
{
r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, states->desc.buffer,
RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);
si_emit_shader_pointer(sctx, &states->desc);
si_emit_shader_pointer(sctx, &states->desc.atom);
}
void si_set_sampler_descriptors(struct si_context *sctx, unsigned shader,
@ -533,9 +535,119 @@ static void si_buffer_resources_begin_new_cs(struct si_context *sctx,
buffers->desc.buffer, RADEON_USAGE_READWRITE,
RADEON_PRIO_SHADER_DATA);
si_emit_shader_pointer(sctx, &buffers->desc);
si_emit_shader_pointer(sctx, &buffers->desc.atom);
}
/* VERTEX BUFFERS */
static void si_vertex_buffers_begin_new_cs(struct si_context *sctx)
{
struct si_descriptors *desc = &sctx->vertex_buffers;
int count = sctx->vertex_elements ? sctx->vertex_elements->count : 0;
int i;
for (i = 0; i < count; i++) {
int vb = sctx->vertex_elements->elements[i].vertex_buffer_index;
if (vb >= sctx->nr_vertex_buffers)
continue;
if (!sctx->vertex_buffer[vb].buffer)
continue;
r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
(struct r600_resource*)sctx->vertex_buffer[vb].buffer,
RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO);
}
r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
desc->buffer, RADEON_USAGE_READ,
RADEON_PRIO_SHADER_DATA);
si_emit_shader_pointer(sctx, &desc->atom);
}
/* Upload the vertex buffer descriptors (4-dword T# buffer resources) to a
 * freshly allocated staging buffer and mark the shader-pointer atom dirty,
 * so that the pointer is (re-)emitted before the next draw.
 *
 * Vertex buffer descriptors are the only ones which are uploaded directly
 * through a staging buffer and don't go through the fine-grained upload
 * path. */
void si_update_vertex_buffers(struct si_context *sctx)
{
	struct pipe_context *ctx = &sctx->b.b;
	struct si_descriptors *desc = &sctx->vertex_buffers;
	bool bound[SI_NUM_VERTEX_BUFFERS] = {};
	unsigned i, count;
	uint64_t va;
	uint32_t *ptr;

	/* BUGFIX: test the pointer before dereferencing it. The previous code
	 * read sctx->vertex_elements->count first and only then checked
	 * sctx->vertex_elements for NULL, which made the check dead and the
	 * read undefined behavior when no vertex elements state is bound. */
	if (!sctx->vertex_elements)
		return;

	count = sctx->vertex_elements->count;
	if (!count)
		return;

	assert(count <= SI_NUM_VERTEX_BUFFERS);

	/* Allocate a staging area for "count" descriptors of 16 bytes each. */
	u_upload_alloc(sctx->b.uploader, 0, count * 16, &desc->buffer_offset,
		       (struct pipe_resource**)&desc->buffer, (void**)&ptr);

	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
			      desc->buffer, RADEON_USAGE_READ,
			      RADEON_PRIO_SHADER_DATA);

	assert(desc->current_context_id == 0);

	for (i = 0; i < count; i++) {
		struct pipe_vertex_element *ve = &sctx->vertex_elements->elements[i];
		struct pipe_vertex_buffer *vb;
		struct r600_resource *rbuffer;
		unsigned offset;
		/* Renamed from "desc" to avoid shadowing the outer
		 * si_descriptors pointer of the same name. */
		uint32_t *desc_data = &ptr[i*4];

		if (ve->vertex_buffer_index >= sctx->nr_vertex_buffers) {
			memset(desc_data, 0, 16);
			continue;
		}

		vb = &sctx->vertex_buffer[ve->vertex_buffer_index];
		rbuffer = (struct r600_resource*)vb->buffer;
		if (rbuffer == NULL) {
			memset(desc_data, 0, 16);
			continue;
		}

		offset = vb->buffer_offset + ve->src_offset;

		va = r600_resource_va(ctx->screen, (void*)rbuffer);
		va += offset;

		/* Fill in the T# buffer resource description. */
		desc_data[0] = va & 0xFFFFFFFF;
		desc_data[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
			       S_008F04_STRIDE(vb->stride);
		if (vb->stride)
			/* Round up by rounding down and adding 1 */
			desc_data[2] = (vb->buffer->width0 - offset -
					sctx->vertex_elements->format_size[i]) /
				       vb->stride + 1;
		else
			desc_data[2] = vb->buffer->width0 - offset;
		desc_data[3] = sctx->vertex_elements->rsrc_word3[i];

		/* Add the relocation only once per vertex buffer, even if it
		 * is referenced by several vertex elements. */
		if (!bound[ve->vertex_buffer_index]) {
			r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
					      (struct r600_resource*)vb->buffer,
					      RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO);
			bound[ve->vertex_buffer_index] = true;
		}
	}

	desc->atom.num_dw = 8; /* update 2 shader pointers (VS+ES) */
	desc->atom.dirty = true;

	/* Don't flush the const cache. It would have a very negative effect
	 * on performance (confirmed by testing). New descriptors are always
	 * uploaded to a fresh new buffer, so I don't think flushing the const
	 * cache is needed. */
	sctx->b.flags |= R600_CONTEXT_INV_TEX_CACHE;
}
/* CONSTANT BUFFERS */
void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuffer,
@ -1096,6 +1208,11 @@ void si_init_all_descriptors(struct si_context *sctx)
sctx->atoms.s.sampler_states[i] = &sctx->samplers[i].states.desc.atom;
}
si_init_descriptors(sctx, &sctx->vertex_buffers,
si_get_shader_user_data_base(PIPE_SHADER_VERTEX) +
SI_SGPR_VERTEX_BUFFER*4, 4, SI_NUM_VERTEX_BUFFERS,
si_emit_shader_pointer);
sctx->atoms.s.vertex_buffers = &sctx->vertex_buffers.atom;
/* Set pipe_context functions. */
sctx->b.b.set_constant_buffer = si_set_constant_buffer;
@ -1115,6 +1232,7 @@ void si_release_all_descriptors(struct si_context *sctx)
si_release_sampler_views(&sctx->samplers[i].views);
si_release_descriptors(&sctx->samplers[i].states.desc);
}
si_release_descriptors(&sctx->vertex_buffers);
}
void si_all_descriptors_begin_new_cs(struct si_context *sctx)
@ -1127,4 +1245,5 @@ void si_all_descriptors_begin_new_cs(struct si_context *sctx)
si_sampler_views_begin_new_cs(sctx, &sctx->samplers[i].views);
si_sampler_states_begin_new_cs(sctx, &sctx->samplers[i].states);
}
si_vertex_buffers_begin_new_cs(sctx);
}

View file

@ -97,6 +97,7 @@ struct si_context {
union {
struct {
/* The order matters. */
struct r600_atom *vertex_buffers;
struct r600_atom *const_buffers[SI_NUM_SHADERS];
struct r600_atom *rw_buffers[SI_NUM_SHADERS];
struct r600_atom *sampler_views[SI_NUM_SHADERS];
@ -124,9 +125,10 @@ struct si_context {
struct si_cs_shader_state cs_shader_state;
/* shader information */
unsigned sprite_coord_enable;
struct si_descriptors vertex_buffers;
struct si_buffer_resources const_buffers[SI_NUM_SHADERS];
struct si_buffer_resources rw_buffers[SI_NUM_SHADERS];
struct si_textures_info samplers[SI_NUM_SHADERS];
struct si_textures_info samplers[SI_NUM_SHADERS];
struct r600_resource *border_color_table;
unsigned border_color_offset;

View file

@ -103,37 +103,6 @@ void si_pm4_add_bo(struct si_pm4_state *state,
state->bo_priority[idx] = priority;
}
/* Start a run of inline SH data dwords inside a NOP packet; the run is
 * closed (and pointed at by a SET_SH_REG_OFFSET packet) by
 * si_pm4_sh_data_end(). NOTE(review): removed by this commit in favor of
 * memory-backed descriptors. */
void si_pm4_sh_data_begin(struct si_pm4_state *state)
{
si_pm4_cmd_begin(state, PKT3_NOP);
}
/* Append one dword to the SH data run opened by si_pm4_sh_data_begin().
 * NOTE(review): removed by this commit in favor of memory-backed
 * descriptors. */
void si_pm4_sh_data_add(struct si_pm4_state *state, uint32_t dw)
{
si_pm4_cmd_add(state, dw);
}
/* Close the SH data run started by si_pm4_sh_data_begin() and emit a
 * SET_SH_REG_OFFSET packet that points the user-data register
 * "base + idx * 4" at the data's location inside this IB.
 * NOTE(review): removed by this commit in favor of memory-backed
 * descriptors. */
void si_pm4_sh_data_end(struct si_pm4_state *state, unsigned base, unsigned idx)
{
unsigned offs = state->last_pm4 + 1;
unsigned reg = base + idx * 4;
/* Bail if no data was added */
if (state->ndw == offs) {
state->ndw--;
return;
}
si_pm4_cmd_end(state, false);
si_pm4_cmd_begin(state, PKT3_SET_SH_REG_OFFSET);
si_pm4_cmd_add(state, (reg - SI_SH_REG_OFFSET) >> 2);
/* Record this dword position as a reloc — presumably the winsys patches
 * the real IB address here at submit time; verify against the winsys. */
state->relocs[state->nrelocs++] = state->ndw;
si_pm4_cmd_add(state, offs << 2);
si_pm4_cmd_add(state, 0);
si_pm4_cmd_end(state, false);
}
void si_pm4_inval_shader_cache(struct si_pm4_state *state)
{
state->cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);

View file

@ -76,10 +76,6 @@ void si_pm4_add_bo(struct si_pm4_state *state,
enum radeon_bo_usage usage,
enum radeon_bo_priority priority);
void si_pm4_sh_data_begin(struct si_pm4_state *state);
void si_pm4_sh_data_add(struct si_pm4_state *state, uint32_t dw);
void si_pm4_sh_data_end(struct si_pm4_state *state, unsigned base, unsigned idx);
void si_pm4_inval_shader_cache(struct si_pm4_state *state);
void si_pm4_inval_texture_cache(struct si_pm4_state *state);

View file

@ -2800,6 +2800,7 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3])) |
S_008F0C_NUM_FORMAT(num_format) |
S_008F0C_DATA_FORMAT(data_format);
v->format_size[i] = desc->block.bits / 8;
}
memcpy(v->elements, elements, sizeof(struct pipe_vertex_element) * count);

View file

@ -72,6 +72,7 @@ struct si_vertex_element
{
unsigned count;
uint32_t rsrc_word3[PIPE_MAX_ATTRIBS];
uint32_t format_size[PIPE_MAX_ATTRIBS];
struct pipe_vertex_element elements[PIPE_MAX_ATTRIBS];
};
@ -97,7 +98,6 @@ union si_state {
struct si_pm4_state *vs;
struct si_pm4_state *ps;
struct si_pm4_state *spi;
struct si_pm4_state *vertex_buffers;
struct si_pm4_state *draw_info;
struct si_pm4_state *draw;
} named;
@ -147,6 +147,7 @@ struct si_descriptors {
/* The buffer where resource descriptors are stored. */
struct r600_resource *buffer;
unsigned buffer_offset;
/* The i-th bit is set if that element is dirty (changed but not emitted). */
unsigned dirty_mask;
@ -221,6 +222,7 @@ struct si_buffer_resources {
/* si_descriptors.c */
void si_set_sampler_descriptors(struct si_context *sctx, unsigned shader,
unsigned start, unsigned count, void **states);
void si_update_vertex_buffers(struct si_context *sctx);
void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
struct pipe_constant_buffer *input,
unsigned stride, unsigned num_records,

View file

@ -658,68 +658,6 @@ static void si_update_derived_state(struct si_context *sctx)
}
}
/* Old PM4-based vertex buffer path: builds the T# descriptors as inline
 * SH data in the IB via si_pm4_sh_data_begin/add/end.
 * NOTE(review): replaced by si_update_vertex_buffers() in this commit,
 * which writes the descriptors into a staging buffer instead. */
static void si_vertex_buffer_update(struct si_context *sctx)
{
struct pipe_context *ctx = &sctx->b.b;
struct si_pm4_state *pm4 = si_pm4_alloc_state(sctx);
bool bound[PIPE_MAX_ATTRIBS] = {};
unsigned i, count;
uint64_t va;
sctx->b.flags |= R600_CONTEXT_INV_TEX_CACHE;
/* NOTE(review): no NULL check on sctx->vertex_elements here — the caller
 * apparently guarantees a bound vertex elements state. */
count = sctx->vertex_elements->count;
assert(count <= 256 / 4);
si_pm4_sh_data_begin(pm4);
for (i = 0 ; i < count; i++) {
struct pipe_vertex_element *ve = &sctx->vertex_elements->elements[i];
struct pipe_vertex_buffer *vb;
struct r600_resource *rbuffer;
unsigned offset;
/* Skip elements referencing unbound or NULL vertex buffers; unlike the
 * replacement code, no zeroed descriptor is emitted for them. */
if (ve->vertex_buffer_index >= sctx->nr_vertex_buffers)
continue;
vb = &sctx->vertex_buffer[ve->vertex_buffer_index];
rbuffer = (struct r600_resource*)vb->buffer;
if (rbuffer == NULL)
continue;
offset = 0;
offset += vb->buffer_offset;
offset += ve->src_offset;
va = r600_resource_va(ctx->screen, (void*)rbuffer);
va += offset;
/* Fill in T# buffer resource description */
si_pm4_sh_data_add(pm4, va & 0xFFFFFFFF);
si_pm4_sh_data_add(pm4, (S_008F04_BASE_ADDRESS_HI(va >> 32) |
S_008F04_STRIDE(vb->stride)));
if (vb->stride)
/* Round up by rounding down and adding 1 */
si_pm4_sh_data_add(pm4,
(vb->buffer->width0 - offset -
util_format_get_blocksize(ve->src_format)) /
vb->stride + 1);
else
si_pm4_sh_data_add(pm4, vb->buffer->width0 - offset);
si_pm4_sh_data_add(pm4, sctx->vertex_elements->rsrc_word3[i]);
/* Add the buffer relocation only once per vertex buffer. */
if (!bound[ve->vertex_buffer_index]) {
si_pm4_add_bo(pm4, rbuffer, RADEON_USAGE_READ,
RADEON_PRIO_SHADER_BUFFER_RO);
bound[ve->vertex_buffer_index] = true;
}
}
/* The vertex buffer SGPR lives in the ES stage when a geometry shader is
 * active, otherwise in the VS stage. */
si_pm4_sh_data_end(pm4, sctx->gs_shader ?
R_00B330_SPI_SHADER_USER_DATA_ES_0 :
R_00B130_SPI_SHADER_USER_DATA_VS_0,
SI_SGPR_VERTEX_BUFFER);
si_pm4_set_state(sctx, vertex_buffers, pm4);
}
static void si_state_draw(struct si_context *sctx,
const struct pipe_draw_info *info,
const struct pipe_index_buffer *ib)
@ -954,7 +892,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
return;
si_update_derived_state(sctx);
si_vertex_buffer_update(sctx);
si_update_vertex_buffers(sctx);
if (info->indexed) {
/* Initialize the index buffer struct. */