radeonsi: completely rework updating descriptors without CP DMA
The patch has a better explanation. Just a summary here:

- The CPU always uploads a whole descriptor array to previously-unused memory.
- CP DMA isn't used.
- No caches need to be flushed.
- All descriptors are always up-to-date in memory even after a hang, because CP DMA doesn't serve as a middle man to update them.

This should bring:

- better hang recovery (descriptors are always up-to-date)
- better GPU performance (no KCACHE and TC flushes)
- worse CPU performance for partial updates (only whole arrays are uploaded)
- less used IB space (no CP_DMA and WRITE_DATA packets)
- simpler code
- hopefully, some of the corruption issues with SI cards will go away. If not, we'll know the issue is not here.

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
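For illustration, a minimal C sketch of the scheme summarized above, assuming a u_upload_mgr-style sub-allocator. This is not the driver's actual API: upload_alloc() is a hypothetical stand-in for Mesa's u_upload_alloc (the real code also byte-swaps with util_memcpy_cpu_to_le32), and desc_list is a stripped-down si_descriptors.

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

struct desc_list {
	uint32_t *list;       /* CPU copy of every descriptor slot */
	unsigned num_dwords;  /* total size of the list in dwords */
	bool list_dirty;      /* set whenever any slot is written */
	uint64_t gpu_address; /* where the current copy lives in GPU memory */
};

/* Hypothetical sub-allocator: returns a CPU pointer to previously-unused
 * GPU-visible memory and reports its GPU address. */
void *upload_alloc(unsigned size, uint64_t *gpu_address);

/* Binding a resource only writes the CPU copy and sets the dirty flag;
 * GPU memory is not touched yet. */
static void set_slot(struct desc_list *d, unsigned slot, unsigned dw_per_slot,
		     const uint32_t *desc)
{
	memcpy(d->list + slot * dw_per_slot, desc, dw_per_slot * 4);
	d->list_dirty = true;
}

/* Called before each draw: re-upload the WHOLE list into fresh memory.
 * The previous copy is never overwritten, so in-flight work (or a
 * post-hang dump) still sees consistent descriptors, and no cache flush
 * is needed because shaders have never fetched from the new location. */
static bool upload_if_dirty(struct desc_list *d)
{
	if (!d->list_dirty)
		return true;

	void *ptr = upload_alloc(d->num_dwords * 4, &d->gpu_address);
	if (!ptr)
		return false; /* skip the draw call */

	memcpy(ptr, d->list, d->num_dwords * 4);
	d->list_dirty = false;
	/* the shader user-data pointer to the list must now be re-emitted */
	return true;
}

The trade-off named above falls out directly from this shape: a partial update costs a full-list upload, but there is no CP DMA middle man and nothing to flush.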
This commit is contained in:
parent 781dc7c0e1
commit b0528118df
4 changed files with 129 additions and 272 deletions
@@ -24,14 +24,23 @@
 * Marek Olšák <marek.olsak@amd.com>
 */

/* Resource binding slots and sampler states (each described with 8 or 4 dwords)
 * live in memory on SI.
/* Resource binding slots and sampler states (each described with 8 or
 * 4 dwords) are stored in lists in memory which is accessed by shaders
 * using scalar load instructions.
 *
 * This file is responsible for managing lists of resources and sampler states
 * in memory and binding them, which means updating those structures in memory.
 * This file is responsible for managing such lists. It keeps a copy of all
 * descriptors in CPU memory and re-uploads a whole list if some slots have
 * been changed.
 *
 * There is also code for updating shader pointers to resources and sampler
 * states. CP DMA functions are here too.
 * This code is also responsible for updating shader pointers to those lists.
 *
 * Note that CP DMA can't be used for updating the lists, because a GPU hang
 * could leave the list in a mid-IB state and the next IB would get wrong
 * descriptors and the whole context would be unusable at that point.
 * (Note: The register shadowing can't be used due to the same reason)
 *
 * Also, uploading descriptors to newly allocated memory doesn't require
 * a KCACHE flush.
 */

#include "radeon/r600_cs.h"
@@ -42,7 +51,6 @@
#include "util/u_memory.h"
#include "util/u_upload_mgr.h"

#define SI_NUM_CONTEXTS 16

/* NULL image and buffer descriptor.
 *
@@ -139,159 +147,62 @@ static void si_emit_cp_dma_clear_buffer(struct si_context *sctx,
	}
}

static void si_init_descriptors(struct si_context *sctx,
				struct si_descriptors *desc,
static void si_init_descriptors(struct si_descriptors *desc,
				unsigned shader_userdata_index,
				unsigned element_dw_size,
				unsigned num_elements,
				void (*emit_func)(struct si_context *ctx, struct r600_atom *state))
				unsigned num_elements)
{
	assert(num_elements <= sizeof(desc->enabled_mask)*8);
	assert(num_elements <= sizeof(desc->dirty_mask)*8);
	int i;

	desc->atom.emit = (void*)emit_func;
	desc->shader_userdata_offset = shader_userdata_index * 4;
	assert(num_elements <= sizeof(desc->enabled_mask)*8);

	desc->list = CALLOC(num_elements, element_dw_size * 4);
	desc->element_dw_size = element_dw_size;
	desc->num_elements = num_elements;
	desc->context_size = num_elements * element_dw_size * 4;
	desc->list_dirty = true; /* upload the list before the next draw */
	desc->shader_userdata_offset = shader_userdata_index * 4;

	desc->buffer = (struct r600_resource*)
		pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM,
				   PIPE_USAGE_DEFAULT,
				   SI_NUM_CONTEXTS * desc->context_size);

	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, desc->buffer,
			      RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);

	/* We don't check for CS space here, because this should be called
	 * only once at context initialization. */
	si_emit_cp_dma_clear_buffer(sctx, desc->buffer->gpu_address,
				    desc->buffer->b.b.width0, 0,
				    R600_CP_DMA_SYNC | CIK_CP_DMA_USE_L2);
	/* Initialize the array to NULL descriptors if the element size is 8. */
	if (element_dw_size == 8)
		for (i = 0; i < num_elements; i++)
			memcpy(desc->list + i*element_dw_size, null_descriptor,
			       sizeof(null_descriptor));
}

static void si_release_descriptors(struct si_descriptors *desc)
{
	pipe_resource_reference((struct pipe_resource**)&desc->buffer, NULL);
	FREE(desc->list);
}

static void si_update_descriptors(struct si_context *sctx,
static bool si_upload_descriptors(struct si_context *sctx,
				  struct si_descriptors *desc)
{
	if (desc->dirty_mask) {
		desc->atom.num_dw =
			7 + /* copy */
			(4 + desc->element_dw_size) * util_bitcount(desc->dirty_mask); /* update */
	unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
	void *ptr;

		desc->atom.dirty = true;
		desc->pointer_dirty = true;
		sctx->shader_userdata.atom.dirty = true;
	if (!desc->list_dirty)
		return true;

		/* TODO: Investigate if these flushes can be removed after
		 * adding CE support. */
	u_upload_alloc(sctx->b.uploader, 0, list_size,
		       &desc->buffer_offset,
		       (struct pipe_resource**)&desc->buffer, &ptr);
	if (!desc->buffer)
		return false; /* skip the draw call */

		/* The descriptors are read with the K cache. */
		sctx->b.flags |= SI_CONTEXT_INV_KCACHE;
	util_memcpy_cpu_to_le32(ptr, desc->list, list_size);

		/* Since SI uses uncached CP DMA to update descriptors,
		 * we have to flush TC L2, which is used to fetch constants
		 * along with KCACHE. */
		if (sctx->b.chip_class == SI)
			sctx->b.flags |= SI_CONTEXT_INV_TC_L2;
	} else {
		desc->atom.dirty = false;
	}
}
	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, desc->buffer,
			      RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA);

static void si_emit_descriptors(struct si_context *sctx,
				struct si_descriptors *desc,
				uint32_t **descriptors)
{
	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
	uint64_t va_base;
	int packet_start = 0;
	int packet_size = 0;
	int last_index = desc->num_elements; /* point to a non-existing element */
	uint64_t dirty_mask = desc->dirty_mask;
	unsigned new_context_id = (desc->current_context_id + 1) % SI_NUM_CONTEXTS;

	assert(dirty_mask);

	va_base = desc->buffer->gpu_address;

	/* Copy the descriptors to a new context slot. */
	si_emit_cp_dma_copy_buffer(sctx,
				   va_base + new_context_id * desc->context_size,
				   va_base + desc->current_context_id * desc->context_size,
				   desc->context_size, R600_CP_DMA_SYNC | CIK_CP_DMA_USE_L2);

	va_base += new_context_id * desc->context_size;

	/* Update the descriptors.
	 * Updates of consecutive descriptors are merged to one WRITE_DATA packet.
	 *
	 * XXX When unbinding lots of resources, consider clearing the memory
	 * with CP DMA instead of emitting zeros.
	 */
	while (dirty_mask) {
		int i = u_bit_scan64(&dirty_mask);

		assert(i < desc->num_elements);

		if (last_index+1 == i && packet_size) {
			/* Append new data at the end of the last packet. */
			packet_size += desc->element_dw_size;
			cs->buf[packet_start] = PKT3(PKT3_WRITE_DATA, packet_size, 0);
		} else {
			/* Start a new packet. */
			uint64_t va = va_base + i * desc->element_dw_size * 4;

			packet_start = cs->cdw;
			packet_size = 2 + desc->element_dw_size;

			radeon_emit(cs, PKT3(PKT3_WRITE_DATA, packet_size, 0));
			radeon_emit(cs, PKT3_WRITE_DATA_DST_SEL(sctx->b.chip_class == SI ?
							PKT3_WRITE_DATA_DST_SEL_MEM_SYNC :
							PKT3_WRITE_DATA_DST_SEL_TC_L2) |
					PKT3_WRITE_DATA_WR_CONFIRM |
					PKT3_WRITE_DATA_ENGINE_SEL(PKT3_WRITE_DATA_ENGINE_SEL_ME));
			radeon_emit(cs, va & 0xFFFFFFFFUL);
			radeon_emit(cs, (va >> 32UL) & 0xFFFFFFFFUL);
		}

		radeon_emit_array(cs, descriptors[i], desc->element_dw_size);

		last_index = i;
	}

	desc->dirty_mask = 0;
	desc->current_context_id = new_context_id;
	desc->list_dirty = false;
	desc->pointer_dirty = true;
	sctx->shader_userdata.atom.dirty = true;
	return true;
}

/* SAMPLER VIEWS */

static void si_emit_sampler_views(struct si_context *sctx, struct r600_atom *atom)
{
	struct si_sampler_views *views = (struct si_sampler_views*)atom;

	si_emit_descriptors(sctx, &views->desc, views->desc_data);
}

static void si_init_sampler_views(struct si_context *sctx,
				  struct si_sampler_views *views)
{
	int i;

	si_init_descriptors(sctx, &views->desc, SI_SGPR_RESOURCE,
			    8, SI_NUM_SAMPLER_VIEWS, si_emit_sampler_views);

	for (i = 0; i < views->desc.num_elements; i++) {
		views->desc_data[i] = null_descriptor;
		views->desc.dirty_mask |= 1llu << i;
	}
	si_update_descriptors(sctx, &views->desc);
}

static void si_release_sampler_views(struct si_sampler_views *views)
{
	int i;
@@ -332,6 +243,8 @@ static void si_sampler_views_begin_new_cs(struct si_context *sctx,
					   si_get_resource_ro_priority(rview->resource));
	}

	if (!views->desc.buffer)
		return;
	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, views->desc.buffer,
			      RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);
}
@@ -354,17 +267,16 @@ static void si_set_sampler_view(struct si_context *sctx, unsigned shader,
				      rview->resource, RADEON_USAGE_READ,
				      si_get_resource_ro_priority(rview->resource));

		pipe_sampler_view_reference(&views->views[slot], view);
		views->desc_data[slot] = view_desc;
		memcpy(views->desc.list + slot*8, view_desc, 8*4);
		views->desc.enabled_mask |= 1llu << slot;
	} else {
		pipe_sampler_view_reference(&views->views[slot], NULL);
		views->desc_data[slot] = null_descriptor;
		memcpy(views->desc.list + slot*8, null_descriptor, 8*4);
		views->desc.enabled_mask &= ~(1llu << slot);
	}

	views->desc.dirty_mask |= 1llu << slot;
	views->desc.list_dirty = true;
}

static void si_set_sampler_views(struct pipe_context *ctx,
@@ -423,22 +335,15 @@ static void si_set_sampler_views(struct pipe_context *ctx,
					   NULL, NULL);
		}
	}

	si_update_descriptors(sctx, &samplers->views.desc);
}

/* SAMPLER STATES */

static void si_emit_sampler_states(struct si_context *sctx, struct r600_atom *atom)
{
	struct si_sampler_states *states = (struct si_sampler_states*)atom;

	si_emit_descriptors(sctx, &states->desc, states->desc_data);
}

static void si_sampler_states_begin_new_cs(struct si_context *sctx,
					   struct si_sampler_states *states)
{
	if (!states->desc.buffer)
		return;
	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, states->desc.buffer,
			      RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);
}
@@ -460,64 +365,39 @@ void si_set_sampler_descriptors(struct si_context *sctx, unsigned shader,
	for (i = 0; i < count; i++) {
		unsigned slot = start + i;

		if (!sstates[i]) {
			samplers->desc.dirty_mask &= ~(1llu << slot);
		if (!sstates[i])
			continue;
		}

		samplers->desc_data[slot] = sstates[i]->val;
		samplers->desc.dirty_mask |= 1llu << slot;
		memcpy(samplers->desc.list + slot*4, sstates[i]->val, 4*4);
		samplers->desc.list_dirty = true;
	}

	si_update_descriptors(sctx, &samplers->desc);
}

/* BUFFER RESOURCES */

static void si_emit_buffer_resources(struct si_context *sctx, struct r600_atom *atom)
{
	struct si_buffer_resources *buffers = (struct si_buffer_resources*)atom;

	si_emit_descriptors(sctx, &buffers->desc, buffers->desc_data);
}

static void si_init_buffer_resources(struct si_context *sctx,
				     struct si_buffer_resources *buffers,
static void si_init_buffer_resources(struct si_buffer_resources *buffers,
				     unsigned num_buffers,
				     unsigned shader_userdata_index,
				     enum radeon_bo_usage shader_usage,
				     enum radeon_bo_priority priority)
{
	int i;

	buffers->num_buffers = num_buffers;
	buffers->shader_usage = shader_usage;
	buffers->priority = priority;
	buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*));
	buffers->desc_storage = CALLOC(num_buffers, sizeof(uint32_t) * 4);

	/* si_emit_descriptors only accepts an array of arrays.
	 * This adds such an array. */
	buffers->desc_data = CALLOC(num_buffers, sizeof(uint32_t*));
	for (i = 0; i < num_buffers; i++) {
		buffers->desc_data[i] = &buffers->desc_storage[i*4];
	}

	si_init_descriptors(sctx, &buffers->desc, shader_userdata_index, 4,
			    num_buffers, si_emit_buffer_resources);
	si_init_descriptors(&buffers->desc, shader_userdata_index, 4,
			    num_buffers);
}

static void si_release_buffer_resources(struct si_buffer_resources *buffers)
{
	int i;

	for (i = 0; i < buffers->num_buffers; i++) {
	for (i = 0; i < buffers->desc.num_elements; i++) {
		pipe_resource_reference(&buffers->buffers[i], NULL);
	}

	FREE(buffers->buffers);
	FREE(buffers->desc_storage);
	FREE(buffers->desc_data);
	si_release_descriptors(&buffers->desc);
}
@@ -535,6 +415,8 @@ static void si_buffer_resources_begin_new_cs(struct si_context *sctx,
				      buffers->shader_usage, buffers->priority);
	}

	if (!buffers->desc.buffer)
		return;
	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
			      buffers->desc.buffer, RADEON_USAGE_READWRITE,
			      RADEON_PRIO_SHADER_DATA);
@@ -560,12 +442,15 @@ static void si_vertex_buffers_begin_new_cs(struct si_context *sctx)
			      (struct r600_resource*)sctx->vertex_buffer[vb].buffer,
			      RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO);
	}

	if (!desc->buffer)
		return;
	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
			      desc->buffer, RADEON_USAGE_READ,
			      RADEON_PRIO_SHADER_DATA);
}

void si_update_vertex_buffers(struct si_context *sctx)
static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
{
	struct si_descriptors *desc = &sctx->vertex_buffers;
	bool bound[SI_NUM_VERTEX_BUFFERS] = {};
@@ -573,8 +458,10 @@ void si_update_vertex_buffers(struct si_context *sctx)
	uint64_t va;
	uint32_t *ptr;

	if (!sctx->vertex_buffers_dirty)
		return true;
	if (!count || !sctx->vertex_elements)
		return;
		return true;

	/* Vertex buffer descriptors are the only ones which are uploaded
	 * directly through a staging buffer and don't go through
@@ -582,13 +469,14 @@ void si_update_vertex_buffers(struct si_context *sctx)
	 */
	u_upload_alloc(sctx->b.uploader, 0, count * 16, &desc->buffer_offset,
		       (struct pipe_resource**)&desc->buffer, (void**)&ptr);
	if (!desc->buffer)
		return false;

	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
			      desc->buffer, RADEON_USAGE_READ,
			      RADEON_PRIO_SHADER_DATA);

	assert(count <= SI_NUM_VERTEX_BUFFERS);
	assert(desc->current_context_id == 0);

	for (i = 0; i < count; i++) {
		struct pipe_vertex_element *ve = &sctx->vertex_elements->elements[i];
@@ -640,6 +528,8 @@ void si_update_vertex_buffers(struct si_context *sctx)
	 * cache is needed. */
	desc->pointer_dirty = true;
	sctx->shader_userdata.atom.dirty = true;
	sctx->vertex_buffers_dirty = false;
	return true;
}
@@ -664,7 +554,7 @@ static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint s
	if (shader >= SI_NUM_SHADERS)
		return;

	assert(slot < buffers->num_buffers);
	assert(slot < buffers->desc.num_elements);
	pipe_resource_reference(&buffers->buffers[slot], NULL);

	/* CIK cannot unbind a constant buffer (S_BUFFER_LOAD is buggy
@@ -691,7 +581,7 @@ static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint s
	}

	/* Set the descriptor. */
	uint32_t *desc = buffers->desc_data[slot];
	uint32_t *desc = buffers->desc.list + slot*4;
	desc[0] = va;
	desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
		  S_008F04_STRIDE(0);
@@ -710,12 +600,11 @@ static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint s
		buffers->desc.enabled_mask |= 1llu << slot;
	} else {
		/* Clear the descriptor. */
		memset(buffers->desc_data[slot], 0, sizeof(uint32_t) * 4);
		memset(buffers->desc.list + slot*4, 0, sizeof(uint32_t) * 4);
		buffers->desc.enabled_mask &= ~(1llu << slot);
	}

	buffers->desc.dirty_mask |= 1llu << slot;
	si_update_descriptors(sctx, &buffers->desc);
	buffers->desc.list_dirty = true;
}

/* RING BUFFERS */
@@ -735,7 +624,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
	/* The stride field in the resource descriptor has 14 bits */
	assert(stride < (1 << 14));

	assert(slot < buffers->num_buffers);
	assert(slot < buffers->desc.num_elements);
	pipe_resource_reference(&buffers->buffers[slot], NULL);

	if (buffer) {
@@ -780,7 +669,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
	}

	/* Set the descriptor. */
	uint32_t *desc = buffers->desc_data[slot];
	uint32_t *desc = buffers->desc.list + slot*4;
	desc[0] = va;
	desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
		  S_008F04_STRIDE(stride) |
@@ -803,12 +692,11 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
		buffers->desc.enabled_mask |= 1llu << slot;
	} else {
		/* Clear the descriptor. */
		memset(buffers->desc_data[slot], 0, sizeof(uint32_t) * 4);
		memset(buffers->desc.list + slot*4, 0, sizeof(uint32_t) * 4);
		buffers->desc.enabled_mask &= ~(1llu << slot);
	}

	buffers->desc.dirty_mask |= 1llu << slot;
	si_update_descriptors(sctx, &buffers->desc);
	buffers->desc.list_dirty = true;
}

/* STREAMOUT BUFFERS */
@@ -870,7 +758,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
			uint64_t va = r600_resource(buffer)->gpu_address;

			/* Set the descriptor. */
			uint32_t *desc = buffers->desc_data[bufidx];
			uint32_t *desc = buffers->desc.list + bufidx*4;
			desc[0] = va;
			desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
			desc[2] = 0xffffffff;
@@ -888,24 +776,22 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
			buffers->desc.enabled_mask |= 1llu << bufidx;
		} else {
			/* Clear the descriptor and unset the resource. */
			memset(buffers->desc_data[bufidx], 0,
			memset(buffers->desc.list + bufidx*4, 0,
			       sizeof(uint32_t) * 4);
			pipe_resource_reference(&buffers->buffers[bufidx],
						NULL);
			buffers->desc.enabled_mask &= ~(1llu << bufidx);
		}
		buffers->desc.dirty_mask |= 1llu << bufidx;
	}
	for (; i < old_num_targets; i++) {
		bufidx = SI_SO_BUF_OFFSET + i;
		/* Clear the descriptor and unset the resource. */
		memset(buffers->desc_data[bufidx], 0, sizeof(uint32_t) * 4);
		memset(buffers->desc.list + bufidx*4, 0, sizeof(uint32_t) * 4);
		pipe_resource_reference(&buffers->buffers[bufidx], NULL);
		buffers->desc.enabled_mask &= ~(1llu << bufidx);
		buffers->desc.dirty_mask |= 1llu << bufidx;
	}

	si_update_descriptors(sctx, &buffers->desc);
	buffers->desc.list_dirty = true;
}

static void si_desc_reset_buffer_offset(struct pipe_context *ctx,
@@ -974,22 +860,19 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
	/* Read/Write buffers. */
	for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
		struct si_buffer_resources *buffers = &sctx->rw_buffers[shader];
		bool found = false;
		uint64_t mask = buffers->desc.enabled_mask;

		while (mask) {
			i = u_bit_scan64(&mask);
			if (buffers->buffers[i] == buf) {
				si_desc_reset_buffer_offset(ctx, buffers->desc_data[i],
				si_desc_reset_buffer_offset(ctx, buffers->desc.list + i*4,
							    old_va, buf);
				buffers->desc.list_dirty = true;

				r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
						      rbuffer, buffers->shader_usage,
						      buffers->priority);

				buffers->desc.dirty_mask |= 1llu << i;
				found = true;

				if (i >= SI_SO_BUF_OFFSET && shader == PIPE_SHADER_VERTEX) {
					/* Update the streamout state. */
					if (sctx->b.streamout.begin_emitted) {
@@ -1001,34 +884,25 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
			}
		}
	}
		if (found) {
			si_update_descriptors(sctx, &buffers->desc);
		}
	}

	/* Constant buffers. */
	for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
		struct si_buffer_resources *buffers = &sctx->const_buffers[shader];
		bool found = false;
		uint64_t mask = buffers->desc.enabled_mask;

		while (mask) {
			unsigned i = u_bit_scan64(&mask);
			if (buffers->buffers[i] == buf) {
				si_desc_reset_buffer_offset(ctx, buffers->desc_data[i],
				si_desc_reset_buffer_offset(ctx, buffers->desc.list + i*4,
							    old_va, buf);
				buffers->desc.list_dirty = true;

				r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
						      rbuffer, buffers->shader_usage,
						      buffers->priority);

				buffers->desc.dirty_mask |= 1llu << i;
				found = true;
			}
		}
		if (found) {
			si_update_descriptors(sctx, &buffers->desc);
		}
	}

	/* Texture buffers - update virtual addresses in sampler view descriptors. */
@@ -1040,23 +914,20 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
	/* Texture buffers - update bindings. */
	for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
		struct si_sampler_views *views = &sctx->samplers[shader].views;
		bool found = false;
		uint64_t mask = views->desc.enabled_mask;

		while (mask) {
			unsigned i = u_bit_scan64(&mask);
			if (views->views[i]->texture == buf) {
				si_desc_reset_buffer_offset(ctx, views->desc.list + i*8+4,
							    old_va, buf);
				views->desc.list_dirty = true;

				r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
						      rbuffer, RADEON_USAGE_READ,
						      RADEON_PRIO_SHADER_BUFFER_RO);

				views->desc.dirty_mask |= 1llu << i;
				found = true;
			}
		}
		if (found) {
			si_update_descriptors(sctx, &views->desc);
		}
	}
}
@@ -1297,11 +1168,10 @@ static void si_emit_shader_pointer(struct si_context *sctx,
	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
	uint64_t va;

	if (!desc->pointer_dirty)
	if (!desc->pointer_dirty || !desc->buffer)
		return;

	va = desc->buffer->gpu_address +
	     desc->current_context_id * desc->context_size +
	     desc->buffer_offset;

	radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
@@ -1351,34 +1221,28 @@ static void si_emit_shader_userdata(struct si_context *sctx,
	si_emit_shader_pointer(sctx, &sctx->vertex_buffers, sh_base[PIPE_SHADER_VERTEX], false);
}

/* INIT/DEINIT */
/* INIT/DEINIT/UPLOAD */

void si_init_all_descriptors(struct si_context *sctx)
{
	int i;

	for (i = 0; i < SI_NUM_SHADERS; i++) {
		si_init_buffer_resources(sctx, &sctx->const_buffers[i],
		si_init_buffer_resources(&sctx->const_buffers[i],
					 SI_NUM_CONST_BUFFERS, SI_SGPR_CONST,
					 RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO);
		si_init_buffer_resources(sctx, &sctx->rw_buffers[i],
		si_init_buffer_resources(&sctx->rw_buffers[i],
					 SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS,
					 RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RESOURCE_RW);

		si_init_sampler_views(sctx, &sctx->samplers[i].views);

		si_init_descriptors(sctx, &sctx->samplers[i].states.desc,
				    SI_SGPR_SAMPLER, 4, SI_NUM_SAMPLER_STATES,
				    si_emit_sampler_states);

		sctx->atoms.s.const_buffers[i] = &sctx->const_buffers[i].desc.atom;
		sctx->atoms.s.rw_buffers[i] = &sctx->rw_buffers[i].desc.atom;
		sctx->atoms.s.sampler_views[i] = &sctx->samplers[i].views.desc.atom;
		sctx->atoms.s.sampler_states[i] = &sctx->samplers[i].states.desc.atom;
		si_init_descriptors(&sctx->samplers[i].views.desc,
				    SI_SGPR_RESOURCE, 8, SI_NUM_SAMPLER_VIEWS);
		si_init_descriptors(&sctx->samplers[i].states.desc,
				    SI_SGPR_SAMPLER, 4, SI_NUM_SAMPLER_STATES);
	}

	si_init_descriptors(sctx, &sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFER,
			    4, SI_NUM_VERTEX_BUFFERS, NULL);
	si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFER,
			    4, SI_NUM_VERTEX_BUFFERS);

	/* Set pipe_context functions. */
	sctx->b.b.set_constant_buffer = si_set_constant_buffer;
@@ -1401,6 +1265,20 @@ void si_init_all_descriptors(struct si_context *sctx)
	si_set_user_data_base(sctx, PIPE_SHADER_FRAGMENT, R_00B030_SPI_SHADER_USER_DATA_PS_0);
}

bool si_upload_shader_descriptors(struct si_context *sctx)
{
	int i;

	for (i = 0; i < SI_NUM_SHADERS; i++) {
		if (!si_upload_descriptors(sctx, &sctx->const_buffers[i].desc) ||
		    !si_upload_descriptors(sctx, &sctx->rw_buffers[i].desc) ||
		    !si_upload_descriptors(sctx, &sctx->samplers[i].views.desc) ||
		    !si_upload_descriptors(sctx, &sctx->samplers[i].states.desc))
			return false;
	}
	return si_upload_vertex_buffer_descriptors(sctx);
}

void si_release_all_descriptors(struct si_context *sctx)
{
	int i;
@@ -142,12 +142,6 @@ struct si_context {
	union {
		struct {
			/* The order matters. */
			struct r600_atom *const_buffers[SI_NUM_SHADERS];
			struct r600_atom *rw_buffers[SI_NUM_SHADERS];
			struct r600_atom *sampler_views[SI_NUM_SHADERS];
			struct r600_atom *sampler_states[SI_NUM_SHADERS];
			/* Caches must be flushed after resource descriptors are
			 * updated in memory. */
			struct r600_atom *cache_flush;
			struct r600_atom *streamout_begin;
			struct r600_atom *streamout_enable; /* must be after streamout_begin */
@@ -158,60 +158,48 @@ struct si_shader_data {
#define SI_NUM_VERTEX_BUFFERS 16

/* This represents resource descriptors in memory, such as buffer resources,
/* This represents descriptors in memory, such as buffer resources,
 * image resources, and sampler states.
 */
struct si_descriptors {
	struct r600_atom atom;

	/* The size of one resource descriptor. */
	/* The list of descriptors in malloc'd memory. */
	uint32_t *list;
	/* The size of one descriptor. */
	unsigned element_dw_size;
	/* The maximum number of resource descriptors. */
	/* The maximum number of descriptors. */
	unsigned num_elements;
	/* Whether the list has been changed and should be re-uploaded. */
	bool list_dirty;

	/* The buffer where resource descriptors are stored. */
	/* The buffer where the descriptors have been uploaded. */
	struct r600_resource *buffer;
	unsigned buffer_offset;

	/* The i-th bit is set if that element is dirty (changed but not emitted). */
	uint64_t dirty_mask;
	/* The i-th bit is set if that element is enabled (non-NULL resource). */
	uint64_t enabled_mask;

	/* We can't update descriptors directly because the GPU might be
	 * reading them at the same time, so we have to update them
	 * in a copy-on-write manner. Each such copy is called a context,
	 * which is just another array descriptors in the same buffer. */
	unsigned current_context_id;
	/* The size of a context, should be equal to 4*element_dw_size*num_elements. */
	unsigned context_size;

	/* The shader userdata offset within a shader where the 64-bit pointer to the descriptor
	 * array will be stored. */
	unsigned shader_userdata_offset;
	/* Whether the pointer should be re-emitted. */
	bool pointer_dirty;
};

struct si_sampler_views {
	struct si_descriptors desc;
	struct pipe_sampler_view *views[SI_NUM_SAMPLER_VIEWS];
	uint32_t *desc_data[SI_NUM_SAMPLER_VIEWS];
};

struct si_sampler_states {
	struct si_descriptors desc;
	uint32_t *desc_data[SI_NUM_SAMPLER_STATES];
	void *saved_states[2]; /* saved for u_blitter */
};

struct si_buffer_resources {
	struct si_descriptors desc;
	unsigned num_buffers;
	enum radeon_bo_usage shader_usage; /* READ, WRITE, or READWRITE */
	enum radeon_bo_priority priority;
	struct pipe_resource **buffers; /* this has num_buffers elements */
	uint32_t *desc_storage; /* this has num_buffers*4 elements */
	uint32_t **desc_data; /* an array of pointers pointing to desc_storage */
};

#define si_pm4_block_idx(member) \
@@ -247,13 +235,13 @@ struct si_buffer_resources {
/* si_descriptors.c */
void si_set_sampler_descriptors(struct si_context *sctx, unsigned shader,
				unsigned start, unsigned count, void **states);
void si_update_vertex_buffers(struct si_context *sctx);
void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
			struct pipe_resource *buffer,
			unsigned stride, unsigned num_records,
			bool add_tid, bool swizzle,
			unsigned element_size, unsigned index_stride, uint64_t offset);
void si_init_all_descriptors(struct si_context *sctx);
bool si_upload_shader_descriptors(struct si_context *sctx);
void si_release_all_descriptors(struct si_context *sctx);
void si_all_descriptors_begin_new_cs(struct si_context *sctx);
void si_copy_buffer(struct si_context *sctx,
@@ -743,11 +743,8 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
	sctx->current_rast_prim = info->mode;

	si_update_shaders(sctx);

	if (sctx->vertex_buffers_dirty) {
		si_update_vertex_buffers(sctx);
		sctx->vertex_buffers_dirty = false;
	}
	if (!si_upload_shader_descriptors(sctx))
		return;

	if (info->indexed) {
		/* Initialize the index buffer struct. */