radeonsi: switch to 3-spaces style
Generated automatically using clang-format and the following config:

AlignAfterOpenBracket: true
AlignConsecutiveMacros: true
AllowAllArgumentsOnNextLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: false
AlwaysBreakAfterReturnType: None
BasedOnStyle: LLVM
BraceWrapping:
  AfterControlStatement: false
  AfterEnum: true
  AfterFunction: true
  AfterStruct: false
  BeforeElse: false
  SplitEmptyFunction: true
BinPackArguments: true
BinPackParameters: true
BreakBeforeBraces: Custom
ColumnLimit: 100
ContinuationIndentWidth: 3
Cpp11BracedListStyle: false
ForEachMacros:
  - LIST_FOR_EACH_ENTRY
  - LIST_FOR_EACH_ENTRY_SAFE
  - util_dynarray_foreach
  - nir_foreach_variable
  - nir_foreach_variable_safe
  - nir_foreach_register
  - nir_foreach_register_safe
  - nir_foreach_use
  - nir_foreach_use_safe
  - nir_foreach_if_use
  - nir_foreach_if_use_safe
  - nir_foreach_def
  - nir_foreach_def_safe
  - nir_foreach_phi_src
  - nir_foreach_phi_src_safe
  - nir_foreach_parallel_copy_entry
  - nir_foreach_instr
  - nir_foreach_instr_reverse
  - nir_foreach_instr_safe
  - nir_foreach_instr_reverse_safe
  - nir_foreach_function
  - nir_foreach_block
  - nir_foreach_block_safe
  - nir_foreach_block_reverse
  - nir_foreach_block_reverse_safe
  - nir_foreach_block_in_cf_node
IncludeBlocks: Regroup
IncludeCategories:
  - Regex: '<[[:alnum:].]+>'
    Priority: 2
  - Regex: '.*'
    Priority: 1
IndentWidth: 3
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyExcessCharacter: 100
SpaceAfterCStyleCast: false
SpaceBeforeCpp11BracedList: false
SpaceBeforeCtorInitializerColon: false
SpacesInContainerLiterals: false
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4319>
parent 53e5e802f8
commit d7008fe46a
52 changed files with 37663 additions and 41424 deletions
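For illustration, a minimal hypothetical helper formatted to the configuration above (3-space indentation, 100-column limit, opening brace on its own line after a function definition but not after a control statement). The function example_set_reg is not part of this commit; it only reuses helpers that appear in the diff below:

/* Hypothetical example, formatted per the .clang-format settings listed above. */
static void example_set_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
{
   /* 3-space indent; the brace stays on the control-statement line. */
   if (reg >= SI_CONTEXT_REG_OFFSET)
      radeon_set_context_reg(cs, reg, value);
   else
      radeon_set_config_reg(cs, reg, value);
}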
@@ -1,3 +0,0 @@
[*.{c,h}]
indent_style = tab
indent_size = tab
@@ -1,18 +1,18 @@
// DriConf options specific to radeonsi
DRI_CONF_SECTION_PERFORMANCE
   DRI_CONF_ADAPTIVE_SYNC("true")
   DRI_CONF_RADEONSI_ASSUME_NO_Z_FIGHTS("false")
   DRI_CONF_RADEONSI_COMMUTATIVE_BLEND_ADD("false")
   DRI_CONF_RADEONSI_ZERO_ALL_VRAM_ALLOCS("false")
DRI_CONF_SECTION_END

DRI_CONF_SECTION_DEBUG

//= BEGIN VERBATIM
#define OPT_BOOL(name, dflt, description)                                                          \
   DRI_CONF_OPT_BEGIN_B(radeonsi_##name, #dflt)                                                    \
   DRI_CONF_DESC(en, description)                                                                  \
   DRI_CONF_OPT_END

#include "radeonsi/si_debug_options.h"
//= END VERBATIM
@@ -22,13 +22,13 @@
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_pipe.h"
#include "si_query.h"
#include "sid.h"
#include "util/u_memory.h"
#include "util/u_suballoc.h"

#include <stddef.h>

/**
 * The query buffer is written to by ESGS NGG shaders with statistics about
@@ -39,12 +39,12 @@
 * without additional GPU cost.
 */
struct gfx10_sh_query_buffer {
   struct list_head list;
   struct si_resource *buf;
   unsigned refcount;

   /* Offset into the buffer in bytes; points at the first un-emitted entry. */
   unsigned head;
};

/* Memory layout of the query buffer. Must be kept in sync with shaders
@@ -55,469 +55,454 @@ struct gfx10_sh_query_buffer {
 * of all those values unconditionally.
 */
struct gfx10_sh_query_buffer_mem {
   struct {
      uint64_t generated_primitives_start_dummy;
      uint64_t emitted_primitives_start_dummy;
      uint64_t generated_primitives;
      uint64_t emitted_primitives;
   } stream[4];
   uint32_t fence; /* bottom-of-pipe fence: set to ~0 when draws have finished */
   uint32_t pad[31];
};

/* Shader-based queries. */
struct gfx10_sh_query {
   struct si_query b;

   struct gfx10_sh_query_buffer *first;
   struct gfx10_sh_query_buffer *last;
   unsigned first_begin;
   unsigned last_end;

   unsigned stream;
};

static void emit_shader_query(struct si_context *sctx)
{
   assert(!list_is_empty(&sctx->shader_query_buffers));

   struct gfx10_sh_query_buffer *qbuf =
      list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
   qbuf->head += sizeof(struct gfx10_sh_query_buffer_mem);
}

static void gfx10_release_query_buffers(struct si_context *sctx,
                                        struct gfx10_sh_query_buffer *first,
                                        struct gfx10_sh_query_buffer *last)
{
   while (first) {
      struct gfx10_sh_query_buffer *qbuf = first;
      if (first != last)
         first = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
      else
         first = NULL;

      qbuf->refcount--;
      if (qbuf->refcount)
         continue;

      if (qbuf->list.next == &sctx->shader_query_buffers)
         continue; /* keep the most recent buffer; it may not be full yet */
      if (qbuf->list.prev == &sctx->shader_query_buffers)
         continue; /* keep the oldest buffer for recycling */

      list_del(&qbuf->list);
      si_resource_reference(&qbuf->buf, NULL);
      FREE(qbuf);
   }
}

static bool gfx10_alloc_query_buffer(struct si_context *sctx)
{
   if (si_is_atom_dirty(sctx, &sctx->atoms.s.shader_query))
      return true;

   struct gfx10_sh_query_buffer *qbuf = NULL;

   if (!list_is_empty(&sctx->shader_query_buffers)) {
      qbuf = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
      if (qbuf->head + sizeof(struct gfx10_sh_query_buffer_mem) <= qbuf->buf->b.b.width0)
         goto success;

      qbuf = list_first_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
      if (!qbuf->refcount &&
          !si_rings_is_buffer_referenced(sctx, qbuf->buf->buf, RADEON_USAGE_READWRITE) &&
          sctx->ws->buffer_wait(qbuf->buf->buf, 0, RADEON_USAGE_READWRITE)) {
         /* Can immediately re-use the oldest buffer */
         list_del(&qbuf->list);
      } else {
         qbuf = NULL;
      }
   }

   if (!qbuf) {
      qbuf = CALLOC_STRUCT(gfx10_sh_query_buffer);
      if (unlikely(!qbuf))
         return false;

      struct si_screen *screen = sctx->screen;
      unsigned buf_size =
         MAX2(sizeof(struct gfx10_sh_query_buffer_mem), screen->info.min_alloc_size);
      qbuf->buf = si_resource(pipe_buffer_create(&screen->b, 0, PIPE_USAGE_STAGING, buf_size));
      if (unlikely(!qbuf->buf)) {
         FREE(qbuf);
         return false;
      }
   }

   /* The buffer is currently unused by the GPU. Initialize it.
    *
    * We need to set the high bit of all the primitive counters for
    * compatibility with the SET_PREDICATION packet.
    */
   uint64_t *results = sctx->ws->buffer_map(qbuf->buf->buf, NULL,
                                            PIPE_TRANSFER_WRITE | PIPE_TRANSFER_UNSYNCHRONIZED);
   assert(results);

   for (unsigned i = 0, e = qbuf->buf->b.b.width0 / sizeof(struct gfx10_sh_query_buffer_mem); i < e;
        ++i) {
      for (unsigned j = 0; j < 16; ++j)
         results[32 * i + j] = (uint64_t)1 << 63;
      results[32 * i + 16] = 0;
   }

   list_addtail(&qbuf->list, &sctx->shader_query_buffers);
   qbuf->head = 0;
   qbuf->refcount = sctx->num_active_shader_queries;

success:;
   struct pipe_shader_buffer sbuf;
   sbuf.buffer = &qbuf->buf->b.b;
   sbuf.buffer_offset = qbuf->head;
   sbuf.buffer_size = sizeof(struct gfx10_sh_query_buffer_mem);
   si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, &sbuf);
   sctx->current_vs_state |= S_VS_STATE_STREAMOUT_QUERY_ENABLED(1);

   si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_query);
   return true;
}

static void gfx10_sh_query_destroy(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
   gfx10_release_query_buffers(sctx, query->first, query->last);
   FREE(query);
}

static bool gfx10_sh_query_begin(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;

   gfx10_release_query_buffers(sctx, query->first, query->last);
   query->first = query->last = NULL;

   if (unlikely(!gfx10_alloc_query_buffer(sctx)))
      return false;

   query->first = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
   query->first_begin = query->first->head;

   sctx->num_active_shader_queries++;
   query->first->refcount++;

   return true;
}

static bool gfx10_sh_query_end(struct si_context *sctx, struct si_query *rquery)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;

   if (unlikely(!query->first))
      return false; /* earlier out of memory error */

   query->last = list_last_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
   query->last_end = query->last->head;

   /* Signal the fence of the previous chunk */
   if (query->last_end != 0) {
      uint64_t fence_va = query->last->buf->gpu_address;
      fence_va += query->last_end - sizeof(struct gfx10_sh_query_buffer_mem);
      fence_va += offsetof(struct gfx10_sh_query_buffer_mem, fence);
      si_cp_release_mem(sctx, sctx->gfx_cs, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM,
                        EOP_INT_SEL_NONE, EOP_DATA_SEL_VALUE_32BIT, query->last->buf, fence_va,
                        0xffffffff, PIPE_QUERY_GPU_FINISHED);
   }

   sctx->num_active_shader_queries--;

   if (sctx->num_active_shader_queries > 0) {
      gfx10_alloc_query_buffer(sctx);
   } else {
      si_set_rw_shader_buffer(sctx, GFX10_GS_QUERY_BUF, NULL);
      sctx->current_vs_state &= C_VS_STATE_STREAMOUT_QUERY_ENABLED;

      /* If a query_begin is followed by a query_end without a draw
       * in-between, we need to clear the atom to ensure that the
       * next query_begin will re-initialize the shader buffer. */
      si_set_atom_dirty(sctx, &sctx->atoms.s.shader_query, false);
   }

   return true;
}

static void gfx10_sh_query_add_result(struct gfx10_sh_query *query,
                                      struct gfx10_sh_query_buffer_mem *qmem,
                                      union pipe_query_result *result)
{
   static const uint64_t mask = ((uint64_t)1 << 63) - 1;

   switch (query->b.type) {
   case PIPE_QUERY_PRIMITIVES_EMITTED:
      result->u64 += qmem->stream[query->stream].emitted_primitives & mask;
      break;
   case PIPE_QUERY_PRIMITIVES_GENERATED:
      result->u64 += qmem->stream[query->stream].generated_primitives & mask;
      break;
   case PIPE_QUERY_SO_STATISTICS:
      result->so_statistics.num_primitives_written +=
         qmem->stream[query->stream].emitted_primitives & mask;
      result->so_statistics.primitives_storage_needed +=
         qmem->stream[query->stream].generated_primitives & mask;
      break;
   case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
      result->b |= qmem->stream[query->stream].emitted_primitives !=
                   qmem->stream[query->stream].generated_primitives;
      break;
   case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
      for (unsigned stream = 0; stream < SI_MAX_STREAMS; ++stream) {
         result->b |= qmem->stream[query->stream].emitted_primitives !=
                      qmem->stream[query->stream].generated_primitives;
      }
      break;
   default:
      assert(0);
   }
}

static bool gfx10_sh_query_get_result(struct si_context *sctx, struct si_query *rquery, bool wait,
                                      union pipe_query_result *result)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;

   util_query_clear_result(result, query->b.type);

   if (unlikely(!query->first))
      return false; /* earlier out of memory error */
   assert(query->last);

   for (struct gfx10_sh_query_buffer *qbuf = query->last;;
        qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.prev, list)) {
      unsigned usage = PIPE_TRANSFER_READ | (wait ? 0 : PIPE_TRANSFER_DONTBLOCK);
      void *map;

      if (rquery->b.flushed)
         map = sctx->ws->buffer_map(qbuf->buf->buf, NULL, usage);
      else
         map = si_buffer_map_sync_with_rings(sctx, qbuf->buf, usage);

      if (!map)
         return false;

      unsigned results_begin = 0;
      unsigned results_end = qbuf->head;
      if (qbuf == query->first)
         results_begin = query->first_begin;
      if (qbuf == query->last)
         results_end = query->last_end;

      while (results_begin != results_end) {
         struct gfx10_sh_query_buffer_mem *qmem = map + results_begin;
         results_begin += sizeof(*qmem);

         gfx10_sh_query_add_result(query, qmem, result);
      }

      if (qbuf == query->first)
         break;
   }

   return true;
}

static void gfx10_sh_query_get_result_resource(struct si_context *sctx, struct si_query *rquery,
                                               bool wait, enum pipe_query_value_type result_type,
                                               int index, struct pipe_resource *resource,
                                               unsigned offset)
{
   struct gfx10_sh_query *query = (struct gfx10_sh_query *)rquery;
   struct si_qbo_state saved_state = {};
   struct pipe_resource *tmp_buffer = NULL;
   unsigned tmp_buffer_offset = 0;

   if (!sctx->sh_query_result_shader) {
      sctx->sh_query_result_shader = gfx10_create_sh_query_result_cs(sctx);
      if (!sctx->sh_query_result_shader)
         return;
   }

   if (query->first != query->last) {
      u_suballocator_alloc(sctx->allocator_zeroed_memory, 16, 16, &tmp_buffer_offset, &tmp_buffer);
      if (!tmp_buffer)
         return;
   }

   si_save_qbo_state(sctx, &saved_state);

   /* Pre-fill the constants configuring the shader behavior. */
   struct {
      uint32_t config;
      uint32_t offset;
      uint32_t chain;
      uint32_t result_count;
   } consts;
   struct pipe_constant_buffer constant_buffer = {};

   if (index >= 0) {
      switch (query->b.type) {
      case PIPE_QUERY_PRIMITIVES_GENERATED:
         consts.offset = sizeof(uint32_t) * query->stream;
         consts.config = 0;
         break;
      case PIPE_QUERY_PRIMITIVES_EMITTED:
         consts.offset = sizeof(uint32_t) * (4 + query->stream);
         consts.config = 0;
         break;
      case PIPE_QUERY_SO_STATISTICS:
         consts.offset = sizeof(uint32_t) * (4 * index + query->stream);
         consts.config = 0;
         break;
      case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
         consts.offset = sizeof(uint32_t) * query->stream;
         consts.config = 2;
         break;
      case PIPE_QUERY_SO_OVERFLOW_ANY_PREDICATE:
         consts.offset = 0;
         consts.config = 3;
         break;
      default:
         unreachable("bad query type");
      }
   } else {
      /* Check result availability. */
      consts.offset = 0;
      consts.config = 1;
   }

   if (result_type == PIPE_QUERY_TYPE_I64 || result_type == PIPE_QUERY_TYPE_U64)
      consts.config |= 8;

   constant_buffer.buffer_size = sizeof(consts);
   constant_buffer.user_buffer = &consts;

   /* Pre-fill the SSBOs and grid. */
   struct pipe_shader_buffer ssbo[3];
   struct pipe_grid_info grid = {};

   ssbo[1].buffer = tmp_buffer;
   ssbo[1].buffer_offset = tmp_buffer_offset;
   ssbo[1].buffer_size = 16;

   ssbo[2] = ssbo[1];

   sctx->b.bind_compute_state(&sctx->b, sctx->sh_query_result_shader);

   grid.block[0] = 1;
   grid.block[1] = 1;
   grid.block[2] = 1;
   grid.grid[0] = 1;
   grid.grid[1] = 1;
   grid.grid[2] = 1;

   struct gfx10_sh_query_buffer *qbuf = query->first;
   for (;;) {
      unsigned begin = qbuf == query->first ? query->first_begin : 0;
      unsigned end = qbuf == query->last ? query->last_end : qbuf->buf->b.b.width0;
      if (!end)
         continue;

      ssbo[0].buffer = &qbuf->buf->b.b;
      ssbo[0].buffer_offset = begin;
      ssbo[0].buffer_size = end - begin;

      consts.result_count = (end - begin) / sizeof(struct gfx10_sh_query_buffer_mem);
      consts.chain = 0;
      if (qbuf != query->first)
         consts.chain |= 1;
      if (qbuf != query->last)
         consts.chain |= 2;

      if (qbuf == query->last) {
         ssbo[2].buffer = resource;
         ssbo[2].buffer_offset = offset;
         ssbo[2].buffer_size = 8;
      }

      sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &constant_buffer);
      sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo, 0x6);

      if (wait) {
         uint64_t va;

         /* Wait for result availability. Wait only for readiness
          * of the last entry, since the fence writes should be
          * serialized in the CP.
          */
         va = qbuf->buf->gpu_address;
         va += end - sizeof(struct gfx10_sh_query_buffer_mem);
         va += offsetof(struct gfx10_sh_query_buffer_mem, fence);

         si_cp_wait_mem(sctx, sctx->gfx_cs, va, 0x00000001, 0x00000001, 0);
      }

      sctx->b.launch_grid(&sctx->b, &grid);
      sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;

      if (qbuf == query->last)
         break;
      qbuf = LIST_ENTRY(struct gfx10_sh_query_buffer, qbuf->list.next, list);
   }

   si_restore_qbo_state(sctx, &saved_state);
   pipe_resource_reference(&tmp_buffer, NULL);
}

static const struct si_query_ops gfx10_sh_query_ops = {
   .destroy = gfx10_sh_query_destroy,
   .begin = gfx10_sh_query_begin,
   .end = gfx10_sh_query_end,
   .get_result = gfx10_sh_query_get_result,
   .get_result_resource = gfx10_sh_query_get_result_resource,
};

struct pipe_query *gfx10_sh_query_create(struct si_screen *screen, enum pipe_query_type query_type,
                                         unsigned index)
{
   struct gfx10_sh_query *query = CALLOC_STRUCT(gfx10_sh_query);
   if (unlikely(!query))
      return NULL;

   query->b.ops = &gfx10_sh_query_ops;
   query->b.type = query_type;
   query->stream = index;

   return (struct pipe_query *)query;
}

void gfx10_init_query(struct si_context *sctx)
{
   list_inithead(&sctx->shader_query_buffers);
   sctx->atoms.s.shader_query.emit = emit_shader_query;
}

void gfx10_destroy_query(struct si_context *sctx)
{
   while (!list_is_empty(&sctx->shader_query_buffers)) {
      struct gfx10_sh_query_buffer *qbuf =
         list_first_entry(&sctx->shader_query_buffers, struct gfx10_sh_query_buffer, list);
      list_del(&qbuf->list);

      assert(!qbuf->refcount);
      si_resource_reference(&qbuf->buf, NULL);
      FREE(qbuf);
   }
}
@@ -34,131 +34,128 @@

static inline void radeon_set_config_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
{
   assert(reg < SI_CONTEXT_REG_OFFSET);
   assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
   radeon_emit(cs, PKT3(PKT3_SET_CONFIG_REG, num, 0));
   radeon_emit(cs, (reg - SI_CONFIG_REG_OFFSET) >> 2);
}

static inline void radeon_set_config_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
{
   radeon_set_config_reg_seq(cs, reg, 1);
   radeon_emit(cs, value);
}

static inline void radeon_set_context_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
{
   assert(reg >= SI_CONTEXT_REG_OFFSET);
   assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
   radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, num, 0));
   radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
}

static inline void radeon_set_context_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
{
   radeon_set_context_reg_seq(cs, reg, 1);
   radeon_emit(cs, value);
}

static inline void radeon_set_context_reg_idx(struct radeon_cmdbuf *cs, unsigned reg, unsigned idx,
                                              unsigned value)
{
   assert(reg >= SI_CONTEXT_REG_OFFSET);
   assert(cs->current.cdw + 3 <= cs->current.max_dw);
   radeon_emit(cs, PKT3(PKT3_SET_CONTEXT_REG, 1, 0));
   radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2 | (idx << 28));
   radeon_emit(cs, value);
}

static inline void radeon_set_sh_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
{
   assert(reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END);
   assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
   radeon_emit(cs, PKT3(PKT3_SET_SH_REG, num, 0));
   radeon_emit(cs, (reg - SI_SH_REG_OFFSET) >> 2);
}

static inline void radeon_set_sh_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
{
   radeon_set_sh_reg_seq(cs, reg, 1);
   radeon_emit(cs, value);
}

static inline void radeon_set_uconfig_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
{
   assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END);
   assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
   radeon_emit(cs, PKT3(PKT3_SET_UCONFIG_REG, num, 0));
   radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2);
}

static inline void radeon_set_uconfig_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
{
   radeon_set_uconfig_reg_seq(cs, reg, 1);
   radeon_emit(cs, value);
}

static inline void radeon_set_uconfig_reg_idx(struct radeon_cmdbuf *cs, struct si_screen *screen,
                                              unsigned reg, unsigned idx, unsigned value)
{
   assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END);
   assert(cs->current.cdw + 3 <= cs->current.max_dw);
   assert(idx != 0);
   unsigned opcode = PKT3_SET_UCONFIG_REG_INDEX;
   if (screen->info.chip_class < GFX9 ||
       (screen->info.chip_class == GFX9 && screen->info.me_fw_version < 26))
      opcode = PKT3_SET_UCONFIG_REG;
   radeon_emit(cs, PKT3(opcode, 1, 0));
   radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2 | (idx << 28));
   radeon_emit(cs, value);
}

static inline void radeon_set_context_reg_rmw(struct radeon_cmdbuf *cs, unsigned reg,
                                              unsigned value, unsigned mask)
{
   assert(reg >= SI_CONTEXT_REG_OFFSET);
   assert(cs->current.cdw + 4 <= cs->current.max_dw);
   radeon_emit(cs, PKT3(PKT3_CONTEXT_REG_RMW, 2, 0));
   radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
   radeon_emit(cs, mask);
   radeon_emit(cs, value);
}

/* Emit PKT3_CONTEXT_REG_RMW if the register value is different. */
static inline void radeon_opt_set_context_reg_rmw(struct si_context *sctx, unsigned offset,
                                                  enum si_tracked_reg reg, unsigned value,
                                                  unsigned mask)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;

   assert((value & ~mask) == 0);
   value &= mask;

   if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 ||
       sctx->tracked_regs.reg_value[reg] != value) {
      radeon_set_context_reg_rmw(cs, offset, value, mask);

      sctx->tracked_regs.reg_saved |= 0x1ull << reg;
      sctx->tracked_regs.reg_value[reg] = value;
   }
}

/* Emit PKT3_SET_CONTEXT_REG if the register value is different. */
static inline void radeon_opt_set_context_reg(struct si_context *sctx, unsigned offset,
                                              enum si_tracked_reg reg, unsigned value)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;

   if (((sctx->tracked_regs.reg_saved >> reg) & 0x1) != 0x1 ||
       sctx->tracked_regs.reg_value[reg] != value) {
      radeon_set_context_reg(cs, offset, value);

      sctx->tracked_regs.reg_saved |= 0x1ull << reg;
      sctx->tracked_regs.reg_value[reg] = value;
   }
}

/**
@@ -168,98 +165,96 @@ static inline void radeon_opt_set_context_reg(struct si_context *sctx, unsigned
 * @param value2 is written to second register
 */
static inline void radeon_opt_set_context_reg2(struct si_context *sctx, unsigned offset,
                                               enum si_tracked_reg reg, unsigned value1,
                                               unsigned value2)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;

   if (((sctx->tracked_regs.reg_saved >> reg) & 0x3) != 0x3 ||
       sctx->tracked_regs.reg_value[reg] != value1 ||
       sctx->tracked_regs.reg_value[reg + 1] != value2) {
      radeon_set_context_reg_seq(cs, offset, 2);
      radeon_emit(cs, value1);
      radeon_emit(cs, value2);

      sctx->tracked_regs.reg_value[reg] = value1;
      sctx->tracked_regs.reg_value[reg + 1] = value2;
      sctx->tracked_regs.reg_saved |= 0x3ull << reg;
   }
}

/**
 * Set 3 consecutive registers if any registers value is different.
 */
static inline void radeon_opt_set_context_reg3(struct si_context *sctx, unsigned offset,
                                               enum si_tracked_reg reg, unsigned value1,
                                               unsigned value2, unsigned value3)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;

   if (((sctx->tracked_regs.reg_saved >> reg) & 0x7) != 0x7 ||
       sctx->tracked_regs.reg_value[reg] != value1 ||
       sctx->tracked_regs.reg_value[reg + 1] != value2 ||
       sctx->tracked_regs.reg_value[reg + 2] != value3) {
      radeon_set_context_reg_seq(cs, offset, 3);
      radeon_emit(cs, value1);
      radeon_emit(cs, value2);
      radeon_emit(cs, value3);

      sctx->tracked_regs.reg_value[reg] = value1;
      sctx->tracked_regs.reg_value[reg + 1] = value2;
      sctx->tracked_regs.reg_value[reg + 2] = value3;
      sctx->tracked_regs.reg_saved |= 0x7ull << reg;
   }
}

/**
 * Set 4 consecutive registers if any registers value is different.
 */
static inline void radeon_opt_set_context_reg4(struct si_context *sctx, unsigned offset,
                                               enum si_tracked_reg reg, unsigned value1,
                                               unsigned value2, unsigned value3, unsigned value4)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;

   if (((sctx->tracked_regs.reg_saved >> reg) & 0xf) != 0xf ||
       sctx->tracked_regs.reg_value[reg] != value1 ||
       sctx->tracked_regs.reg_value[reg + 1] != value2 ||
       sctx->tracked_regs.reg_value[reg + 2] != value3 ||
       sctx->tracked_regs.reg_value[reg + 3] != value4) {
      radeon_set_context_reg_seq(cs, offset, 4);
      radeon_emit(cs, value1);
      radeon_emit(cs, value2);
      radeon_emit(cs, value3);
      radeon_emit(cs, value4);

      sctx->tracked_regs.reg_value[reg] = value1;
      sctx->tracked_regs.reg_value[reg + 1] = value2;
      sctx->tracked_regs.reg_value[reg + 2] = value3;
      sctx->tracked_regs.reg_value[reg + 3] = value4;
      sctx->tracked_regs.reg_saved |= 0xfull << reg;
   }
}

/**
 * Set consecutive registers if any registers value is different.
 */
static inline void radeon_opt_set_context_regn(struct si_context *sctx, unsigned offset,
                                               unsigned *value, unsigned *saved_val, unsigned num)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;
   int i, j;

   for (i = 0; i < num; i++) {
      if (saved_val[i] != value[i]) {
         radeon_set_context_reg_seq(cs, offset, num);
         for (j = 0; j < num; j++)
            radeon_emit(cs, value[j]);

         memcpy(saved_val, value, sizeof(uint32_t) * num);
         break;
      }
   }
}

#endif
@@ -25,35 +25,33 @@
#ifndef SI_COMPUTE_H
#define SI_COMPUTE_H

#include "si_shader.h"
#include "util/u_inlines.h"

struct si_compute {
   struct si_shader_selector sel;
   struct si_shader shader;

   unsigned ir_type;
   unsigned local_size;
   unsigned private_size;
   unsigned input_size;

   int max_global_buffers;
   struct pipe_resource **global_buffers;

   bool reads_variable_block_size;
   unsigned num_cs_user_data_dwords;
};

void si_destroy_compute(struct si_compute *program);

static inline void si_compute_reference(struct si_compute **dst, struct si_compute *src)
{
   if (pipe_reference(&(*dst)->sel.base.reference, &src->sel.base.reference))
      si_destroy_compute(*dst);

   *dst = src;
}

#endif /* SI_COMPUTE_H */
@@ -1,9 +1,11 @@
OPT_BOOL(aux_debug, false, "Generate ddebug_dumps for the auxiliary context")
OPT_BOOL(sync_compile, false, "Always compile synchronously (will cause stalls)")
OPT_BOOL(dump_shader_binary, false, "Dump shader binary as part of ddebug_dumps")
OPT_BOOL(debug_disassembly, false,
         "Report shader disassembly as part of driver debug messages (for shader db)")
OPT_BOOL(halt_shaders, false, "Halt shaders at the start (will hang)")
OPT_BOOL(vs_fetch_always_opencode, false,
         "Always open code vertex fetches (less efficient, purely for testing)")
OPT_BOOL(prim_restart_tri_strips_only, false, "Only enable primitive restart for triangle strips")

#undef OPT_BOOL
@ -27,304 +27,279 @@
|
|||
|
||||
static void si_dma_emit_wait_idle(struct si_context *sctx)
|
||||
{
|
||||
struct radeon_cmdbuf *cs = sctx->sdma_cs;
|
||||
struct radeon_cmdbuf *cs = sctx->sdma_cs;
|
||||
|
||||
/* NOP waits for idle. */
|
||||
if (sctx->chip_class >= GFX7)
|
||||
radeon_emit(cs, 0x00000000); /* NOP */
|
||||
else
|
||||
radeon_emit(cs, 0xf0000000); /* NOP */
|
||||
/* NOP waits for idle. */
|
||||
if (sctx->chip_class >= GFX7)
|
||||
radeon_emit(cs, 0x00000000); /* NOP */
|
||||
else
|
||||
radeon_emit(cs, 0xf0000000); /* NOP */
|
||||
}
|
||||
|
||||
void si_dma_emit_timestamp(struct si_context *sctx, struct si_resource *dst,
|
||||
uint64_t offset)
|
||||
void si_dma_emit_timestamp(struct si_context *sctx, struct si_resource *dst, uint64_t offset)
|
||||
{
|
||||
struct radeon_cmdbuf *cs = sctx->sdma_cs;
|
||||
uint64_t va = dst->gpu_address + offset;
|
||||
struct radeon_cmdbuf *cs = sctx->sdma_cs;
|
||||
uint64_t va = dst->gpu_address + offset;
|
||||
|
||||
if (sctx->chip_class == GFX6) {
|
||||
unreachable("SI DMA doesn't support the timestamp packet.");
|
||||
return;
|
||||
}
|
||||
if (sctx->chip_class == GFX6) {
|
||||
unreachable("SI DMA doesn't support the timestamp packet.");
|
||||
return;
|
||||
}
|
||||
|
||||
/* Mark the buffer range of destination as valid (initialized),
|
||||
* so that transfer_map knows it should wait for the GPU when mapping
|
||||
* that range. */
|
||||
util_range_add(&dst->b.b, &dst->valid_buffer_range, offset, offset + 8);
|
||||
/* Mark the buffer range of destination as valid (initialized),
|
||||
* so that transfer_map knows it should wait for the GPU when mapping
|
||||
* that range. */
|
||||
util_range_add(&dst->b.b, &dst->valid_buffer_range, offset, offset + 8);
|
||||
|
||||
assert(va % 8 == 0);
|
||||
assert(va % 8 == 0);
|
||||
|
||||
si_need_dma_space(sctx, 4, dst, NULL);
|
||||
si_dma_emit_wait_idle(sctx);
|
||||
si_need_dma_space(sctx, 4, dst, NULL);
|
||||
si_dma_emit_wait_idle(sctx);
|
||||
|
||||
radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_TIMESTAMP,
|
||||
SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP,
|
||||
0));
|
||||
radeon_emit(cs, va);
|
||||
radeon_emit(cs, va >> 32);
|
||||
radeon_emit(
|
||||
cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_TIMESTAMP, SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP, 0));
|
||||
radeon_emit(cs, va);
|
||||
radeon_emit(cs, va >> 32);
|
||||
}
|
||||
|
||||
void si_sdma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset,
                          uint64_t size, unsigned clear_value)
{
   struct radeon_cmdbuf *cs = sctx->sdma_cs;
   unsigned i, ncopy, csize;
   struct si_resource *sdst = si_resource(dst);

   assert(offset % 4 == 0);
   assert(size);
   assert(size % 4 == 0);

   if (!cs || dst->flags & PIPE_RESOURCE_FLAG_SPARSE ||
       sctx->screen->debug_flags & DBG(NO_SDMA_CLEARS)) {
      sctx->b.clear_buffer(&sctx->b, dst, offset, size, &clear_value, 4);
      return;
   }

   /* Mark the buffer range of destination as valid (initialized),
    * so that transfer_map knows it should wait for the GPU when mapping
    * that range. */
   util_range_add(dst, &sdst->valid_buffer_range, offset, offset + size);

   offset += sdst->gpu_address;

   if (sctx->chip_class == GFX6) {
      /* the same maximum size as for copying */
      ncopy = DIV_ROUND_UP(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE);
      si_need_dma_space(sctx, ncopy * 4, sdst, NULL);

      for (i = 0; i < ncopy; i++) {
         csize = MIN2(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE);
         radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_CONSTANT_FILL, 0, csize / 4));
         radeon_emit(cs, offset);
         radeon_emit(cs, clear_value);
         radeon_emit(cs, (offset >> 32) << 16);
         offset += csize;
         size -= csize;
      }
      return;
   }

   /* The following code is for Sea Islands and later. */
   /* the same maximum size as for copying */
   ncopy = DIV_ROUND_UP(size, CIK_SDMA_COPY_MAX_SIZE);
   si_need_dma_space(sctx, ncopy * 5, sdst, NULL);

   for (i = 0; i < ncopy; i++) {
      csize = MIN2(size, CIK_SDMA_COPY_MAX_SIZE);
      radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_PACKET_CONSTANT_FILL, 0, 0x8000 /* dword copy */));
      radeon_emit(cs, offset);
      radeon_emit(cs, offset >> 32);
      radeon_emit(cs, clear_value);
      /* dw count */
      radeon_emit(cs, (sctx->chip_class >= GFX9 ? csize - 1 : csize) & 0xfffffffc);
      offset += csize;
      size -= csize;
   }
}

void si_sdma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
                         struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset,
                         uint64_t size)
{
   struct radeon_cmdbuf *cs = sctx->sdma_cs;
   unsigned i, ncopy, csize;
   struct si_resource *sdst = si_resource(dst);
   struct si_resource *ssrc = si_resource(src);

   if (!cs || dst->flags & PIPE_RESOURCE_FLAG_SPARSE || src->flags & PIPE_RESOURCE_FLAG_SPARSE) {
      si_copy_buffer(sctx, dst, src, dst_offset, src_offset, size);
      return;
   }

   /* Mark the buffer range of destination as valid (initialized),
    * so that transfer_map knows it should wait for the GPU when mapping
    * that range. */
   util_range_add(dst, &sdst->valid_buffer_range, dst_offset, dst_offset + size);

   dst_offset += sdst->gpu_address;
   src_offset += ssrc->gpu_address;

   if (sctx->chip_class == GFX6) {
      unsigned max_size, sub_cmd, shift;

      /* see whether we should use the dword-aligned or byte-aligned copy */
      if (!(dst_offset % 4) && !(src_offset % 4) && !(size % 4)) {
         sub_cmd = SI_DMA_COPY_DWORD_ALIGNED;
         shift = 2;
         max_size = SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE;
      } else {
         sub_cmd = SI_DMA_COPY_BYTE_ALIGNED;
         shift = 0;
         max_size = SI_DMA_COPY_MAX_BYTE_ALIGNED_SIZE;
      }

      ncopy = DIV_ROUND_UP(size, max_size);
      si_need_dma_space(sctx, ncopy * 5, sdst, ssrc);

      for (i = 0; i < ncopy; i++) {
         csize = MIN2(size, max_size);
         radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_COPY, sub_cmd, csize >> shift));
         radeon_emit(cs, dst_offset);
         radeon_emit(cs, src_offset);
         radeon_emit(cs, (dst_offset >> 32UL) & 0xff);
         radeon_emit(cs, (src_offset >> 32UL) & 0xff);
         dst_offset += csize;
         src_offset += csize;
         size -= csize;
      }
      return;
   }

   /* The following code is for CI and later. */
   unsigned align = ~0u;
   ncopy = DIV_ROUND_UP(size, CIK_SDMA_COPY_MAX_SIZE);

   /* Align copy size to dw if src/dst address are dw aligned */
   if ((src_offset & 0x3) == 0 && (dst_offset & 0x3) == 0 && size > 4 && (size & 3) != 0) {
      align = ~0x3u;
      ncopy++;
   }

   si_need_dma_space(sctx, ncopy * 7, sdst, ssrc);

   for (i = 0; i < ncopy; i++) {
      csize = size >= 4 ? MIN2(size & align, CIK_SDMA_COPY_MAX_SIZE) : size;
      radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, CIK_SDMA_COPY_SUB_OPCODE_LINEAR, 0));
      radeon_emit(cs, sctx->chip_class >= GFX9 ? csize - 1 : csize);
      radeon_emit(cs, 0); /* src/dst endian swap */
      radeon_emit(cs, src_offset);
      radeon_emit(cs, src_offset >> 32);
      radeon_emit(cs, dst_offset);
      radeon_emit(cs, dst_offset >> 32);
      dst_offset += csize;
      src_offset += csize;
      size -= csize;
   }
}

void si_need_dma_space(struct si_context *ctx, unsigned num_dw, struct si_resource *dst,
                       struct si_resource *src)
{
   struct radeon_winsys *ws = ctx->ws;
   uint64_t vram = ctx->sdma_cs->used_vram;
   uint64_t gtt = ctx->sdma_cs->used_gart;

   if (dst) {
      vram += dst->vram_usage;
      gtt += dst->gart_usage;
   }
   if (src) {
      vram += src->vram_usage;
      gtt += src->gart_usage;
   }

   /* Flush the GFX IB if DMA depends on it. */
   if (!ctx->sdma_uploads_in_progress && radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
       ((dst && ws->cs_is_buffer_referenced(ctx->gfx_cs, dst->buf, RADEON_USAGE_READWRITE)) ||
        (src && ws->cs_is_buffer_referenced(ctx->gfx_cs, src->buf, RADEON_USAGE_WRITE))))
      si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);

   /* Flush if there's not enough space, or if the memory usage per IB
    * is too large.
    *
    * IBs using too little memory are limited by the IB submission overhead.
    * IBs using too much memory are limited by the kernel/TTM overhead.
    * Too long IBs create CPU-GPU pipeline bubbles and add latency.
    *
    * This heuristic makes sure that DMA requests are executed
    * very soon after the call is made and lowers memory usage.
    * It improves texture upload performance by keeping the DMA
    * engine busy while uploads are being submitted.
    */
   num_dw++; /* for emit_wait_idle below */
   if (!ctx->sdma_uploads_in_progress &&
       (!ws->cs_check_space(ctx->sdma_cs, num_dw, false) ||
        ctx->sdma_cs->used_vram + ctx->sdma_cs->used_gart > 64 * 1024 * 1024 ||
        !radeon_cs_memory_below_limit(ctx->screen, ctx->sdma_cs, vram, gtt))) {
      si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
      assert((num_dw + ctx->sdma_cs->current.cdw) <= ctx->sdma_cs->current.max_dw);
   }

   /* Wait for idle if either buffer has been used in the IB before to
    * prevent read-after-write hazards.
    */
   if ((dst && ws->cs_is_buffer_referenced(ctx->sdma_cs, dst->buf, RADEON_USAGE_READWRITE)) ||
       (src && ws->cs_is_buffer_referenced(ctx->sdma_cs, src->buf, RADEON_USAGE_WRITE)))
      si_dma_emit_wait_idle(ctx);

   unsigned sync = ctx->sdma_uploads_in_progress ? 0 : RADEON_USAGE_SYNCHRONIZED;
   if (dst) {
      ws->cs_add_buffer(ctx->sdma_cs, dst->buf, RADEON_USAGE_WRITE | sync, dst->domains, 0);
   }
   if (src) {
      ws->cs_add_buffer(ctx->sdma_cs, src->buf, RADEON_USAGE_READ | sync, src->domains, 0);
   }

   /* this function is called before all DMA calls, so increment this. */
   ctx->num_dma_calls++;
}

void si_flush_dma_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence)
{
   struct radeon_cmdbuf *cs = ctx->sdma_cs;
   struct radeon_saved_cs saved;
   bool check_vm = (ctx->screen->debug_flags & DBG(CHECK_VM)) != 0;

   if (!radeon_emitted(cs, 0)) {
      if (fence)
         ctx->ws->fence_reference(fence, ctx->last_sdma_fence);
      return;
   }

   if (check_vm)
      si_save_cs(ctx->ws, cs, &saved, true);

   ctx->ws->cs_flush(cs, flags, &ctx->last_sdma_fence);
   if (fence)
      ctx->ws->fence_reference(fence, ctx->last_sdma_fence);

   if (check_vm) {
      /* Use conservative timeout 800ms, after which we won't wait any
       * longer and assume the GPU is hung.
       */
      ctx->ws->fence_wait(ctx->ws, ctx->last_sdma_fence, 800 * 1000 * 1000);

      si_check_vm_faults(ctx, &saved, RING_DMA);
      si_clear_saved_cs(&saved);
   }
}

void si_screen_clear_buffer(struct si_screen *sscreen, struct pipe_resource *dst, uint64_t offset,
                            uint64_t size, unsigned value)
{
   struct si_context *ctx = (struct si_context *)sscreen->aux_context;

   simple_mtx_lock(&sscreen->aux_context_lock);
   si_sdma_clear_buffer(ctx, dst, offset, size, value);
   sscreen->aux_context->flush(sscreen->aux_context, NULL, 0);
   simple_mtx_unlock(&sscreen->aux_context_lock);
}

File diff suppressed because it is too large
Load diff

@@ -23,516 +23,499 @@
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_build_pm4.h"
#include "si_pipe.h"
#include "sid.h"

#include "util/os_time.h"
#include "util/u_upload_mgr.h"

/* initialize */
void si_need_gfx_cs_space(struct si_context *ctx)
{
   struct radeon_cmdbuf *cs = ctx->gfx_cs;

   /* There is no need to flush the DMA IB here, because
    * si_need_dma_space always flushes the GFX IB if there is
    * a conflict, which means any unflushed DMA commands automatically
    * precede the GFX IB (= they had no dependency on the GFX IB when
    * they were submitted).
    */

   /* There are two memory usage counters in the winsys for all buffers
    * that have been added (cs_add_buffer) and two counters in the pipe
    * driver for those that haven't been added yet.
    */
   if (unlikely(!radeon_cs_memory_below_limit(ctx->screen, ctx->gfx_cs, ctx->vram, ctx->gtt))) {
      ctx->gtt = 0;
      ctx->vram = 0;
      si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
      return;
   }
   ctx->gtt = 0;
   ctx->vram = 0;

   unsigned need_dwords = si_get_minimum_num_gfx_cs_dwords(ctx);
   if (!ctx->ws->cs_check_space(cs, need_dwords, false))
      si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
}

void si_unref_sdma_uploads(struct si_context *sctx)
{
   for (unsigned i = 0; i < sctx->num_sdma_uploads; i++) {
      si_resource_reference(&sctx->sdma_uploads[i].dst, NULL);
      si_resource_reference(&sctx->sdma_uploads[i].src, NULL);
   }
   sctx->num_sdma_uploads = 0;
}

void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence)
{
   struct radeon_cmdbuf *cs = ctx->gfx_cs;
   struct radeon_winsys *ws = ctx->ws;
   const unsigned wait_ps_cs = SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
   unsigned wait_flags = 0;

   if (ctx->gfx_flush_in_progress)
      return;

   if (!ctx->screen->info.kernel_flushes_tc_l2_after_ib) {
      wait_flags |= wait_ps_cs | SI_CONTEXT_INV_L2;
   } else if (ctx->chip_class == GFX6) {
      /* The kernel flushes L2 before shaders are finished. */
      wait_flags |= wait_ps_cs;
   } else if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) {
      wait_flags |= wait_ps_cs;
   }

   /* Drop this flush if it's a no-op. */
   if (!radeon_emitted(cs, ctx->initial_gfx_cs_size) && (!wait_flags || !ctx->gfx_last_ib_is_busy))
      return;

   if (ctx->b.get_device_reset_status(&ctx->b) != PIPE_NO_RESET)
      return;

   if (ctx->screen->debug_flags & DBG(CHECK_VM))
      flags &= ~PIPE_FLUSH_ASYNC;

   ctx->gfx_flush_in_progress = true;

   /* If the state tracker is flushing the GFX IB, si_flush_from_st is
    * responsible for flushing the DMA IB and merging the fences from both.
    * If the driver flushes the GFX IB internally, and it should never ask
    * for a fence handle.
    */
   assert(!radeon_emitted(ctx->sdma_cs, 0) || fence == NULL);

   /* Update the sdma_uploads list by flushing the uploader. */
   u_upload_unmap(ctx->b.const_uploader);

   /* Execute SDMA uploads. */
   ctx->sdma_uploads_in_progress = true;
   for (unsigned i = 0; i < ctx->num_sdma_uploads; i++) {
      struct si_sdma_upload *up = &ctx->sdma_uploads[i];

      assert(up->src_offset % 4 == 0 && up->dst_offset % 4 == 0 && up->size % 4 == 0);

      si_sdma_copy_buffer(ctx, &up->dst->b.b, &up->src->b.b, up->dst_offset, up->src_offset,
                          up->size);
   }
   ctx->sdma_uploads_in_progress = false;
   si_unref_sdma_uploads(ctx);

   /* Flush SDMA (preamble IB). */
   if (radeon_emitted(ctx->sdma_cs, 0))
      si_flush_dma_cs(ctx, flags, NULL);

   if (radeon_emitted(ctx->prim_discard_compute_cs, 0)) {
      struct radeon_cmdbuf *compute_cs = ctx->prim_discard_compute_cs;
      si_compute_signal_gfx(ctx);

      /* Make sure compute shaders are idle before leaving the IB, so that
       * the next IB doesn't overwrite GDS that might be in use. */
      radeon_emit(compute_cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(compute_cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));

      /* Save the GDS prim restart counter if needed. */
      if (ctx->preserve_prim_restart_gds_at_flush) {
         si_cp_copy_data(ctx, compute_cs, COPY_DATA_DST_MEM, ctx->wait_mem_scratch, 4,
                         COPY_DATA_GDS, NULL, 4);
      }
   }

   if (ctx->has_graphics) {
      if (!list_is_empty(&ctx->active_queries))
         si_suspend_queries(ctx);

      ctx->streamout.suspended = false;
      if (ctx->streamout.begin_emitted) {
         si_emit_streamout_end(ctx);
         ctx->streamout.suspended = true;

         /* Since NGG streamout uses GDS, we need to make GDS
          * idle when we leave the IB, otherwise another process
          * might overwrite it while our shaders are busy.
          */
         if (ctx->screen->use_ngg_streamout)
            wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
      }
   }

   /* Make sure CP DMA is idle at the end of IBs after L2 prefetches
    * because the kernel doesn't wait for it. */
   if (ctx->chip_class >= GFX7)
      si_cp_dma_wait_for_idle(ctx);

   /* Wait for draw calls to finish if needed. */
   if (wait_flags) {
      ctx->flags |= wait_flags;
      ctx->emit_cache_flush(ctx);
   }
   ctx->gfx_last_ib_is_busy = (wait_flags & wait_ps_cs) != wait_ps_cs;

   if (ctx->current_saved_cs) {
      si_trace_emit(ctx);

      /* Save the IB for debug contexts. */
      si_save_cs(ws, cs, &ctx->current_saved_cs->gfx, true);
      ctx->current_saved_cs->flushed = true;
      ctx->current_saved_cs->time_flush = os_time_get_nano();

      si_log_hw_flush(ctx);
   }

   if (si_compute_prim_discard_enabled(ctx)) {
      /* The compute IB can start after the previous gfx IB starts. */
      if (radeon_emitted(ctx->prim_discard_compute_cs, 0) && ctx->last_gfx_fence) {
         ctx->ws->cs_add_fence_dependency(
            ctx->gfx_cs, ctx->last_gfx_fence,
            RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY | RADEON_DEPENDENCY_START_FENCE);
      }

      /* Remember the last execution barrier. It's in the IB.
       * It will signal the start of the next compute IB.
       */
      if (flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW && ctx->last_pkt3_write_data) {
         *ctx->last_pkt3_write_data = PKT3(PKT3_WRITE_DATA, 3, 0);
         ctx->last_pkt3_write_data = NULL;

         si_resource_reference(&ctx->last_ib_barrier_buf, ctx->barrier_buf);
         ctx->last_ib_barrier_buf_offset = ctx->barrier_buf_offset;
         si_resource_reference(&ctx->barrier_buf, NULL);

         ws->fence_reference(&ctx->last_ib_barrier_fence, NULL);
      }
   }

   /* Flush the CS. */
   ws->cs_flush(cs, flags, &ctx->last_gfx_fence);
   if (fence)
      ws->fence_reference(fence, ctx->last_gfx_fence);

   ctx->num_gfx_cs_flushes++;

   if (si_compute_prim_discard_enabled(ctx)) {
      /* Remember the last execution barrier, which is the last fence
       * in this case.
       */
      if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) {
         ctx->last_pkt3_write_data = NULL;
         si_resource_reference(&ctx->last_ib_barrier_buf, NULL);
         ws->fence_reference(&ctx->last_ib_barrier_fence, ctx->last_gfx_fence);
      }
   }

   /* Check VM faults if needed. */
   if (ctx->screen->debug_flags & DBG(CHECK_VM)) {
      /* Use conservative timeout 800ms, after which we won't wait any
       * longer and assume the GPU is hung.
       */
      ctx->ws->fence_wait(ctx->ws, ctx->last_gfx_fence, 800 * 1000 * 1000);

      si_check_vm_faults(ctx, &ctx->current_saved_cs->gfx, RING_GFX);
   }

   if (ctx->current_saved_cs)
      si_saved_cs_reference(&ctx->current_saved_cs, NULL);

   si_begin_new_gfx_cs(ctx);
   ctx->gfx_flush_in_progress = false;
}

static void si_begin_gfx_cs_debug(struct si_context *ctx)
{
   static const uint32_t zeros[1];
   assert(!ctx->current_saved_cs);

   ctx->current_saved_cs = calloc(1, sizeof(*ctx->current_saved_cs));
   if (!ctx->current_saved_cs)
      return;

   pipe_reference_init(&ctx->current_saved_cs->reference, 1);

   ctx->current_saved_cs->trace_buf =
      si_resource(pipe_buffer_create(ctx->b.screen, 0, PIPE_USAGE_STAGING, 8));
   if (!ctx->current_saved_cs->trace_buf) {
      free(ctx->current_saved_cs);
      ctx->current_saved_cs = NULL;
      return;
   }

   pipe_buffer_write_nooverlap(&ctx->b, &ctx->current_saved_cs->trace_buf->b.b, 0, sizeof(zeros),
                               zeros);
   ctx->current_saved_cs->trace_id = 0;

   si_trace_emit(ctx);

   radeon_add_to_buffer_list(ctx, ctx->gfx_cs, ctx->current_saved_cs->trace_buf,
                             RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE);
}

static void si_add_gds_to_buffer_list(struct si_context *sctx)
{
   if (sctx->gds) {
      sctx->ws->cs_add_buffer(sctx->gfx_cs, sctx->gds, RADEON_USAGE_READWRITE, 0, 0);
      if (sctx->gds_oa) {
         sctx->ws->cs_add_buffer(sctx->gfx_cs, sctx->gds_oa, RADEON_USAGE_READWRITE, 0, 0);
      }
   }
}

void si_allocate_gds(struct si_context *sctx)
{
   struct radeon_winsys *ws = sctx->ws;

   if (sctx->gds)
      return;

   assert(sctx->screen->use_ngg_streamout);

   /* 4 streamout GDS counters.
    * We need 256B (64 dw) of GDS, otherwise streamout hangs.
    */
   sctx->gds = ws->buffer_create(ws, 256, 4, RADEON_DOMAIN_GDS, 0);
   sctx->gds_oa = ws->buffer_create(ws, 4, 1, RADEON_DOMAIN_OA, 0);

   assert(sctx->gds && sctx->gds_oa);
   si_add_gds_to_buffer_list(sctx);
}

void si_begin_new_gfx_cs(struct si_context *ctx)
|
||||
{
|
||||
if (ctx->is_debug)
|
||||
si_begin_gfx_cs_debug(ctx);
|
||||
if (ctx->is_debug)
|
||||
si_begin_gfx_cs_debug(ctx);
|
||||
|
||||
si_add_gds_to_buffer_list(ctx);
|
||||
si_add_gds_to_buffer_list(ctx);
|
||||
|
||||
/* Always invalidate caches at the beginning of IBs, because external
|
||||
* users (e.g. BO evictions and SDMA/UVD/VCE IBs) can modify our
|
||||
* buffers.
|
||||
*
|
||||
* Note that the cache flush done by the kernel at the end of GFX IBs
|
||||
* isn't useful here, because that flush can finish after the following
|
||||
* IB starts drawing.
|
||||
*
|
||||
* TODO: Do we also need to invalidate CB & DB caches?
|
||||
*/
|
||||
ctx->flags |= SI_CONTEXT_INV_ICACHE |
|
||||
SI_CONTEXT_INV_SCACHE |
|
||||
SI_CONTEXT_INV_VCACHE |
|
||||
SI_CONTEXT_INV_L2 |
|
||||
SI_CONTEXT_START_PIPELINE_STATS;
|
||||
/* Always invalidate caches at the beginning of IBs, because external
|
||||
* users (e.g. BO evictions and SDMA/UVD/VCE IBs) can modify our
|
||||
* buffers.
|
||||
*
|
||||
* Note that the cache flush done by the kernel at the end of GFX IBs
|
||||
* isn't useful here, because that flush can finish after the following
|
||||
* IB starts drawing.
|
||||
*
|
||||
* TODO: Do we also need to invalidate CB & DB caches?
|
||||
*/
|
||||
ctx->flags |= SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
|
||||
SI_CONTEXT_INV_L2 | SI_CONTEXT_START_PIPELINE_STATS;
|
||||
|
||||
ctx->cs_shader_state.initialized = false;
|
||||
si_all_descriptors_begin_new_cs(ctx);
|
||||
ctx->cs_shader_state.initialized = false;
|
||||
si_all_descriptors_begin_new_cs(ctx);
|
||||
|
||||
if (!ctx->has_graphics) {
|
||||
ctx->initial_gfx_cs_size = ctx->gfx_cs->current.cdw;
|
||||
return;
|
||||
}
|
||||
if (!ctx->has_graphics) {
|
||||
ctx->initial_gfx_cs_size = ctx->gfx_cs->current.cdw;
|
||||
return;
|
||||
}
|
||||
|
||||
/* set all valid group as dirty so they get reemited on
|
||||
* next draw command
|
||||
*/
|
||||
si_pm4_reset_emitted(ctx);
|
||||
/* set all valid group as dirty so they get reemited on
|
||||
* next draw command
|
||||
*/
|
||||
si_pm4_reset_emitted(ctx);
|
||||
|
||||
/* The CS initialization should be emitted before everything else. */
|
||||
si_pm4_emit(ctx, ctx->init_config);
|
||||
if (ctx->init_config_gs_rings)
|
||||
si_pm4_emit(ctx, ctx->init_config_gs_rings);
|
||||
/* The CS initialization should be emitted before everything else. */
|
||||
si_pm4_emit(ctx, ctx->init_config);
|
||||
if (ctx->init_config_gs_rings)
|
||||
si_pm4_emit(ctx, ctx->init_config_gs_rings);
|
||||
|
||||
if (ctx->queued.named.ls)
|
||||
ctx->prefetch_L2_mask |= SI_PREFETCH_LS;
|
||||
if (ctx->queued.named.hs)
|
||||
ctx->prefetch_L2_mask |= SI_PREFETCH_HS;
|
||||
if (ctx->queued.named.es)
|
||||
ctx->prefetch_L2_mask |= SI_PREFETCH_ES;
|
||||
if (ctx->queued.named.gs)
|
||||
ctx->prefetch_L2_mask |= SI_PREFETCH_GS;
|
||||
if (ctx->queued.named.vs)
|
||||
ctx->prefetch_L2_mask |= SI_PREFETCH_VS;
|
||||
if (ctx->queued.named.ps)
|
||||
ctx->prefetch_L2_mask |= SI_PREFETCH_PS;
|
||||
if (ctx->vb_descriptors_buffer && ctx->vertex_elements)
|
||||
ctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS;
|
||||
if (ctx->queued.named.ls)
|
||||
ctx->prefetch_L2_mask |= SI_PREFETCH_LS;
|
||||
if (ctx->queued.named.hs)
|
||||
ctx->prefetch_L2_mask |= SI_PREFETCH_HS;
|
||||
if (ctx->queued.named.es)
|
||||
ctx->prefetch_L2_mask |= SI_PREFETCH_ES;
|
||||
if (ctx->queued.named.gs)
|
||||
ctx->prefetch_L2_mask |= SI_PREFETCH_GS;
|
||||
if (ctx->queued.named.vs)
|
||||
ctx->prefetch_L2_mask |= SI_PREFETCH_VS;
|
||||
if (ctx->queued.named.ps)
|
||||
ctx->prefetch_L2_mask |= SI_PREFETCH_PS;
|
||||
if (ctx->vb_descriptors_buffer && ctx->vertex_elements)
|
||||
ctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS;
|
||||
|
||||
/* CLEAR_STATE disables all colorbuffers, so only enable bound ones. */
|
||||
bool has_clear_state = ctx->screen->info.has_clear_state;
|
||||
if (has_clear_state) {
|
||||
ctx->framebuffer.dirty_cbufs =
|
||||
u_bit_consecutive(0, ctx->framebuffer.state.nr_cbufs);
|
||||
/* CLEAR_STATE disables the zbuffer, so only enable it if it's bound. */
|
||||
ctx->framebuffer.dirty_zsbuf = ctx->framebuffer.state.zsbuf != NULL;
|
||||
} else {
|
||||
ctx->framebuffer.dirty_cbufs = u_bit_consecutive(0, 8);
|
||||
ctx->framebuffer.dirty_zsbuf = true;
|
||||
}
|
||||
/* This should always be marked as dirty to set the framebuffer scissor
|
||||
* at least. */
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.framebuffer);
|
||||
/* CLEAR_STATE disables all colorbuffers, so only enable bound ones. */
|
||||
bool has_clear_state = ctx->screen->info.has_clear_state;
|
||||
if (has_clear_state) {
|
||||
ctx->framebuffer.dirty_cbufs = u_bit_consecutive(0, ctx->framebuffer.state.nr_cbufs);
|
||||
/* CLEAR_STATE disables the zbuffer, so only enable it if it's bound. */
|
||||
ctx->framebuffer.dirty_zsbuf = ctx->framebuffer.state.zsbuf != NULL;
|
||||
} else {
|
||||
ctx->framebuffer.dirty_cbufs = u_bit_consecutive(0, 8);
|
||||
ctx->framebuffer.dirty_zsbuf = true;
|
||||
}
|
||||
/* This should always be marked as dirty to set the framebuffer scissor
|
||||
* at least. */
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.framebuffer);
|
||||
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.clip_regs);
|
||||
/* CLEAR_STATE sets zeros. */
|
||||
if (!has_clear_state || ctx->clip_state.any_nonzeros)
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.clip_state);
|
||||
ctx->sample_locs_num_samples = 0;
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.msaa_sample_locs);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.msaa_config);
|
||||
/* CLEAR_STATE sets 0xffff. */
|
||||
if (!has_clear_state || ctx->sample_mask != 0xffff)
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.sample_mask);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.cb_render_state);
|
||||
/* CLEAR_STATE sets zeros. */
|
||||
if (!has_clear_state || ctx->blend_color.any_nonzeros)
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.blend_color);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.db_render_state);
|
||||
if (ctx->chip_class >= GFX9)
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.dpbb_state);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.stencil_ref);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.spi_map);
|
||||
if (!ctx->screen->use_ngg_streamout)
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.streamout_enable);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.render_cond);
|
||||
/* CLEAR_STATE disables all window rectangles. */
|
||||
if (!has_clear_state || ctx->num_window_rectangles > 0)
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.window_rectangles);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.clip_regs);
|
||||
/* CLEAR_STATE sets zeros. */
|
||||
if (!has_clear_state || ctx->clip_state.any_nonzeros)
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.clip_state);
|
||||
ctx->sample_locs_num_samples = 0;
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.msaa_sample_locs);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.msaa_config);
|
||||
/* CLEAR_STATE sets 0xffff. */
|
||||
if (!has_clear_state || ctx->sample_mask != 0xffff)
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.sample_mask);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.cb_render_state);
|
||||
/* CLEAR_STATE sets zeros. */
|
||||
if (!has_clear_state || ctx->blend_color.any_nonzeros)
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.blend_color);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.db_render_state);
|
||||
if (ctx->chip_class >= GFX9)
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.dpbb_state);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.stencil_ref);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.spi_map);
|
||||
if (!ctx->screen->use_ngg_streamout)
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.streamout_enable);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.render_cond);
|
||||
/* CLEAR_STATE disables all window rectangles. */
|
||||
if (!has_clear_state || ctx->num_window_rectangles > 0)
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.window_rectangles);
|
||||
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
|
||||
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.scratch_state);
|
||||
if (ctx->scratch_buffer) {
|
||||
si_context_add_resource_size(ctx, &ctx->scratch_buffer->b.b);
|
||||
}
|
||||
si_mark_atom_dirty(ctx, &ctx->atoms.s.scratch_state);
|
||||
if (ctx->scratch_buffer) {
|
||||
si_context_add_resource_size(ctx, &ctx->scratch_buffer->b.b);
|
||||
}
|
||||
|
||||
if (ctx->streamout.suspended) {
|
||||
ctx->streamout.append_bitmask = ctx->streamout.enabled_mask;
|
||||
si_streamout_buffers_dirty(ctx);
|
||||
}
|
||||
if (ctx->streamout.suspended) {
|
||||
ctx->streamout.append_bitmask = ctx->streamout.enabled_mask;
|
||||
si_streamout_buffers_dirty(ctx);
|
||||
}
|
||||
|
||||
if (!list_is_empty(&ctx->active_queries))
|
||||
si_resume_queries(ctx);
|
||||
if (!list_is_empty(&ctx->active_queries))
|
||||
si_resume_queries(ctx);
|
||||
|
||||
assert(!ctx->gfx_cs->prev_dw);
|
||||
ctx->initial_gfx_cs_size = ctx->gfx_cs->current.cdw;
|
||||
assert(!ctx->gfx_cs->prev_dw);
|
||||
ctx->initial_gfx_cs_size = ctx->gfx_cs->current.cdw;
|
||||
|
||||
/* Invalidate various draw states so that they are emitted before
|
||||
* the first draw call. */
|
||||
si_invalidate_draw_sh_constants(ctx);
|
||||
ctx->last_index_size = -1;
|
||||
ctx->last_primitive_restart_en = -1;
|
||||
ctx->last_restart_index = SI_RESTART_INDEX_UNKNOWN;
|
||||
ctx->last_prim = -1;
|
||||
ctx->last_multi_vgt_param = -1;
|
||||
ctx->last_vs_state = ~0;
|
||||
ctx->last_ls = NULL;
|
||||
ctx->last_tcs = NULL;
|
||||
ctx->last_tes_sh_base = -1;
|
||||
ctx->last_num_tcs_input_cp = -1;
|
||||
ctx->last_ls_hs_config = -1; /* impossible value */
|
||||
ctx->last_binning_enabled = -1;
|
||||
ctx->small_prim_cull_info_dirty = ctx->small_prim_cull_info_buf != NULL;
|
||||
/* Invalidate various draw states so that they are emitted before
|
||||
* the first draw call. */
|
||||
si_invalidate_draw_sh_constants(ctx);
|
||||
ctx->last_index_size = -1;
|
||||
ctx->last_primitive_restart_en = -1;
|
||||
ctx->last_restart_index = SI_RESTART_INDEX_UNKNOWN;
|
||||
ctx->last_prim = -1;
|
||||
ctx->last_multi_vgt_param = -1;
|
||||
ctx->last_vs_state = ~0;
|
||||
ctx->last_ls = NULL;
|
||||
ctx->last_tcs = NULL;
|
||||
ctx->last_tes_sh_base = -1;
|
||||
ctx->last_num_tcs_input_cp = -1;
|
||||
ctx->last_ls_hs_config = -1; /* impossible value */
|
||||
ctx->last_binning_enabled = -1;
|
||||
ctx->small_prim_cull_info_dirty = ctx->small_prim_cull_info_buf != NULL;
|
||||
|
||||
ctx->prim_discard_compute_ib_initialized = false;
|
||||
ctx->prim_discard_compute_ib_initialized = false;
|
||||
|
||||
/* Compute-based primitive discard:
|
||||
* The index ring is divided into 2 halves. Switch between the halves
|
||||
* in the same fashion as doublebuffering.
|
||||
*/
|
||||
if (ctx->index_ring_base)
|
||||
ctx->index_ring_base = 0;
|
||||
else
|
||||
ctx->index_ring_base = ctx->index_ring_size_per_ib;
|
||||
/* Compute-based primitive discard:
|
||||
* The index ring is divided into 2 halves. Switch between the halves
|
||||
* in the same fashion as doublebuffering.
|
||||
*/
|
||||
if (ctx->index_ring_base)
|
||||
ctx->index_ring_base = 0;
|
||||
else
|
||||
ctx->index_ring_base = ctx->index_ring_size_per_ib;
|
||||
|
||||
ctx->index_ring_offset = 0;
|
||||
ctx->index_ring_offset = 0;
|
||||
|
||||
STATIC_ASSERT(SI_NUM_TRACKED_REGS <= sizeof(ctx->tracked_regs.reg_saved) * 8);
|
||||
STATIC_ASSERT(SI_NUM_TRACKED_REGS <= sizeof(ctx->tracked_regs.reg_saved) * 8);
|
||||
|
||||
if (has_clear_state) {
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_CONTROL] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_DB_COUNT_CONTROL] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_OVERRIDE2] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_DB_SHADER_CONTROL] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_CB_TARGET_MASK] = 0xffffffff;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_CB_DCC_CONTROL] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_SX_PS_DOWNCONVERT] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_SX_BLEND_OPT_EPSILON] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_SX_BLEND_OPT_CONTROL] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_CNTL] = 0x00001000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_AA_CONFIG] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_DB_EQAA] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_MODE_CNTL_1] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_PRIM_FILTER_CNTL] = 0;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VS_OUT_CNTL__VS] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VS_OUT_CNTL__CL] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_CLIP_CNTL] = 0x00090000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_BINNER_CNTL_0] = 0x00000003;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_DB_DFSM_CONTROL] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ] = 0x3f800000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_VERT_DISC_ADJ] = 0x3f800000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_CLIP_ADJ] = 0x3f800000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_DISC_ADJ] = 0x3f800000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET] = 0;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_VTX_CNTL] = 0x00000005;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_CLIPRECT_RULE] = 0xffff;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_STIPPLE] = 0;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_ESGS_RING_ITEMSIZE] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_1] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_2] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_3] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_ITEMSIZE] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_VERT_OUT] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_1] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_INSTANCE_CNT] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_ONCHIP_CNTL] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MODE] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_PRIMITIVEID_EN] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_REUSE_OFF] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_SPI_VS_OUT_CONFIG] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_GE_NGG_SUBGRP_CNTL] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_IDX_FORMAT] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_POS_FORMAT] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VTE_CNTL] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_NGG_CNTL] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ENA] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ADDR] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_SPI_BARYC_CNTL] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_IN_CONTROL] = 0x00000002;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_Z_FORMAT] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_COL_FORMAT] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_CB_SHADER_MASK] = 0xffffffff;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_TF_PARAM] = 0x00000000;
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL] = 0x0000001e; /* From GFX8 */
|
||||
if (has_clear_state) {
|
||||
ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_CONTROL] = 0x00000000;
|
||||
      ctx->tracked_regs.reg_value[SI_TRACKED_DB_COUNT_CONTROL] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_OVERRIDE2] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_DB_SHADER_CONTROL] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_CB_TARGET_MASK] = 0xffffffff;
      ctx->tracked_regs.reg_value[SI_TRACKED_CB_DCC_CONTROL] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_SX_PS_DOWNCONVERT] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_SX_BLEND_OPT_EPSILON] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_SX_BLEND_OPT_CONTROL] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_CNTL] = 0x00001000;
      ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_AA_CONFIG] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_DB_EQAA] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_MODE_CNTL_1] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_PRIM_FILTER_CNTL] = 0;
      ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VS_OUT_CNTL__VS] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VS_OUT_CNTL__CL] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_CLIP_CNTL] = 0x00090000;
      ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_BINNER_CNTL_0] = 0x00000003;
      ctx->tracked_regs.reg_value[SI_TRACKED_DB_DFSM_CONTROL] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ] = 0x3f800000;
      ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_VERT_DISC_ADJ] = 0x3f800000;
      ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_CLIP_ADJ] = 0x3f800000;
      ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_DISC_ADJ] = 0x3f800000;
      ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET] = 0;
      ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_VTX_CNTL] = 0x00000005;
      ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_CLIPRECT_RULE] = 0xffff;
      ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_STIPPLE] = 0;
      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_ESGS_RING_ITEMSIZE] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_1] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_2] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_3] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_ITEMSIZE] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_VERT_OUT] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_1] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_INSTANCE_CNT] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_ONCHIP_CNTL] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MODE] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_PRIMITIVEID_EN] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_REUSE_OFF] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_SPI_VS_OUT_CONFIG] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_GE_NGG_SUBGRP_CNTL] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_IDX_FORMAT] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_POS_FORMAT] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VTE_CNTL] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_NGG_CNTL] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ENA] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ADDR] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_SPI_BARYC_CNTL] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_IN_CONTROL] = 0x00000002;
      ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_Z_FORMAT] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_COL_FORMAT] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_CB_SHADER_MASK] = 0xffffffff;
      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_TF_PARAM] = 0x00000000;
      ctx->tracked_regs.reg_value[SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL] =
         0x0000001e; /* From GFX8 */

      /* Set all cleared context registers to saved. */
      ctx->tracked_regs.reg_saved = ~(1ull << SI_TRACKED_GE_PC_ALLOC); /* uconfig reg */
      ctx->last_gs_out_prim = 0; /* cleared by CLEAR_STATE */
   } else {
      /* Set all register values to unknown. */
      ctx->tracked_regs.reg_saved = 0;
      ctx->last_gs_out_prim = -1; /* unknown */
   }

   /* 0xffffffff is an impossible value for register SPI_PS_INPUT_CNTL_n */
   memset(ctx->tracked_regs.spi_ps_input_cntl, 0xff, sizeof(uint32_t) * 32);
}
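The reg_value/reg_saved pair initialized above exists so that later state emission can skip register writes whose value is already known to be loaded. A minimal sketch of such a check, assuming a hypothetical si_opt_set_context_reg() helper on top of the si_pm4_set_reg() API from this series (the driver's real emit paths are more involved and operate on the command stream directly):

/* Hypothetical helper: only record the register write if the shadowed value
 * is unknown or differs from the new value. */
static void si_opt_set_context_reg(struct si_context *ctx, struct si_pm4_state *pm4,
                                   unsigned reg_offset, unsigned tracked_idx, uint32_t value)
{
   if ((ctx->tracked_regs.reg_saved & (1ull << tracked_idx)) &&
       ctx->tracked_regs.reg_value[tracked_idx] == value)
      return; /* the register already holds this value, skip the packet */

   si_pm4_set_reg(pm4, reg_offset, value);
   ctx->tracked_regs.reg_saved |= 1ull << tracked_idx;
   ctx->tracked_regs.reg_value[tracked_idx] = value;
}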
@@ -40,242 +40,234 @@
 * fps (there are too few samples per frame). */
#define SAMPLES_PER_SEC 10000

#define GRBM_STATUS   0x8010
#define TA_BUSY(x)    (((x) >> 14) & 0x1)
#define GDS_BUSY(x)   (((x) >> 15) & 0x1)
#define VGT_BUSY(x)   (((x) >> 17) & 0x1)
#define IA_BUSY(x)    (((x) >> 19) & 0x1)
#define SX_BUSY(x)    (((x) >> 20) & 0x1)
#define WD_BUSY(x)    (((x) >> 21) & 0x1)
#define SPI_BUSY(x)   (((x) >> 22) & 0x1)
#define BCI_BUSY(x)   (((x) >> 23) & 0x1)
#define SC_BUSY(x)    (((x) >> 24) & 0x1)
#define PA_BUSY(x)    (((x) >> 25) & 0x1)
#define DB_BUSY(x)    (((x) >> 26) & 0x1)
#define CP_BUSY(x)    (((x) >> 29) & 0x1)
#define CB_BUSY(x)    (((x) >> 30) & 0x1)
#define GUI_ACTIVE(x) (((x) >> 31) & 0x1)

#define SRBM_STATUS2 0x0e4c
#define SDMA_BUSY(x) (((x) >> 5) & 0x1)

#define CP_STAT              0x8680
#define PFP_BUSY(x)          (((x) >> 15) & 0x1)
#define MEQ_BUSY(x)          (((x) >> 16) & 0x1)
#define ME_BUSY(x)           (((x) >> 17) & 0x1)
#define SURFACE_SYNC_BUSY(x) (((x) >> 21) & 0x1)
#define DMA_BUSY(x)          (((x) >> 22) & 0x1)
#define SCRATCH_RAM_BUSY(x)  (((x) >> 24) & 0x1)

#define IDENTITY(x) x

#define UPDATE_COUNTER(field, mask)                  \
   do {                                              \
      if (mask(value))                               \
         p_atomic_inc(&counters->named.field.busy);  \
      else                                           \
         p_atomic_inc(&counters->named.field.idle);  \
   } while (0)

static void si_update_mmio_counters(struct si_screen *sscreen, union si_mmio_counters *counters)
{
   uint32_t value = 0;
   bool gui_busy, sdma_busy = false;

   /* GRBM_STATUS */
   sscreen->ws->read_registers(sscreen->ws, GRBM_STATUS, 1, &value);

   UPDATE_COUNTER(ta, TA_BUSY);
   UPDATE_COUNTER(gds, GDS_BUSY);
   UPDATE_COUNTER(vgt, VGT_BUSY);
   UPDATE_COUNTER(ia, IA_BUSY);
   UPDATE_COUNTER(sx, SX_BUSY);
   UPDATE_COUNTER(wd, WD_BUSY);
   UPDATE_COUNTER(spi, SPI_BUSY);
   UPDATE_COUNTER(bci, BCI_BUSY);
   UPDATE_COUNTER(sc, SC_BUSY);
   UPDATE_COUNTER(pa, PA_BUSY);
   UPDATE_COUNTER(db, DB_BUSY);
   UPDATE_COUNTER(cp, CP_BUSY);
   UPDATE_COUNTER(cb, CB_BUSY);
   UPDATE_COUNTER(gui, GUI_ACTIVE);
   gui_busy = GUI_ACTIVE(value);

   if (sscreen->info.chip_class == GFX7 || sscreen->info.chip_class == GFX8) {
      /* SRBM_STATUS2 */
      sscreen->ws->read_registers(sscreen->ws, SRBM_STATUS2, 1, &value);

      UPDATE_COUNTER(sdma, SDMA_BUSY);
      sdma_busy = SDMA_BUSY(value);
   }

   if (sscreen->info.chip_class >= GFX8) {
      /* CP_STAT */
      sscreen->ws->read_registers(sscreen->ws, CP_STAT, 1, &value);

      UPDATE_COUNTER(pfp, PFP_BUSY);
      UPDATE_COUNTER(meq, MEQ_BUSY);
      UPDATE_COUNTER(me, ME_BUSY);
      UPDATE_COUNTER(surf_sync, SURFACE_SYNC_BUSY);
      UPDATE_COUNTER(cp_dma, DMA_BUSY);
      UPDATE_COUNTER(scratch_ram, SCRATCH_RAM_BUSY);
   }

   value = gui_busy || sdma_busy;
   UPDATE_COUNTER(gpu, IDENTITY);
}

#undef UPDATE_COUNTER
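UPDATE_COUNTER above and BUSY_INDEX below both depend on union si_mmio_counters exposing the same storage twice: once as named busy/idle pairs and once as a flat array of unsigneds. That union is defined in si_pipe.h, which is not part of this diff, so the following is only a plausible shape written out for reference, not the real definition:

/* A plausible shape for the counters union assumed by UPDATE_COUNTER and
 * BUSY_INDEX (the real definition lives in si_pipe.h and may differ).
 * Each hardware unit gets a {busy, idle} pair, and the same memory is also
 * visible as a flat array so a counter can be addressed by index. */
struct si_mmio_counter {
   unsigned busy;
   unsigned idle;
};

union si_mmio_counters_example {
   struct {
      struct si_mmio_counter ta, gds, vgt, ia, sx, wd, spi, bci, sc, pa, db, cp, cb,
         sdma, pfp, meq, me, surf_sync, cp_dma, scratch_ram, gui, gpu;
   } named;
   unsigned array[44]; /* 22 units x {busy, idle} */
};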

static int si_gpu_load_thread(void *param)
{
   struct si_screen *sscreen = (struct si_screen *)param;
   const int period_us = 1000000 / SAMPLES_PER_SEC;
   int sleep_us = period_us;
   int64_t cur_time, last_time = os_time_get();

   while (!p_atomic_read(&sscreen->gpu_load_stop_thread)) {
      if (sleep_us)
         os_time_sleep(sleep_us);

      /* Make sure we sleep the ideal amount of time to match
       * the expected frequency. */
      cur_time = os_time_get();

      if (os_time_timeout(last_time, last_time + period_us, cur_time))
         sleep_us = MAX2(sleep_us - 1, 1);
      else
         sleep_us += 1;

      /*printf("Hz: %.1f\n", 1000000.0 / (cur_time - last_time));*/
      last_time = cur_time;

      /* Update the counters. */
      si_update_mmio_counters(sscreen, &sscreen->mmio_counters);
   }
   p_atomic_dec(&sscreen->gpu_load_stop_thread);
   return 0;
}

void si_gpu_load_kill_thread(struct si_screen *sscreen)
{
   if (!sscreen->gpu_load_thread)
      return;

   p_atomic_inc(&sscreen->gpu_load_stop_thread);
   thrd_join(sscreen->gpu_load_thread, NULL);
   sscreen->gpu_load_thread = 0;
}

static uint64_t si_read_mmio_counter(struct si_screen *sscreen, unsigned busy_index)
{
   /* Start the thread if needed. */
   if (!sscreen->gpu_load_thread) {
      simple_mtx_lock(&sscreen->gpu_load_mutex);
      /* Check again inside the mutex. */
      if (!sscreen->gpu_load_thread)
         sscreen->gpu_load_thread = u_thread_create(si_gpu_load_thread, sscreen);
      simple_mtx_unlock(&sscreen->gpu_load_mutex);
   }

   unsigned busy = p_atomic_read(&sscreen->mmio_counters.array[busy_index]);
   unsigned idle = p_atomic_read(&sscreen->mmio_counters.array[busy_index + 1]);

   return busy | ((uint64_t)idle << 32);
}

static unsigned si_end_mmio_counter(struct si_screen *sscreen, uint64_t begin, unsigned busy_index)
{
   uint64_t end = si_read_mmio_counter(sscreen, busy_index);
   unsigned busy = (end & 0xffffffff) - (begin & 0xffffffff);
   unsigned idle = (end >> 32) - (begin >> 32);

   /* Calculate the % of time the busy counter was being incremented.
    *
    * If no counters were incremented, return the current counter status.
    * It's for the case when the load is queried faster than
    * the counters are updated.
    */
   if (idle || busy) {
      return busy * 100 / (busy + idle);
   } else {
      union si_mmio_counters counters;

      memset(&counters, 0, sizeof(counters));
      si_update_mmio_counters(sscreen, &counters);
      return counters.array[busy_index] ? 100 : 0;
   }
}
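A quick worked example of the packing used above: each snapshot is busy | (idle << 32), and the percentage comes from the deltas between two snapshots. The sample counts below are made up purely for illustration.

/* Illustration only: made-up sample counts showing how two packed
 * busy|idle<<32 snapshots turn into a load percentage. */
static unsigned example_load_percentage(void)
{
   uint64_t begin = 100 | ((uint64_t)900 << 32);  /* 100 busy, 900 idle samples so far */
   uint64_t end = 400 | ((uint64_t)1600 << 32);   /* 400 busy, 1600 idle samples now */

   unsigned busy = (end & 0xffffffff) - (begin & 0xffffffff); /* 300 busy samples in between */
   unsigned idle = (end >> 32) - (begin >> 32);               /* 700 idle samples in between */

   return busy * 100 / (busy + idle); /* 300 * 100 / 1000 = 30 */
}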

#define BUSY_INDEX(sscreen, field)  \
   (&sscreen->mmio_counters.named.field.busy - sscreen->mmio_counters.array)

static unsigned busy_index_from_type(struct si_screen *sscreen, unsigned type)
{
   switch (type) {
   case SI_QUERY_GPU_LOAD:
      return BUSY_INDEX(sscreen, gpu);
   case SI_QUERY_GPU_SHADERS_BUSY:
      return BUSY_INDEX(sscreen, spi);
   case SI_QUERY_GPU_TA_BUSY:
      return BUSY_INDEX(sscreen, ta);
   case SI_QUERY_GPU_GDS_BUSY:
      return BUSY_INDEX(sscreen, gds);
   case SI_QUERY_GPU_VGT_BUSY:
      return BUSY_INDEX(sscreen, vgt);
   case SI_QUERY_GPU_IA_BUSY:
      return BUSY_INDEX(sscreen, ia);
   case SI_QUERY_GPU_SX_BUSY:
      return BUSY_INDEX(sscreen, sx);
   case SI_QUERY_GPU_WD_BUSY:
      return BUSY_INDEX(sscreen, wd);
   case SI_QUERY_GPU_BCI_BUSY:
      return BUSY_INDEX(sscreen, bci);
   case SI_QUERY_GPU_SC_BUSY:
      return BUSY_INDEX(sscreen, sc);
   case SI_QUERY_GPU_PA_BUSY:
      return BUSY_INDEX(sscreen, pa);
   case SI_QUERY_GPU_DB_BUSY:
      return BUSY_INDEX(sscreen, db);
   case SI_QUERY_GPU_CP_BUSY:
      return BUSY_INDEX(sscreen, cp);
   case SI_QUERY_GPU_CB_BUSY:
      return BUSY_INDEX(sscreen, cb);
   case SI_QUERY_GPU_SDMA_BUSY:
      return BUSY_INDEX(sscreen, sdma);
   case SI_QUERY_GPU_PFP_BUSY:
      return BUSY_INDEX(sscreen, pfp);
   case SI_QUERY_GPU_MEQ_BUSY:
      return BUSY_INDEX(sscreen, meq);
   case SI_QUERY_GPU_ME_BUSY:
      return BUSY_INDEX(sscreen, me);
   case SI_QUERY_GPU_SURF_SYNC_BUSY:
      return BUSY_INDEX(sscreen, surf_sync);
   case SI_QUERY_GPU_CP_DMA_BUSY:
      return BUSY_INDEX(sscreen, cp_dma);
   case SI_QUERY_GPU_SCRATCH_RAM_BUSY:
      return BUSY_INDEX(sscreen, scratch_ram);
   default:
      unreachable("invalid query type");
   }
}

uint64_t si_begin_counter(struct si_screen *sscreen, unsigned type)
{
   unsigned busy_index = busy_index_from_type(sscreen, type);
   return si_read_mmio_counter(sscreen, busy_index);
}

unsigned si_end_counter(struct si_screen *sscreen, unsigned type, uint64_t begin)
{
   unsigned busy_index = busy_index_from_type(sscreen, type);
   return si_end_mmio_counter(sscreen, begin, busy_index);
}
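How a caller uses this pair: take a begin snapshot, let some work happen, then convert the end snapshot into a percentage. The sketch below is illustrative only; the surrounding code and the choice of query type are placeholders, not the driver's actual query path.

/* Illustrative caller: bracket a stretch of work with the begin/end pair.
 * The query type is one of the SI_QUERY_GPU_* values handled by
 * busy_index_from_type(); everything around the calls is a placeholder. */
static unsigned example_measure_gpu_load(struct si_screen *sscreen)
{
   uint64_t begin = si_begin_counter(sscreen, SI_QUERY_GPU_LOAD);

   /* ... submit and wait for some GPU work here ... */

   return si_end_counter(sscreen, SI_QUERY_GPU_LOAD, begin); /* 0..100 (% busy) */
}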
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -22,170 +22,159 @@
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_pipe.h"
#include "sid.h"
#include "util/u_memory.h"

void si_pm4_cmd_begin(struct si_pm4_state *state, unsigned opcode)
{
   state->last_opcode = opcode;
   state->last_pm4 = state->ndw++;
}

void si_pm4_cmd_add(struct si_pm4_state *state, uint32_t dw)
{
   state->pm4[state->ndw++] = dw;
}

void si_pm4_cmd_end(struct si_pm4_state *state, bool predicate)
{
   unsigned count;
   count = state->ndw - state->last_pm4 - 2;
   state->pm4[state->last_pm4] = PKT3(state->last_opcode, count, predicate);

   assert(state->ndw <= SI_PM4_MAX_DW);
}

void si_pm4_set_reg(struct si_pm4_state *state, unsigned reg, uint32_t val)
{
   unsigned opcode;

   if (reg >= SI_CONFIG_REG_OFFSET && reg < SI_CONFIG_REG_END) {
      opcode = PKT3_SET_CONFIG_REG;
      reg -= SI_CONFIG_REG_OFFSET;

   } else if (reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END) {
      opcode = PKT3_SET_SH_REG;
      reg -= SI_SH_REG_OFFSET;

   } else if (reg >= SI_CONTEXT_REG_OFFSET && reg < SI_CONTEXT_REG_END) {
      opcode = PKT3_SET_CONTEXT_REG;
      reg -= SI_CONTEXT_REG_OFFSET;

   } else if (reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END) {
      opcode = PKT3_SET_UCONFIG_REG;
      reg -= CIK_UCONFIG_REG_OFFSET;

   } else {
      PRINT_ERR("Invalid register offset %08x!\n", reg);
      return;
   }

   reg >>= 2;

   if (opcode != state->last_opcode || reg != (state->last_reg + 1)) {
      si_pm4_cmd_begin(state, opcode);
      si_pm4_cmd_add(state, reg);
   }

   state->last_reg = reg;
   si_pm4_cmd_add(state, val);
   si_pm4_cmd_end(state, false);
}
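The last_opcode/last_reg check in si_pm4_set_reg merges writes to consecutive registers into a single SET_*_REG packet. A small sketch of the resulting dword stream, assuming a freshly zeroed si_pm4_state; the register offsets are placeholders chosen only to be adjacent:

/* Sketch: two writes to consecutive context registers are merged into one
 * SET_CONTEXT_REG packet by the last_opcode/last_reg check above.
 * The offsets are illustrative, not real register definitions. */
static void example_merged_reg_writes(struct si_pm4_state *pm4)
{
   si_pm4_set_reg(pm4, SI_CONTEXT_REG_OFFSET + 0x100, 0x1111);
   si_pm4_set_reg(pm4, SI_CONTEXT_REG_OFFSET + 0x104, 0x2222);
   /* pm4->pm4[] now holds 4 dwords:
    *   [0] PKT3(PKT3_SET_CONTEXT_REG, 2, 0)  -- count rewritten by si_pm4_cmd_end
    *   [1] 0x100 >> 2                        -- starting register index
    *   [2] 0x1111                            -- value for the first register
    *   [3] 0x2222                            -- value appended to the same packet
    */
}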

void si_pm4_add_bo(struct si_pm4_state *state, struct si_resource *bo, enum radeon_bo_usage usage,
                   enum radeon_bo_priority priority)
{
   unsigned idx = state->nbo++;
   assert(idx < SI_PM4_MAX_BO);

   si_resource_reference(&state->bo[idx], bo);
   state->bo_usage[idx] = usage;
   state->bo_priority[idx] = priority;
}

void si_pm4_clear_state(struct si_pm4_state *state)
{
   for (int i = 0; i < state->nbo; ++i)
      si_resource_reference(&state->bo[i], NULL);
   si_resource_reference(&state->indirect_buffer, NULL);
   state->nbo = 0;
   state->ndw = 0;
}

void si_pm4_free_state(struct si_context *sctx, struct si_pm4_state *state, unsigned idx)
{
   if (!state)
      return;

   if (idx != ~0 && sctx->emitted.array[idx] == state) {
      sctx->emitted.array[idx] = NULL;
   }

   si_pm4_clear_state(state);
   FREE(state);
}

void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;

   for (int i = 0; i < state->nbo; ++i) {
      radeon_add_to_buffer_list(sctx, sctx->gfx_cs, state->bo[i], state->bo_usage[i],
                                state->bo_priority[i]);
   }

   if (!state->indirect_buffer) {
      radeon_emit_array(cs, state->pm4, state->ndw);
   } else {
      struct si_resource *ib = state->indirect_buffer;

      radeon_add_to_buffer_list(sctx, sctx->gfx_cs, ib, RADEON_USAGE_READ, RADEON_PRIO_IB2);

      radeon_emit(cs, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0));
      radeon_emit(cs, ib->gpu_address);
      radeon_emit(cs, ib->gpu_address >> 32);
      radeon_emit(cs, (ib->b.b.width0 >> 2) & 0xfffff);
   }

   if (state->atom.emit)
      state->atom.emit(sctx);
}

void si_pm4_reset_emitted(struct si_context *sctx)
{
   memset(&sctx->emitted, 0, sizeof(sctx->emitted));
   sctx->dirty_states |= u_bit_consecutive(0, SI_NUM_STATES);
}

void si_pm4_upload_indirect_buffer(struct si_context *sctx, struct si_pm4_state *state)
{
   struct pipe_screen *screen = sctx->b.screen;
   unsigned aligned_ndw = align(state->ndw, 8);

   /* only supported on GFX7 and later */
   if (sctx->chip_class < GFX7)
      return;

   assert(state->ndw);
   assert(aligned_ndw <= SI_PM4_MAX_DW);

   si_resource_reference(&state->indirect_buffer, NULL);
   /* TODO: this hangs with 1024 or higher alignment on GFX9. */
   state->indirect_buffer =
      si_aligned_buffer_create(screen, 0, PIPE_USAGE_DEFAULT, aligned_ndw * 4, 256);
   if (!state->indirect_buffer)
      return;

   /* Pad the IB to 8 DWs to meet CP fetch alignment requirements. */
   if (sctx->screen->info.gfx_ib_pad_with_type2) {
      for (int i = state->ndw; i < aligned_ndw; i++)
         state->pm4[i] = 0x80000000; /* type2 nop packet */
   } else {
      for (int i = state->ndw; i < aligned_ndw; i++)
         state->pm4[i] = 0xffff1000; /* type3 nop packet */
   }

   pipe_buffer_write(&sctx->b, &state->indirect_buffer->b.b, 0, aligned_ndw * 4, state->pm4);
}
@@ -27,8 +27,8 @@

#include "radeon/radeon_winsys.h"

#define SI_PM4_MAX_DW 176
#define SI_PM4_MAX_BO 3

// forward defines
struct si_context;

@@ -37,32 +37,31 @@ struct si_context;
 * command buffer (AKA indirect buffer, AKA IB, AKA command stream, AKA CS).
 */
struct si_atom {
   void (*emit)(struct si_context *ctx);
};

struct si_pm4_state {
   /* optional indirect buffer */
   struct si_resource *indirect_buffer;

   /* PKT3_SET_*_REG handling */
   unsigned last_opcode;
   unsigned last_reg;
   unsigned last_pm4;

   /* commands for the DE */
   unsigned ndw;
   uint32_t pm4[SI_PM4_MAX_DW];

   /* BO's referenced by this state */
   unsigned nbo;
   struct si_resource *bo[SI_PM4_MAX_BO];
   enum radeon_bo_usage bo_usage[SI_PM4_MAX_BO];
   enum radeon_bo_priority bo_priority[SI_PM4_MAX_BO];

   /* For shader states only */
   struct si_shader *shader;
   struct si_atom atom;
};

void si_pm4_cmd_begin(struct si_pm4_state *state, unsigned opcode);

@@ -70,17 +69,12 @@ void si_pm4_cmd_add(struct si_pm4_state *state, uint32_t dw);
void si_pm4_cmd_end(struct si_pm4_state *state, bool predicate);

void si_pm4_set_reg(struct si_pm4_state *state, unsigned reg, uint32_t val);
void si_pm4_add_bo(struct si_pm4_state *state, struct si_resource *bo, enum radeon_bo_usage usage,
                   enum radeon_bo_priority priority);
void si_pm4_upload_indirect_buffer(struct si_context *sctx, struct si_pm4_state *state);

void si_pm4_clear_state(struct si_pm4_state *state);
void si_pm4_free_state(struct si_context *sctx, struct si_pm4_state *state, unsigned idx);

void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state);
void si_pm4_reset_emitted(struct si_context *sctx);
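A short end-to-end usage sketch of the si_pm4_state API declared above. The allocation via CALLOC_STRUCT, the register offsets, and the priority value are illustrative assumptions, not taken from a real call site:

/* Sketch of typical si_pm4_state usage with the API above. The specific
 * offsets, priority, and CALLOC_STRUCT allocation are placeholders. */
static struct si_pm4_state *example_build_state(struct si_resource *buf)
{
   struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state);
   if (!pm4)
      return NULL;

   /* Record a couple of register writes into the CPU-side pm4[] array. */
   si_pm4_set_reg(pm4, SI_SH_REG_OFFSET + 0x20, 0xdeadbeef);
   si_pm4_set_reg(pm4, SI_SH_REG_OFFSET + 0x24, 0xcafecafe);

   /* Keep the buffer these packets reference resident when the state is emitted. */
   si_pm4_add_bo(pm4, buf, RADEON_USAGE_READ, RADEON_PRIO_IB2);

   /* Later, si_pm4_emit(sctx, pm4) copies the dwords (or an indirect buffer
    * reference) into the current gfx command stream. */
   return pm4;
}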
File diff suppressed because it is too large
@@ -40,236 +40,220 @@ struct si_resource;

#define SI_MAX_STREAMS 4

enum
{
   SI_QUERY_DRAW_CALLS = PIPE_QUERY_DRIVER_SPECIFIC,
   SI_QUERY_DECOMPRESS_CALLS,
   SI_QUERY_MRT_DRAW_CALLS,
   SI_QUERY_PRIM_RESTART_CALLS,
   SI_QUERY_SPILL_DRAW_CALLS,
   SI_QUERY_COMPUTE_CALLS,
   SI_QUERY_SPILL_COMPUTE_CALLS,
   SI_QUERY_DMA_CALLS,
   SI_QUERY_CP_DMA_CALLS,
   SI_QUERY_NUM_VS_FLUSHES,
   SI_QUERY_NUM_PS_FLUSHES,
   SI_QUERY_NUM_CS_FLUSHES,
   SI_QUERY_NUM_CB_CACHE_FLUSHES,
   SI_QUERY_NUM_DB_CACHE_FLUSHES,
   SI_QUERY_NUM_L2_INVALIDATES,
   SI_QUERY_NUM_L2_WRITEBACKS,
   SI_QUERY_NUM_RESIDENT_HANDLES,
   SI_QUERY_TC_OFFLOADED_SLOTS,
   SI_QUERY_TC_DIRECT_SLOTS,
   SI_QUERY_TC_NUM_SYNCS,
   SI_QUERY_CS_THREAD_BUSY,
   SI_QUERY_GALLIUM_THREAD_BUSY,
   SI_QUERY_REQUESTED_VRAM,
   SI_QUERY_REQUESTED_GTT,
   SI_QUERY_MAPPED_VRAM,
   SI_QUERY_MAPPED_GTT,
   SI_QUERY_BUFFER_WAIT_TIME,
   SI_QUERY_NUM_MAPPED_BUFFERS,
   SI_QUERY_NUM_GFX_IBS,
   SI_QUERY_NUM_SDMA_IBS,
   SI_QUERY_GFX_BO_LIST_SIZE,
   SI_QUERY_GFX_IB_SIZE,
   SI_QUERY_NUM_BYTES_MOVED,
   SI_QUERY_NUM_EVICTIONS,
   SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS,
   SI_QUERY_VRAM_USAGE,
   SI_QUERY_VRAM_VIS_USAGE,
   SI_QUERY_GTT_USAGE,
   SI_QUERY_GPU_TEMPERATURE,
   SI_QUERY_CURRENT_GPU_SCLK,
   SI_QUERY_CURRENT_GPU_MCLK,
   SI_QUERY_GPU_LOAD,
   SI_QUERY_GPU_SHADERS_BUSY,
   SI_QUERY_GPU_TA_BUSY,
   SI_QUERY_GPU_GDS_BUSY,
   SI_QUERY_GPU_VGT_BUSY,
   SI_QUERY_GPU_IA_BUSY,
   SI_QUERY_GPU_SX_BUSY,
   SI_QUERY_GPU_WD_BUSY,
   SI_QUERY_GPU_BCI_BUSY,
   SI_QUERY_GPU_SC_BUSY,
   SI_QUERY_GPU_PA_BUSY,
   SI_QUERY_GPU_DB_BUSY,
   SI_QUERY_GPU_CP_BUSY,
   SI_QUERY_GPU_CB_BUSY,
   SI_QUERY_GPU_SDMA_BUSY,
   SI_QUERY_GPU_PFP_BUSY,
   SI_QUERY_GPU_MEQ_BUSY,
   SI_QUERY_GPU_ME_BUSY,
   SI_QUERY_GPU_SURF_SYNC_BUSY,
   SI_QUERY_GPU_CP_DMA_BUSY,
   SI_QUERY_GPU_SCRATCH_RAM_BUSY,
   SI_QUERY_NUM_COMPILATIONS,
   SI_QUERY_NUM_SHADERS_CREATED,
   SI_QUERY_BACK_BUFFER_PS_DRAW_RATIO,
   SI_QUERY_GPIN_ASIC_ID,
   SI_QUERY_GPIN_NUM_SIMD,
   SI_QUERY_GPIN_NUM_RB,
   SI_QUERY_GPIN_NUM_SPI,
   SI_QUERY_GPIN_NUM_SE,
   SI_QUERY_TIME_ELAPSED_SDMA,
   SI_QUERY_TIME_ELAPSED_SDMA_SI, /* emulated, measured on the CPU */
   SI_QUERY_PD_NUM_PRIMS_ACCEPTED,
   SI_QUERY_PD_NUM_PRIMS_REJECTED,
   SI_QUERY_PD_NUM_PRIMS_INELIGIBLE,
   SI_QUERY_LIVE_SHADER_CACHE_HITS,
   SI_QUERY_LIVE_SHADER_CACHE_MISSES,
   SI_QUERY_MEMORY_SHADER_CACHE_HITS,
   SI_QUERY_MEMORY_SHADER_CACHE_MISSES,
   SI_QUERY_DISK_SHADER_CACHE_HITS,
   SI_QUERY_DISK_SHADER_CACHE_MISSES,

   SI_QUERY_FIRST_PERFCOUNTER = PIPE_QUERY_DRIVER_SPECIFIC + 100,
};

enum
{
   SI_QUERY_GROUP_GPIN = 0,
   SI_NUM_SW_QUERY_GROUPS
};

struct si_query_ops {
   void (*destroy)(struct si_context *, struct si_query *);
   bool (*begin)(struct si_context *, struct si_query *);
   bool (*end)(struct si_context *, struct si_query *);
   bool (*get_result)(struct si_context *, struct si_query *, bool wait,
                      union pipe_query_result *result);
   void (*get_result_resource)(struct si_context *, struct si_query *, bool wait,
                               enum pipe_query_value_type result_type, int index,
                               struct pipe_resource *resource, unsigned offset);

   void (*suspend)(struct si_context *, struct si_query *);
   void (*resume)(struct si_context *, struct si_query *);
};

struct si_query {
   struct threaded_query b;
   const struct si_query_ops *ops;

   /* The PIPE_QUERY_xxx type of query */
   unsigned type;

   /* The number of dwords for suspend. */
   unsigned num_cs_dw_suspend;

   /* Linked list of queries that must be suspended at end of CS. */
   struct list_head active_list;
};

enum
{
   SI_QUERY_HW_FLAG_NO_START = (1 << 0),
   /* gap */
   /* whether begin_query doesn't clear the result */
   SI_QUERY_HW_FLAG_BEGIN_RESUMES = (1 << 2),
};

struct si_query_hw_ops {
   bool (*prepare_buffer)(struct si_context *, struct si_query_buffer *);
   void (*emit_start)(struct si_context *, struct si_query_hw *, struct si_resource *buffer,
                      uint64_t va);
   void (*emit_stop)(struct si_context *, struct si_query_hw *, struct si_resource *buffer,
                     uint64_t va);
   void (*clear_result)(struct si_query_hw *, union pipe_query_result *);
   void (*add_result)(struct si_screen *screen, struct si_query_hw *, void *buffer,
                      union pipe_query_result *result);
};

struct si_query_buffer {
   /* The buffer where query results are stored. */
   struct si_resource *buf;
   /* If a query buffer is full, a new buffer is created and the old one
    * is put in here. When we calculate the result, we sum up the samples
    * from all buffers. */
   struct si_query_buffer *previous;
   /* Offset of the next free result after current query data */
   unsigned results_end;
   bool unprepared;
};

void si_query_buffer_destroy(struct si_screen *sctx, struct si_query_buffer *buffer);
void si_query_buffer_reset(struct si_context *sctx, struct si_query_buffer *buffer);
bool si_query_buffer_alloc(struct si_context *sctx, struct si_query_buffer *buffer,
                           bool (*prepare_buffer)(struct si_context *, struct si_query_buffer *),
                           unsigned size);
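The previous pointer in struct si_query_buffer chains full buffers together, and result readback sums the samples from every buffer in the chain. A minimal sketch of that walk; the accumulate() callback is a stand-in for whatever consumes the mapped results, not the driver's add_result machinery:

/* Minimal sketch of walking a si_query_buffer chain, newest to oldest.
 * accumulate() is a placeholder; the real driver funnels each buffer's
 * contents through si_query_hw_ops::add_result. */
static void example_sum_query_buffers(struct si_query_buffer *qbuf,
                                      void (*accumulate)(struct si_resource *buf, unsigned size))
{
   for (; qbuf; qbuf = qbuf->previous) {
      /* results_end is how many bytes of results this buffer holds. */
      accumulate(qbuf->buf, qbuf->results_end);
   }
}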

struct si_query_hw {
   struct si_query b;
   struct si_query_hw_ops *ops;
   unsigned flags;

   /* The query buffer and how many results are in it. */
   struct si_query_buffer buffer;
   /* Size of the result in memory for both begin_query and end_query,
    * this can be one or two numbers, or it could even be a size of a structure. */
   unsigned result_size;
   /* For transform feedback: which stream the query is for */
   unsigned stream;

   /* Workaround via compute shader */
   struct si_resource *workaround_buf;
   unsigned workaround_offset;
};

void si_query_hw_destroy(struct si_context *sctx, struct si_query *squery);
bool si_query_hw_begin(struct si_context *sctx, struct si_query *squery);
bool si_query_hw_end(struct si_context *sctx, struct si_query *squery);
bool si_query_hw_get_result(struct si_context *sctx, struct si_query *squery, bool wait,
                            union pipe_query_result *result);
void si_query_hw_suspend(struct si_context *sctx, struct si_query *query);
void si_query_hw_resume(struct si_context *sctx, struct si_query *query);

/* Shader-based queries */
struct pipe_query *gfx10_sh_query_create(struct si_screen *screen, enum pipe_query_type query_type,
                                         unsigned index);

/* Performance counters */
struct si_perfcounters {
   unsigned num_groups;
   unsigned num_blocks;
   struct si_pc_block *blocks;

   unsigned num_stop_cs_dwords;
   unsigned num_instance_cs_dwords;

   bool separate_se;
   bool separate_instance;
};

struct pipe_query *si_create_batch_query(struct pipe_context *ctx, unsigned num_queries,
                                         unsigned *query_types);

int si_get_perfcounter_info(struct si_screen *, unsigned index,
                            struct pipe_driver_query_info *info);
int si_get_perfcounter_group_info(struct si_screen *, unsigned index,
                                  struct pipe_driver_query_group_info *info);

struct si_qbo_state {
   void *saved_compute;
   struct pipe_constant_buffer saved_const0;
   struct pipe_shader_buffer saved_ssbo[3];
   unsigned saved_ssbo_writable_mask;
};

#endif /* SI_QUERY_H */
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -25,8 +25,8 @@
#ifndef SI_SHADER_PRIVATE_H
#define SI_SHADER_PRIVATE_H

#include "ac_shader_abi.h"
#include "si_shader.h"

struct pipe_debug_callback;
@@ -38,275 +38,245 @@ struct pipe_debug_callback;
#define PS_EPILOG_SAMPLEMASK_MIN_LOC 14
|
||||
|
||||
struct si_shader_output_values {
|
||||
LLVMValueRef values[4];
|
||||
unsigned semantic_name;
|
||||
unsigned semantic_index;
|
||||
ubyte vertex_stream[4];
|
||||
LLVMValueRef values[4];
|
||||
unsigned semantic_name;
|
||||
unsigned semantic_index;
|
||||
ubyte vertex_stream[4];
|
||||
};
|
||||
|
||||
struct si_shader_context {
|
||||
struct ac_llvm_context ac;
|
||||
struct si_shader *shader;
|
||||
struct si_screen *screen;
|
||||
struct ac_llvm_context ac;
|
||||
struct si_shader *shader;
|
||||
struct si_screen *screen;
|
||||
|
||||
unsigned type; /* PIPE_SHADER_* specifies the type of shader. */
|
||||
unsigned type; /* PIPE_SHADER_* specifies the type of shader. */
|
||||
|
||||
/* For clamping the non-constant index in resource indexing: */
|
||||
unsigned num_const_buffers;
|
||||
unsigned num_shader_buffers;
|
||||
unsigned num_images;
|
||||
unsigned num_samplers;
|
||||
/* For clamping the non-constant index in resource indexing: */
|
||||
unsigned num_const_buffers;
|
||||
unsigned num_shader_buffers;
|
||||
unsigned num_images;
|
||||
unsigned num_samplers;
|
||||
|
||||
struct ac_shader_args args;
|
||||
struct ac_shader_abi abi;
|
||||
struct ac_shader_args args;
|
||||
struct ac_shader_abi abi;
|
||||
|
||||
LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS];
|
||||
LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS];
|
||||
|
||||
LLVMBasicBlockRef merged_wrap_if_entry_block;
|
||||
int merged_wrap_if_label;
|
||||
LLVMBasicBlockRef merged_wrap_if_entry_block;
|
||||
int merged_wrap_if_label;
|
||||
|
||||
LLVMValueRef main_fn;
|
||||
LLVMTypeRef return_type;
|
||||
LLVMValueRef main_fn;
|
||||
LLVMTypeRef return_type;
|
||||
|
||||
struct ac_arg const_and_shader_buffers;
|
||||
struct ac_arg samplers_and_images;
|
||||
struct ac_arg const_and_shader_buffers;
|
||||
struct ac_arg samplers_and_images;
|
||||
|
||||
/* For merged shaders, the per-stage descriptors for the stage other
|
||||
* than the one we're processing, used to pass them through from the
|
||||
* first stage to the second.
|
||||
*/
|
||||
struct ac_arg other_const_and_shader_buffers;
|
||||
struct ac_arg other_samplers_and_images;
|
||||
/* For merged shaders, the per-stage descriptors for the stage other
|
||||
* than the one we're processing, used to pass them through from the
|
||||
* first stage to the second.
|
||||
*/
|
||||
struct ac_arg other_const_and_shader_buffers;
|
||||
struct ac_arg other_samplers_and_images;
|
||||
|
||||
struct ac_arg rw_buffers;
|
||||
struct ac_arg bindless_samplers_and_images;
|
||||
/* Common inputs for merged shaders. */
|
||||
struct ac_arg merged_wave_info;
|
||||
struct ac_arg merged_scratch_offset;
|
||||
struct ac_arg small_prim_cull_info;
|
||||
/* API VS */
|
||||
struct ac_arg vertex_buffers;
|
||||
struct ac_arg vb_descriptors[5];
|
||||
struct ac_arg rel_auto_id;
|
||||
struct ac_arg vs_prim_id;
|
||||
struct ac_arg vertex_index0;
|
||||
/* VS states and layout of LS outputs / TCS inputs at the end
|
||||
* [0] = clamp vertex color
|
||||
* [1] = indexed
|
||||
* [2:3] = NGG: output primitive type
|
||||
* [4:5] = NGG: provoking vertex index
|
||||
* [6] = NGG: streamout queries enabled
|
||||
* [7:10] = NGG: small prim filter precision = num_samples / quant_mode,
|
||||
* but in reality it's: 1/2^n, from 1/16 to 1/4096 = 1/2^4 to 1/2^12
|
||||
* Only the first 4 bits of the exponent are stored.
|
||||
* Set it like this: (fui(num_samples / quant_mode) >> 23)
|
||||
* Expand to FP32 like this: ((0x70 | value) << 23);
|
||||
* With 0x70 = 112, we get 2^(112 + value - 127) = 2^(value - 15)
|
||||
* = 1/2^(15 - value) in FP32
|
||||
* [11:23] = stride between patches in DW = num_inputs * num_vertices * 4
|
||||
* max = 32*32*4 + 32*4
|
||||
* [24:31] = stride between vertices in DW = num_inputs * 4
|
||||
* max = 32*4
|
||||
*/
|
||||
struct ac_arg vs_state_bits;
|
||||
struct ac_arg vs_blit_inputs;
|
||||
struct ac_arg ngg_old_thread_id; /* generated by the NGG cull shader */
|
||||
/* HW VS */
|
||||
struct ac_arg streamout_config;
|
||||
struct ac_arg streamout_write_index;
|
||||
struct ac_arg streamout_offset[4];
|
||||
struct ac_arg rw_buffers;
|
||||
struct ac_arg bindless_samplers_and_images;
|
||||
/* Common inputs for merged shaders. */
|
||||
struct ac_arg merged_wave_info;
|
||||
struct ac_arg merged_scratch_offset;
|
||||
struct ac_arg small_prim_cull_info;
|
||||
/* API VS */
|
||||
struct ac_arg vertex_buffers;
|
||||
struct ac_arg vb_descriptors[5];
|
||||
struct ac_arg rel_auto_id;
|
||||
struct ac_arg vs_prim_id;
|
||||
struct ac_arg vertex_index0;
|
||||
/* VS states and layout of LS outputs / TCS inputs at the end
|
||||
* [0] = clamp vertex color
|
||||
* [1] = indexed
|
||||
* [2:3] = NGG: output primitive type
|
||||
* [4:5] = NGG: provoking vertex index
|
||||
* [6] = NGG: streamout queries enabled
|
||||
* [7:10] = NGG: small prim filter precision = num_samples / quant_mode,
|
||||
* but in reality it's: 1/2^n, from 1/16 to 1/4096 = 1/2^4 to 1/2^12
|
||||
* Only the first 4 bits of the exponent are stored.
|
||||
* Set it like this: (fui(num_samples / quant_mode) >> 23)
|
||||
* Expand to FP32 like this: ((0x70 | value) << 23);
|
||||
* With 0x70 = 112, we get 2^(112 + value - 127) = 2^(value - 15)
|
||||
* = 1/2^(15 - value) in FP32
|
||||
* [11:23] = stride between patches in DW = num_inputs * num_vertices * 4
|
||||
* max = 32*32*4 + 32*4
|
||||
* [24:31] = stride between vertices in DW = num_inputs * 4
|
||||
* max = 32*4
|
||||
*/
|
||||
struct ac_arg vs_state_bits;
|
||||
struct ac_arg vs_blit_inputs;
|
||||
struct ac_arg ngg_old_thread_id; /* generated by the NGG cull shader */
|
||||
/* HW VS */
|
||||
struct ac_arg streamout_config;
|
||||
struct ac_arg streamout_write_index;
|
||||
struct ac_arg streamout_offset[4];
|
||||
|
||||
/* API TCS & TES */
|
||||
/* Layout of TCS outputs in the offchip buffer
|
||||
* # 6 bits
|
||||
* [0:5] = the number of patches per threadgroup, max = NUM_PATCHES (40)
|
||||
* # 6 bits
|
||||
* [6:11] = the number of output vertices per patch, max = 32
|
||||
* # 20 bits
|
||||
* [12:31] = the offset of per patch attributes in the buffer in bytes.
|
||||
* max = NUM_PATCHES*32*32*16
|
||||
*/
|
||||
struct ac_arg tcs_offchip_layout;
|
||||
/* API TCS & TES */
|
||||
/* Layout of TCS outputs in the offchip buffer
|
||||
* # 6 bits
|
    * [0:5] = the number of patches per threadgroup, max = NUM_PATCHES (40)
    * # 6 bits
    * [6:11] = the number of output vertices per patch, max = 32
    * # 20 bits
    * [12:31] = the offset of per patch attributes in the buffer in bytes.
    *           max = NUM_PATCHES*32*32*16
    */
   struct ac_arg tcs_offchip_layout;

   /* API TCS */
   /* Offsets where TCS outputs and TCS patch outputs live in LDS:
    *   [0:15] = TCS output patch0 offset / 16, max = NUM_PATCHES * 32 * 32
    *   [16:31] = TCS output patch0 offset for per-patch / 16
    *             max = (NUM_PATCHES + 1) * 32*32
    */
   struct ac_arg tcs_out_lds_offsets;
   /* Layout of TCS outputs / TES inputs:
    *   [0:12] = stride between output patches in DW, num_outputs * num_vertices * 4
    *            max = 32*32*4 + 32*4
    *   [13:18] = gl_PatchVerticesIn, max = 32
    *   [19:31] = high 13 bits of the 32-bit address of tessellation ring buffers
    */
   struct ac_arg tcs_out_lds_layout;
   struct ac_arg tcs_offchip_offset;
   struct ac_arg tcs_factor_offset;

   /* API TES */
   struct ac_arg tes_offchip_addr;
   struct ac_arg tes_u;
   struct ac_arg tes_v;
   struct ac_arg tes_rel_patch_id;
   /* HW ES */
   struct ac_arg es2gs_offset;
   /* HW GS */
   /* On gfx10:
    *  - bits 0..11: ordered_wave_id
    *  - bits 12..20: number of vertices in group
    *  - bits 22..30: number of primitives in group
    */
   struct ac_arg gs_tg_info;
   /* API GS */
   struct ac_arg gs2vs_offset;
   struct ac_arg gs_wave_id;       /* GFX6 */
   struct ac_arg gs_vtx_offset[6]; /* in dwords (GFX6) */
   struct ac_arg gs_vtx01_offset;  /* in dwords (GFX9) */
   struct ac_arg gs_vtx23_offset;  /* in dwords (GFX9) */
   struct ac_arg gs_vtx45_offset;  /* in dwords (GFX9) */
   /* PS */
   struct ac_arg pos_fixed_pt;
   /* CS */
   struct ac_arg block_size;
   struct ac_arg cs_user_data;

   struct ac_llvm_compiler *compiler;

   /* Preloaded descriptors. */
   LLVMValueRef esgs_ring;
   LLVMValueRef gsvs_ring[4];
   LLVMValueRef tess_offchip_ring;

   LLVMValueRef invoc0_tess_factors[6]; /* outer[4], inner[2] */
   LLVMValueRef gs_next_vertex[4];
   LLVMValueRef gs_curprim_verts[4];
   LLVMValueRef gs_generated_prims[4];
   LLVMValueRef gs_ngg_emit;
   LLVMValueRef gs_ngg_scratch;
   LLVMValueRef postponed_kill;
   LLVMValueRef return_value;
};
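The bit-layout comments above pack three tessellation parameters into a single 32-bit SGPR. As a rough illustration only (the struct and helper below are made up for this note, not part of the diff; in the shader the same extraction is what si_unpack_param performs with a shift and a bit width), the fields would be unpacked like this:

#include <stdint.h>

struct tcs_offchip_layout_example {
   unsigned num_patches;      /* [0:5]                 */
   unsigned num_out_vertices; /* [6:11]                */
   unsigned per_patch_offset; /* [12:31], in bytes     */
};

struct tcs_offchip_layout_example unpack_tcs_offchip_layout(uint32_t v)
{
   struct tcs_offchip_layout_example l;
   l.num_patches = v & 0x3f;             /* 6 bits  */
   l.num_out_vertices = (v >> 6) & 0x3f; /* 6 bits  */
   l.per_patch_offset = v >> 12;         /* 20 bits */
   return l;
}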

static inline struct si_shader_context *si_shader_context_from_abi(struct ac_shader_abi *abi)
{
   struct si_shader_context *ctx = NULL;
   return container_of(abi, ctx, abi);
}
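si_shader_context_from_abi recovers the enclosing context from the embedded abi member via container_of. A minimal stand-alone sketch of the same idiom, with hypothetical struct names that are not part of this diff:

#include <stddef.h>

struct example_abi { int dummy; };
struct example_ctx {
   int other_state;
   struct example_abi abi; /* embedded member, like ac_shader_abi inside si_shader_context */
};

static inline struct example_ctx *example_ctx_from_abi(struct example_abi *abi)
{
   /* Subtract the member offset from the member pointer to get the parent object. */
   return (struct example_ctx *)((char *)abi - offsetof(struct example_ctx, abi));
}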

bool si_is_multi_part_shader(struct si_shader *shader);
bool si_is_merged_shader(struct si_shader *shader);
void si_add_arg_checked(struct ac_shader_args *args, enum ac_arg_regfile file, unsigned registers,
                        enum ac_arg_type type, struct ac_arg *arg, unsigned idx);
unsigned si_get_max_workgroup_size(const struct si_shader *shader);
bool si_need_ps_prolog(const union si_shader_part_key *key);
void si_get_ps_prolog_key(struct si_shader *shader, union si_shader_part_key *key,
                          bool separate_prolog);
void si_get_ps_epilog_key(struct si_shader *shader, union si_shader_part_key *key);
void si_fix_resource_usage(struct si_screen *sscreen, struct si_shader *shader);
void si_create_function(struct si_shader_context *ctx, bool ngg_cull_shader);

bool gfx10_ngg_export_prim_early(struct si_shader *shader);
void gfx10_ngg_build_sendmsg_gs_alloc_req(struct si_shader_context *ctx);
void gfx10_ngg_build_export_prim(struct si_shader_context *ctx, LLVMValueRef user_edgeflags[3],
                                 LLVMValueRef prim_passthrough);
void gfx10_emit_ngg_culling_epilogue_4x_wave32(struct ac_shader_abi *abi, unsigned max_outputs,
                                               LLVMValueRef *addrs);
void gfx10_emit_ngg_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs);
void gfx10_ngg_gs_emit_vertex(struct si_shader_context *ctx, unsigned stream, LLVMValueRef *addrs);
void gfx10_ngg_gs_emit_prologue(struct si_shader_context *ctx);
void gfx10_ngg_gs_emit_epilogue(struct si_shader_context *ctx);
void gfx10_ngg_calculate_subgroup_info(struct si_shader *shader);

/* si_shader_llvm.c */
bool si_compile_llvm(struct si_screen *sscreen, struct si_shader_binary *binary,
                     struct ac_shader_config *conf, struct ac_llvm_compiler *compiler,
                     struct ac_llvm_context *ac, struct pipe_debug_callback *debug,
                     enum pipe_shader_type shader_type, const char *name, bool less_optimized);
void si_llvm_context_init(struct si_shader_context *ctx, struct si_screen *sscreen,
                          struct ac_llvm_compiler *compiler, unsigned wave_size);
void si_llvm_create_func(struct si_shader_context *ctx, const char *name, LLVMTypeRef *return_types,
                         unsigned num_return_elems, unsigned max_workgroup_size);
void si_llvm_optimize_module(struct si_shader_context *ctx);
void si_llvm_dispose(struct si_shader_context *ctx);
LLVMValueRef si_buffer_load_const(struct si_shader_context *ctx, LLVMValueRef resource,
                                  LLVMValueRef offset);
void si_llvm_build_ret(struct si_shader_context *ctx, LLVMValueRef ret);
LLVMValueRef si_insert_input_ret(struct si_shader_context *ctx, LLVMValueRef ret,
                                 struct ac_arg param, unsigned return_index);
LLVMValueRef si_insert_input_ret_float(struct si_shader_context *ctx, LLVMValueRef ret,
                                       struct ac_arg param, unsigned return_index);
LLVMValueRef si_insert_input_ptr(struct si_shader_context *ctx, LLVMValueRef ret,
                                 struct ac_arg param, unsigned return_index);
LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx);
LLVMValueRef si_build_gather_64bit(struct si_shader_context *ctx, LLVMTypeRef type,
                                   LLVMValueRef val1, LLVMValueRef val2);
void si_llvm_emit_barrier(struct si_shader_context *ctx);
void si_llvm_declare_esgs_ring(struct si_shader_context *ctx);
void si_init_exec_from_input(struct si_shader_context *ctx, struct ac_arg param,
                             unsigned bitoffset);
LLVMValueRef si_unpack_param(struct si_shader_context *ctx, struct ac_arg param, unsigned rshift,
                             unsigned bitwidth);
LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx, unsigned swizzle);
LLVMValueRef si_llvm_get_block_size(struct ac_shader_abi *abi);
void si_llvm_declare_compute_memory(struct si_shader_context *ctx);
bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir);
void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *parts,
                               unsigned num_parts, unsigned main_part,
                               unsigned next_shader_first_part);

/* si_shader_llvm_gs.c */
LLVMValueRef si_is_es_thread(struct si_shader_context *ctx);
LLVMValueRef si_is_gs_thread(struct si_shader_context *ctx);
void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs);
void si_preload_esgs_ring(struct si_shader_context *ctx);
void si_preload_gs_rings(struct si_shader_context *ctx);
void si_llvm_build_gs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key);
void si_llvm_init_gs_callbacks(struct si_shader_context *ctx);

/* si_shader_llvm_tess.c */
void si_llvm_preload_tes_rings(struct si_shader_context *ctx);
void si_llvm_emit_ls_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs);
void si_llvm_build_tcs_epilog(struct si_shader_context *ctx, union si_shader_part_key *key);
void si_llvm_init_tcs_callbacks(struct si_shader_context *ctx);
void si_llvm_init_tes_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader);

/* si_shader_llvm_ps.c */
LLVMValueRef si_get_sample_id(struct si_shader_context *ctx);
void si_llvm_build_ps_prolog(struct si_shader_context *ctx, union si_shader_part_key *key);
void si_llvm_build_ps_epilog(struct si_shader_context *ctx, union si_shader_part_key *key);
void si_llvm_build_monolithic_ps(struct si_shader_context *ctx, struct si_shader *shader);
void si_llvm_init_ps_callbacks(struct si_shader_context *ctx);

/* si_shader_llvm_resources.c */

@@ -314,21 +284,16 @@ void si_llvm_init_resource_callbacks(struct si_shader_context *ctx);

/* si_shader_llvm_vs.c */
void si_llvm_load_vs_inputs(struct si_shader_context *ctx, struct nir_shader *nir);
void si_llvm_streamout_store_output(struct si_shader_context *ctx, LLVMValueRef const *so_buffers,
                                    LLVMValueRef const *so_write_offsets,
                                    struct pipe_stream_output *stream_out,
                                    struct si_shader_output_values *shader_out);
void si_llvm_emit_streamout(struct si_shader_context *ctx, struct si_shader_output_values *outputs,
                            unsigned noutput, unsigned stream);
void si_llvm_build_vs_exports(struct si_shader_context *ctx,
                              struct si_shader_output_values *outputs, unsigned noutput);
void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, unsigned max_outputs, LLVMValueRef *addrs);
void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part_key *key);
void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shader);

#endif
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -22,111 +22,98 @@
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_pipe.h"
#include "si_shader_internal.h"
#include "sid.h"

/**
 * Return a value that is equal to the given i32 \p index if it lies in [0,num)
 * or an undefined value in the same interval otherwise.
 */
static LLVMValueRef si_llvm_bound_index(struct si_shader_context *ctx, LLVMValueRef index,
                                        unsigned num)
{
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef c_max = LLVMConstInt(ctx->ac.i32, num - 1, 0);
   LLVMValueRef cc;

   if (util_is_power_of_two_or_zero(num)) {
      index = LLVMBuildAnd(builder, index, c_max, "");
   } else {
      /* In theory, this MAX pattern should result in code that is
       * as good as the bit-wise AND above.
       *
       * In practice, LLVM generates worse code (at the time of
       * writing), because its value tracking is not strong enough.
       */
      cc = LLVMBuildICmp(builder, LLVMIntULE, index, c_max, "");
      index = LLVMBuildSelect(builder, cc, index, c_max, "");
   }

   return index;
}
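A plain-C analogue of the bounding trick in si_llvm_bound_index may help: a power-of-two size allows a cheap bit mask, anything else falls back to a compare-and-select clamp. Illustrative only; the function below is not part of radeonsi:

#include <stdbool.h>
#include <stdint.h>

uint32_t bound_index_example(uint32_t index, uint32_t num)
{
   uint32_t max = num - 1;
   bool pow2 = (num & (num - 1)) == 0; /* C-level analogue of util_is_power_of_two_or_zero() */

   if (pow2)
      return index & max;          /* cheap bit-wise AND */
   /* compare-and-select, like the LLVMBuildICmp + LLVMBuildSelect pair above */
   return index <= max ? index : max;
}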

static LLVMValueRef load_const_buffer_desc_fast_path(struct si_shader_context *ctx)
{
   LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers);
   struct si_shader_selector *sel = ctx->shader->selector;

   /* Do the bounds checking with a descriptor, because
    * doing computation and manual bounds checking of 64-bit
    * addresses generates horrible VALU code with very high
    * VGPR usage and very low SIMD occupancy.
    */
   ptr = LLVMBuildPtrToInt(ctx->ac.builder, ptr, ctx->ac.intptr, "");

   LLVMValueRef desc0, desc1;
   desc0 = ptr;
   desc1 = LLVMConstInt(ctx->ac.i32, S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0);

   uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
                    S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);

   if (ctx->screen->info.chip_class >= GFX10)
      rsrc3 |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
               S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
   else
      rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
               S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);

   LLVMValueRef desc_elems[] = {desc0, desc1,
                                LLVMConstInt(ctx->ac.i32, sel->info.constbuf0_num_slots * 16, 0),
                                LLVMConstInt(ctx->ac.i32, rsrc3, false)};

   return ac_build_gather_values(&ctx->ac, desc_elems, 4);
}

static LLVMValueRef load_ubo(struct ac_shader_abi *abi, LLVMValueRef index)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   struct si_shader_selector *sel = ctx->shader->selector;

   LLVMValueRef ptr = ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers);

   if (sel->info.const_buffers_declared == 1 && sel->info.shader_buffers_declared == 0) {
      return load_const_buffer_desc_fast_path(ctx);
   }

   index = si_llvm_bound_index(ctx, index, ctx->num_const_buffers);
   index =
      LLVMBuildAdd(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i32, SI_NUM_SHADER_BUFFERS, 0), "");

   return ac_build_load_to_sgpr(&ctx->ac, ptr, index);
}

static LLVMValueRef load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   LLVMValueRef rsrc_ptr = ac_get_arg(&ctx->ac, ctx->const_and_shader_buffers);

   index = si_llvm_bound_index(ctx, index, ctx->num_shader_buffers);
   index = LLVMBuildSub(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, SI_NUM_SHADER_BUFFERS - 1, 0),
                        index, "");

   return ac_build_load_to_sgpr(&ctx->ac, rsrc_ptr, index);
}
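The index arithmetic in load_ubo and load_ssbo maps API slots into one combined descriptor list: constant buffers live after all shader buffers, and shader buffers are stored in reverse order. A sketch with a made-up slot count (the real SI_NUM_SHADER_BUFFERS is defined elsewhere in the driver and is not stated by this diff):

enum { EXAMPLE_NUM_SHADER_BUFFERS = 32 }; /* hypothetical value for illustration */

/* Constant buffers live after all shader buffers in the combined list. */
unsigned ubo_slot(unsigned index)
{
   return EXAMPLE_NUM_SHADER_BUFFERS + index;
}

/* Shader buffers are stored in reverse order at the start of the list. */
unsigned ssbo_slot(unsigned index)
{
   return EXAMPLE_NUM_SHADER_BUFFERS - 1 - index;
}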

/**
@@ -140,181 +127,167 @@ load_ssbo(struct ac_shader_abi *abi, LLVMValueRef index, bool write)
 * nicer: disabling DCC in the shader still leads to undefined results but
 * avoids the lockup.
 */
static LLVMValueRef force_dcc_off(struct si_shader_context *ctx, LLVMValueRef rsrc)
{
   if (ctx->screen->info.chip_class <= GFX7) {
      return rsrc;
   } else {
      LLVMValueRef i32_6 = LLVMConstInt(ctx->ac.i32, 6, 0);
      LLVMValueRef i32_C = LLVMConstInt(ctx->ac.i32, C_008F28_COMPRESSION_EN, 0);
      LLVMValueRef tmp;

      tmp = LLVMBuildExtractElement(ctx->ac.builder, rsrc, i32_6, "");
      tmp = LLVMBuildAnd(ctx->ac.builder, tmp, i32_C, "");
      return LLVMBuildInsertElement(ctx->ac.builder, rsrc, tmp, i32_6, "");
   }
}

/* AC_DESC_FMASK is handled exactly like AC_DESC_IMAGE. The caller should
 * adjust "index" to point to FMASK. */
static LLVMValueRef si_load_image_desc(struct si_shader_context *ctx, LLVMValueRef list,
                                       LLVMValueRef index, enum ac_descriptor_type desc_type,
                                       bool uses_store, bool bindless)
{
   LLVMBuilderRef builder = ctx->ac.builder;
   LLVMValueRef rsrc;

   if (desc_type == AC_DESC_BUFFER) {
      index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 2, 0), ctx->ac.i32_1);
      list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(ctx->ac.v4i32), "");
   } else {
      assert(desc_type == AC_DESC_IMAGE || desc_type == AC_DESC_FMASK);
   }

   if (bindless)
      rsrc = ac_build_load_to_sgpr_uint_wraparound(&ctx->ac, list, index);
   else
      rsrc = ac_build_load_to_sgpr(&ctx->ac, list, index);

   if (desc_type == AC_DESC_IMAGE && uses_store)
      rsrc = force_dcc_off(ctx, rsrc);
   return rsrc;
}

/**
 * Load an image view, fmask view. or sampler state descriptor.
 */
static LLVMValueRef si_load_sampler_desc(struct si_shader_context *ctx, LLVMValueRef list,
                                         LLVMValueRef index, enum ac_descriptor_type type)
{
   LLVMBuilderRef builder = ctx->ac.builder;

   switch (type) {
   case AC_DESC_IMAGE:
      /* The image is at [0:7]. */
      index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->ac.i32, 2, 0), "");
      break;
   case AC_DESC_BUFFER:
      /* The buffer is in [4:7]. */
      index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0), ctx->ac.i32_1);
      list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(ctx->ac.v4i32), "");
      break;
   case AC_DESC_FMASK:
      /* The FMASK is at [8:15]. */
      index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 2, 0), ctx->ac.i32_1);
      break;
   case AC_DESC_SAMPLER:
      /* The sampler state is at [12:15]. */
      index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0),
                            LLVMConstInt(ctx->ac.i32, 3, 0));
      list = LLVMBuildPointerCast(builder, list, ac_array_in_const32_addr_space(ctx->ac.v4i32), "");
      break;
   case AC_DESC_PLANE_0:
   case AC_DESC_PLANE_1:
   case AC_DESC_PLANE_2:
      /* Only used for the multiplane image support for Vulkan. Should
       * never be reached in radeonsi.
       */
      unreachable("Plane descriptor requested in radeonsi.");
   }

   return ac_build_load_to_sgpr(&ctx->ac, list, index);
}
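Assuming each combined image+sampler slot is 16 dwords, and that the uncast list holds 8-dword elements while the pointer-cast list holds 4-dword (v4i32) elements, an assumption consistent with the [0:7]/[4:7]/[8:15]/[12:15] comments above rather than something this diff states, the index arithmetic lands on the dword ranges printed by this illustrative program:

#include <stdio.h>

int main(void)
{
   for (unsigned i = 0; i < 2; i++) {
      printf("slot %u: image dwords %u..%u, buffer dwords %u..%u, "
             "fmask dwords %u..%u, sampler dwords %u..%u\n",
             i,
             (i * 2) * 8, (i * 2) * 8 + 7,         /* AC_DESC_IMAGE:   index*2   in 8-dword units */
             (i * 4 + 1) * 4, (i * 4 + 1) * 4 + 3, /* AC_DESC_BUFFER:  index*4+1 in 4-dword units */
             (i * 2 + 1) * 8, (i * 2 + 1) * 8 + 7, /* AC_DESC_FMASK:   index*2+1 in 8-dword units */
             (i * 4 + 3) * 4, (i * 4 + 3) * 4 + 3);/* AC_DESC_SAMPLER: index*4+3 in 4-dword units */
   }
   return 0;
}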

static LLVMValueRef si_nir_load_sampler_desc(struct ac_shader_abi *abi, unsigned descriptor_set,
                                             unsigned base_index, unsigned constant_index,
                                             LLVMValueRef dynamic_index,
                                             enum ac_descriptor_type desc_type, bool image,
                                             bool write, bool bindless)
{
   struct si_shader_context *ctx = si_shader_context_from_abi(abi);
   LLVMBuilderRef builder = ctx->ac.builder;
   unsigned const_index = base_index + constant_index;

   assert(!descriptor_set);
   assert(desc_type <= AC_DESC_BUFFER);

   if (bindless) {
      LLVMValueRef list = ac_get_arg(&ctx->ac, ctx->bindless_samplers_and_images);

      /* dynamic_index is the bindless handle */
      if (image) {
         /* Bindless image descriptors use 16-dword slots. */
         dynamic_index =
            LLVMBuildMul(ctx->ac.builder, dynamic_index, LLVMConstInt(ctx->ac.i64, 2, 0), "");
         /* FMASK is right after the image. */
         if (desc_type == AC_DESC_FMASK) {
            dynamic_index = LLVMBuildAdd(ctx->ac.builder, dynamic_index, ctx->ac.i32_1, "");
         }

         return si_load_image_desc(ctx, list, dynamic_index, desc_type, write, true);
      }

      /* Since bindless handle arithmetic can contain an unsigned integer
       * wraparound and si_load_sampler_desc assumes there isn't any,
       * use GEP without "inbounds" (inside ac_build_pointer_add)
       * to prevent incorrect code generation and hangs.
       */
      dynamic_index =
         LLVMBuildMul(ctx->ac.builder, dynamic_index, LLVMConstInt(ctx->ac.i64, 2, 0), "");
      list = ac_build_pointer_add(&ctx->ac, list, dynamic_index);
      return si_load_sampler_desc(ctx, list, ctx->ac.i32_0, desc_type);
   }

   unsigned num_slots = image ? ctx->num_images : ctx->num_samplers;
   assert(const_index < num_slots || dynamic_index);

   LLVMValueRef list = ac_get_arg(&ctx->ac, ctx->samplers_and_images);
   LLVMValueRef index = LLVMConstInt(ctx->ac.i32, const_index, false);

   if (dynamic_index) {
      index = LLVMBuildAdd(builder, index, dynamic_index, "");

      /* From the GL_ARB_shader_image_load_store extension spec:
       *
       *    If a shader performs an image load, store, or atomic
       *    operation using an image variable declared as an array,
       *    and if the index used to select an individual element is
       *    negative or greater than or equal to the size of the
       *    array, the results of the operation are undefined but may
       *    not lead to termination.
       */
      index = si_llvm_bound_index(ctx, index, num_slots);
   }

   if (image) {
      /* FMASKs are separate from images. */
      if (desc_type == AC_DESC_FMASK) {
         index =
            LLVMBuildAdd(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGES, 0), "");
      }
      index = LLVMBuildSub(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGE_SLOTS - 1, 0),
                           index, "");
      return si_load_image_desc(ctx, list, index, desc_type, write, false);
   }

   index = LLVMBuildAdd(ctx->ac.builder, index,
                        LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGE_SLOTS / 2, 0), "");
   return si_load_sampler_desc(ctx, list, index, desc_type);
}

void si_llvm_init_resource_callbacks(struct si_shader_context *ctx)
{
   ctx->abi.load_ubo = load_ubo;
   ctx->abi.load_ssbo = load_ssbo;
   ctx->abi.load_sampler_desc = si_nir_load_sampler_desc;
}
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -25,17 +25,16 @@
#include "si_build_pm4.h"

/* For MSAA sample positions. */
#define FILL_SREG(s0x, s0y, s1x, s1y, s2x, s2y, s3x, s3y) \
   ((((unsigned)(s0x)&0xf) << 0) | (((unsigned)(s0y)&0xf) << 4) | (((unsigned)(s1x)&0xf) << 8) | \
    (((unsigned)(s1y)&0xf) << 12) | (((unsigned)(s2x)&0xf) << 16) | \
    (((unsigned)(s2y)&0xf) << 20) | (((unsigned)(s3x)&0xf) << 24) | (((unsigned)(s3y)&0xf) << 28))

/* For obtaining location coordinates from registers */
#define SEXT4(x)               ((int)((x) | ((x)&0x8 ? 0xfffffff0 : 0)))
#define GET_SFIELD(reg, index) SEXT4(((reg) >> ((index)*4)) & 0xf)
#define GET_SX(reg, index)     GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2)
#define GET_SY(reg, index)     GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2 + 1)

/* The following sample ordering is required by EQAA.
 *
@@ -88,132 +87,128 @@

/* 1x MSAA */
static const uint32_t sample_locs_1x =
   FILL_SREG(0, 0, 0, 0, 0, 0, 0, 0); /* S1, S2, S3 fields are not used by 1x */
static const uint64_t centroid_priority_1x = 0x0000000000000000ull;

/* 2x MSAA (the positions are sorted for EQAA) */
static const uint32_t sample_locs_2x =
   FILL_SREG(-4, -4, 4, 4, 0, 0, 0, 0); /* S2 & S3 fields are not used by 2x MSAA */
static const uint64_t centroid_priority_2x = 0x1010101010101010ull;

/* 4x MSAA (the positions are sorted for EQAA) */
static const uint32_t sample_locs_4x = FILL_SREG(-2, -6, 2, 6, -6, 2, 6, -2);
static const uint64_t centroid_priority_4x = 0x3210321032103210ull;

/* 8x MSAA (the positions are sorted for EQAA) */
static const uint32_t sample_locs_8x[] = {
   FILL_SREG(-3, -5, 5, 1, -1, 3, 7, -7),
   FILL_SREG(-7, -1, 3, 7, -5, 5, 1, -3),
   /* The following are unused by hardware, but we emit them to IBs
    * instead of multiple SET_CONTEXT_REG packets. */
   0,
   0,
};
static const uint64_t centroid_priority_8x = 0x3546012735460127ull;

/* 16x MSAA (the positions are sorted for EQAA) */
static const uint32_t sample_locs_16x[] = {
   FILL_SREG(-5, -2, 5, 3, -2, 6, 3, -5),
   FILL_SREG(-4, -6, 1, 1, -6, 4, 7, -4),
   FILL_SREG(-1, -3, 6, 7, -3, 2, 0, -7),
   FILL_SREG(-7, -8, 2, 5, -8, 0, 4, -1),
};
static const uint64_t centroid_priority_16x = 0xc97e64b231d0fa85ull;

static void si_get_sample_position(struct pipe_context *ctx, unsigned sample_count,
                                   unsigned sample_index, float *out_value)
{
   const uint32_t *sample_locs;

   switch (sample_count) {
   case 1:
   default:
      sample_locs = &sample_locs_1x;
      break;
   case 2:
      sample_locs = &sample_locs_2x;
      break;
   case 4:
      sample_locs = &sample_locs_4x;
      break;
   case 8:
      sample_locs = sample_locs_8x;
      break;
   case 16:
      sample_locs = sample_locs_16x;
      break;
   }

   out_value[0] = (GET_SX(sample_locs, sample_index) + 8) / 16.0f;
   out_value[1] = (GET_SY(sample_locs, sample_index) + 8) / 16.0f;
}
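A worked example of the decode in si_get_sample_position: for 2x MSAA, FILL_SREG(-4, -4, 4, 4, ...) stores each coordinate as a signed 4-bit value in 1/16-pixel units, so the two samples come out at (0.25, 0.25) and (0.75, 0.75). The stand-alone program below reproduces that arithmetic; it is illustrative only and shares nothing with the driver beyond the two macros:

#include <stdio.h>

#define SEXT4(x)               ((int)((x) | ((x)&0x8 ? 0xfffffff0 : 0)))
#define GET_SFIELD(reg, index) SEXT4(((reg) >> ((index)*4)) & 0xf)

int main(void)
{
   /* Same packing as FILL_SREG(-4, -4, 4, 4, 0, 0, 0, 0). */
   unsigned locs_2x = (((unsigned)-4 & 0xf) << 0) | (((unsigned)-4 & 0xf) << 4) |
                      (((unsigned)4 & 0xf) << 8) | (((unsigned)4 & 0xf) << 12);

   for (int s = 0; s < 2; s++) {
      float x = (GET_SFIELD(locs_2x, s * 2) + 8) / 16.0f;
      float y = (GET_SFIELD(locs_2x, s * 2 + 1) + 8) / 16.0f;
      printf("sample %d: (%.2f, %.2f)\n", s, x, y); /* prints (0.25, 0.25) then (0.75, 0.75) */
   }
   return 0;
}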

static void si_emit_max_4_sample_locs(struct radeon_cmdbuf *cs, uint64_t centroid_priority,
                                      uint32_t sample_locs)
{
   radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
   radeon_emit(cs, centroid_priority);
   radeon_emit(cs, centroid_priority >> 32);
   radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs);
   radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs);
   radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs);
   radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs);
}

static void si_emit_max_16_sample_locs(struct radeon_cmdbuf *cs, uint64_t centroid_priority,
                                       const uint32_t *sample_locs, unsigned num_samples)
{
   radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
   radeon_emit(cs, centroid_priority);
   radeon_emit(cs, centroid_priority >> 32);
   radeon_set_context_reg_seq(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0,
                              num_samples == 8 ? 14 : 16);
   radeon_emit_array(cs, sample_locs, 4);
   radeon_emit_array(cs, sample_locs, 4);
   radeon_emit_array(cs, sample_locs, 4);
   radeon_emit_array(cs, sample_locs, num_samples == 8 ? 2 : 4);
}

void si_emit_sample_locations(struct radeon_cmdbuf *cs, int nr_samples)
{
   switch (nr_samples) {
   default:
   case 1:
      si_emit_max_4_sample_locs(cs, centroid_priority_1x, sample_locs_1x);
      break;
   case 2:
      si_emit_max_4_sample_locs(cs, centroid_priority_2x, sample_locs_2x);
      break;
   case 4:
      si_emit_max_4_sample_locs(cs, centroid_priority_4x, sample_locs_4x);
      break;
   case 8:
      si_emit_max_16_sample_locs(cs, centroid_priority_8x, sample_locs_8x, 8);
      break;
   case 16:
      si_emit_max_16_sample_locs(cs, centroid_priority_16x, sample_locs_16x, 16);
      break;
   }
}

void si_init_msaa_functions(struct si_context *sctx)
{
   int i;

   sctx->b.get_sample_position = si_get_sample_position;

   si_get_sample_position(&sctx->b, 1, 0, sctx->sample_positions.x1[0]);

   for (i = 0; i < 2; i++)
      si_get_sample_position(&sctx->b, 2, i, sctx->sample_positions.x2[i]);
   for (i = 0; i < 4; i++)
      si_get_sample_position(&sctx->b, 4, i, sctx->sample_positions.x4[i]);
   for (i = 0; i < 8; i++)
      si_get_sample_position(&sctx->b, 8, i, sctx->sample_positions.x8[i]);
   for (i = 0; i < 16; i++)
      si_get_sample_position(&sctx->b, 16, i, sctx->sample_positions.x16[i]);
}
File diff suppressed because it is too large
@@ -23,395 +23,372 @@
 */

#include "si_build_pm4.h"

#include "util/u_memory.h"
#include "util/u_suballoc.h"

static void si_set_streamout_enable(struct si_context *sctx, bool enable);

static inline void si_so_target_reference(struct si_streamout_target **dst,
                                          struct pipe_stream_output_target *src)
{
   pipe_so_target_reference((struct pipe_stream_output_target **)dst, src);
}

static struct pipe_stream_output_target *si_create_so_target(struct pipe_context *ctx,
                                                              struct pipe_resource *buffer,
                                                              unsigned buffer_offset,
                                                              unsigned buffer_size)
{
   struct si_context *sctx = (struct si_context *)ctx;
   struct si_streamout_target *t;
   struct si_resource *buf = si_resource(buffer);

   t = CALLOC_STRUCT(si_streamout_target);
   if (!t) {
      return NULL;
   }

   unsigned buf_filled_size_size = sctx->screen->use_ngg_streamout ? 8 : 4;
   u_suballocator_alloc(sctx->allocator_zeroed_memory, buf_filled_size_size, 4,
                        &t->buf_filled_size_offset, (struct pipe_resource **)&t->buf_filled_size);
   if (!t->buf_filled_size) {
      FREE(t);
      return NULL;
   }

   t->b.reference.count = 1;
   t->b.context = ctx;
   pipe_resource_reference(&t->b.buffer, buffer);
   t->b.buffer_offset = buffer_offset;
   t->b.buffer_size = buffer_size;

   util_range_add(&buf->b.b, &buf->valid_buffer_range, buffer_offset, buffer_offset + buffer_size);
   return &t->b;
}

static void si_so_target_destroy(struct pipe_context *ctx, struct pipe_stream_output_target *target)
{
   struct si_streamout_target *t = (struct si_streamout_target *)target;
   pipe_resource_reference(&t->b.buffer, NULL);
   si_resource_reference(&t->buf_filled_size, NULL);
   FREE(t);
}

void si_streamout_buffers_dirty(struct si_context *sctx)
{
   if (!sctx->streamout.enabled_mask)
      return;

   si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_begin);
   si_set_streamout_enable(sctx, true);
}

static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targets,
                                     struct pipe_stream_output_target **targets,
                                     const unsigned *offsets)
{
   struct si_context *sctx = (struct si_context *)ctx;
   unsigned old_num_targets = sctx->streamout.num_targets;
   unsigned i;
   bool wait_now = false;

   /* We are going to unbind the buffers. Mark which caches need to be flushed. */
   if (sctx->streamout.num_targets && sctx->streamout.begin_emitted) {
      /* Since streamout uses vector writes which go through TC L2
       * and most other clients can use TC L2 as well, we don't need
       * to flush it.
       *
       * The only cases which requires flushing it is VGT DMA index
       * fetching (on <= GFX7) and indirect draw data, which are rare
       * cases. Thus, flag the TC L2 dirtiness in the resource and
       * handle it at draw call time.
       */
      for (i = 0; i < sctx->streamout.num_targets; i++)
         if (sctx->streamout.targets[i])
            si_resource(sctx->streamout.targets[i]->b.buffer)->TC_L2_dirty = true;

      /* Invalidate the scalar cache in case a streamout buffer is
       * going to be used as a constant buffer.
       *
       * Invalidate vL1, because streamout bypasses it (done by
       * setting GLC=1 in the store instruction), but vL1 in other
       * CUs can contain outdated data of streamout buffers.
       *
       * VS_PARTIAL_FLUSH is required if the buffers are going to be
       * used as an input immediately.
       */
      sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE;

      /* The BUFFER_FILLED_SIZE is written using a PS_DONE event. */
      if (sctx->screen->use_ngg_streamout) {
         sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;

         /* Wait now. This is needed to make sure that GDS is not
          * busy at the end of IBs.
          *
          * Also, the next streamout operation will overwrite GDS,
          * so we need to make sure that it's idle.
          */
         wait_now = true;
      } else {
         sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH;
      }
   }

   /* All readers of the streamout targets need to be finished before we can
    * start writing to the targets.
    */
   if (num_targets) {
      if (sctx->screen->use_ngg_streamout)
         si_allocate_gds(sctx);

      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
   }

   /* Streamout buffers must be bound in 2 places:
    * 1) in VGT by setting the VGT_STRMOUT registers
    * 2) as shader resources
    */

   /* Stop streamout. */
   if (sctx->streamout.num_targets && sctx->streamout.begin_emitted)
      si_emit_streamout_end(sctx);

   /* Set the new targets. */
   unsigned enabled_mask = 0, append_bitmask = 0;
   for (i = 0; i < num_targets; i++) {
      si_so_target_reference(&sctx->streamout.targets[i], targets[i]);
      if (!targets[i])
         continue;

      si_context_add_resource_size(sctx, targets[i]->buffer);
      enabled_mask |= 1 << i;

      if (offsets[i] == ((unsigned)-1))
         append_bitmask |= 1 << i;
   }

   for (; i < sctx->streamout.num_targets; i++)
      si_so_target_reference(&sctx->streamout.targets[i], NULL);

   sctx->streamout.enabled_mask = enabled_mask;
   sctx->streamout.num_targets = num_targets;
   sctx->streamout.append_bitmask = append_bitmask;

   /* Update dirty state bits. */
   if (num_targets) {
      si_streamout_buffers_dirty(sctx);
   } else {
      si_set_atom_dirty(sctx, &sctx->atoms.s.streamout_begin, false);
      si_set_streamout_enable(sctx, false);
   }

   /* Set the shader resources.*/
   for (i = 0; i < num_targets; i++) {
      if (targets[i]) {
         struct pipe_shader_buffer sbuf;
         sbuf.buffer = targets[i]->buffer;

         if (sctx->screen->use_ngg_streamout) {
            sbuf.buffer_offset = targets[i]->buffer_offset;
            sbuf.buffer_size = targets[i]->buffer_size;
         } else {
            sbuf.buffer_offset = 0;
            sbuf.buffer_size = targets[i]->buffer_offset + targets[i]->buffer_size;
         }

         si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, &sbuf);
         si_resource(targets[i]->buffer)->bind_history |= PIPE_BIND_STREAM_OUTPUT;
      } else {
         si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);
      }
   }
   for (; i < old_num_targets; i++)
      si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);

   if (wait_now)
      sctx->emit_cache_flush(sctx);
}
|
||||
|
||||
static void gfx10_emit_streamout_begin(struct si_context *sctx)
|
||||
{
|
||||
struct si_streamout_target **t = sctx->streamout.targets;
|
||||
struct radeon_cmdbuf *cs = sctx->gfx_cs;
|
||||
unsigned last_target = 0;
|
||||
struct si_streamout_target **t = sctx->streamout.targets;
|
||||
struct radeon_cmdbuf *cs = sctx->gfx_cs;
|
||||
unsigned last_target = 0;
|
||||
|
||||
for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
|
||||
if (t[i])
|
||||
last_target = i;
|
||||
}
|
||||
for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
|
||||
if (t[i])
|
||||
last_target = i;
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
|
||||
if (!t[i])
|
||||
continue;
|
||||
for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
|
||||
if (!t[i])
|
||||
continue;
|
||||
|
||||
t[i]->stride_in_dw = sctx->streamout.stride_in_dw[i];
|
||||
t[i]->stride_in_dw = sctx->streamout.stride_in_dw[i];
|
||||
|
||||
bool append = sctx->streamout.append_bitmask & (1 << i);
|
||||
uint64_t va = 0;
|
||||
bool append = sctx->streamout.append_bitmask & (1 << i);
|
||||
uint64_t va = 0;
|
||||
|
||||
if (append) {
|
||||
radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
|
||||
t[i]->buf_filled_size,
|
||||
RADEON_USAGE_READ,
|
||||
RADEON_PRIO_SO_FILLED_SIZE);
|
||||
if (append) {
|
||||
radeon_add_to_buffer_list(sctx, sctx->gfx_cs, t[i]->buf_filled_size, RADEON_USAGE_READ,
|
||||
RADEON_PRIO_SO_FILLED_SIZE);
|
||||
|
||||
va = t[i]->buf_filled_size->gpu_address +
|
||||
t[i]->buf_filled_size_offset;
|
||||
}
|
||||
va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
|
||||
}
|
||||
|
||||
radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
|
||||
radeon_emit(cs, S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) |
|
||||
S_411_DST_SEL(V_411_GDS) |
|
||||
S_411_CP_SYNC(i == last_target));
|
||||
radeon_emit(cs, va);
|
||||
radeon_emit(cs, va >> 32);
|
||||
radeon_emit(cs, 4 * i); /* destination in GDS */
|
||||
radeon_emit(cs, 0);
|
||||
radeon_emit(cs, S_414_BYTE_COUNT_GFX9(4) |
|
||||
S_414_DISABLE_WR_CONFIRM_GFX9(i != last_target));
|
||||
}
|
||||
radeon_emit(cs, PKT3(PKT3_DMA_DATA, 5, 0));
|
||||
radeon_emit(cs, S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) |
|
||||
S_411_DST_SEL(V_411_GDS) | S_411_CP_SYNC(i == last_target));
|
||||
radeon_emit(cs, va);
|
||||
radeon_emit(cs, va >> 32);
|
||||
radeon_emit(cs, 4 * i); /* destination in GDS */
|
||||
radeon_emit(cs, 0);
|
||||
radeon_emit(cs, S_414_BYTE_COUNT_GFX9(4) | S_414_DISABLE_WR_CONFIRM_GFX9(i != last_target));
|
||||
}
|
||||
|
||||
sctx->streamout.begin_emitted = true;
|
||||
sctx->streamout.begin_emitted = true;
|
||||
}
|
||||
|
||||
static void gfx10_emit_streamout_end(struct si_context *sctx)
|
||||
{
|
||||
struct si_streamout_target **t = sctx->streamout.targets;
|
||||
struct si_streamout_target **t = sctx->streamout.targets;
|
||||
|
||||
for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
|
||||
if (!t[i])
|
||||
continue;
|
||||
for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
|
||||
if (!t[i])
|
||||
continue;
|
||||
|
||||
uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
|
||||
uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
|
||||
|
||||
si_cp_release_mem(sctx, sctx->gfx_cs, V_028A90_PS_DONE, 0,
|
||||
EOP_DST_SEL_TC_L2,
|
||||
EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM,
|
||||
EOP_DATA_SEL_GDS,
|
||||
t[i]->buf_filled_size, va,
|
||||
EOP_DATA_GDS(i, 1), 0);
|
||||
si_cp_release_mem(sctx, sctx->gfx_cs, V_028A90_PS_DONE, 0, EOP_DST_SEL_TC_L2,
|
||||
EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_GDS,
|
||||
t[i]->buf_filled_size, va, EOP_DATA_GDS(i, 1), 0);
|
||||
|
||||
t[i]->buf_filled_size_valid = true;
|
||||
}
|
||||
t[i]->buf_filled_size_valid = true;
|
||||
}
|
||||
|
||||
sctx->streamout.begin_emitted = false;
|
||||
sctx->streamout.begin_emitted = false;
|
||||
}
|
||||
|
||||
static void si_flush_vgt_streamout(struct si_context *sctx)
|
||||
{
|
||||
struct radeon_cmdbuf *cs = sctx->gfx_cs;
|
||||
unsigned reg_strmout_cntl;
|
||||
struct radeon_cmdbuf *cs = sctx->gfx_cs;
|
||||
unsigned reg_strmout_cntl;
|
||||
|
||||
/* The register is at different places on different ASICs. */
|
||||
if (sctx->chip_class >= GFX7) {
|
||||
reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
|
||||
radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0);
|
||||
} else {
|
||||
reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
|
||||
radeon_set_config_reg(cs, reg_strmout_cntl, 0);
|
||||
}
|
||||
/* The register is at different places on different ASICs. */
|
||||
if (sctx->chip_class >= GFX7) {
|
||||
reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
|
||||
radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0);
|
||||
} else {
|
||||
reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
|
||||
radeon_set_config_reg(cs, reg_strmout_cntl, 0);
|
||||
}
|
||||
|
||||
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
|
||||
radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));
|
||||
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
|
||||
radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));
|
||||
|
||||
radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
|
||||
radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
|
||||
radeon_emit(cs, reg_strmout_cntl >> 2); /* register */
|
||||
radeon_emit(cs, 0);
|
||||
radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
|
||||
radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
|
||||
radeon_emit(cs, 4); /* poll interval */
|
||||
radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
|
||||
radeon_emit(cs,
|
||||
WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
|
||||
radeon_emit(cs, reg_strmout_cntl >> 2); /* register */
|
||||
radeon_emit(cs, 0);
|
||||
radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
|
||||
radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
|
||||
radeon_emit(cs, 4); /* poll interval */
|
||||
}
|
||||
|
||||
static void si_emit_streamout_begin(struct si_context *sctx)
|
||||
{
|
||||
struct radeon_cmdbuf *cs = sctx->gfx_cs;
|
||||
struct si_streamout_target **t = sctx->streamout.targets;
|
||||
uint16_t *stride_in_dw = sctx->streamout.stride_in_dw;
|
||||
unsigned i;
|
||||
struct radeon_cmdbuf *cs = sctx->gfx_cs;
|
||||
struct si_streamout_target **t = sctx->streamout.targets;
|
||||
uint16_t *stride_in_dw = sctx->streamout.stride_in_dw;
|
||||
unsigned i;
|
||||
|
||||
si_flush_vgt_streamout(sctx);
|
||||
si_flush_vgt_streamout(sctx);
|
||||
|
||||
for (i = 0; i < sctx->streamout.num_targets; i++) {
|
||||
if (!t[i])
|
||||
continue;
|
||||
for (i = 0; i < sctx->streamout.num_targets; i++) {
|
||||
if (!t[i])
|
||||
continue;
|
||||
|
||||
t[i]->stride_in_dw = stride_in_dw[i];
|
||||
t[i]->stride_in_dw = stride_in_dw[i];
|
||||
|
||||
/* AMD GCN binds streamout buffers as shader resources.
|
||||
* VGT only counts primitives and tells the shader
|
||||
* through SGPRs what to do. */
|
||||
radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 2);
|
||||
radeon_emit(cs, (t[i]->b.buffer_offset +
|
||||
t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */
|
||||
radeon_emit(cs, stride_in_dw[i]); /* VTX_STRIDE (in DW) */
|
||||
/* AMD GCN binds streamout buffers as shader resources.
|
||||
* VGT only counts primitives and tells the shader
|
||||
* through SGPRs what to do. */
|
||||
radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 2);
|
||||
radeon_emit(cs, (t[i]->b.buffer_offset + t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */
|
||||
radeon_emit(cs, stride_in_dw[i]); /* VTX_STRIDE (in DW) */
|
||||
|
||||
if (sctx->streamout.append_bitmask & (1 << i) && t[i]->buf_filled_size_valid) {
|
||||
uint64_t va = t[i]->buf_filled_size->gpu_address +
|
||||
t[i]->buf_filled_size_offset;
|
||||
if (sctx->streamout.append_bitmask & (1 << i) && t[i]->buf_filled_size_valid) {
|
||||
uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
|
||||
|
||||
/* Append. */
|
||||
radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
|
||||
radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
|
||||
STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
|
||||
radeon_emit(cs, 0); /* unused */
|
||||
radeon_emit(cs, 0); /* unused */
|
||||
radeon_emit(cs, va); /* src address lo */
|
||||
radeon_emit(cs, va >> 32); /* src address hi */
|
||||
/* Append. */
|
||||
radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
|
||||
radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
|
||||
STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
|
||||
radeon_emit(cs, 0); /* unused */
|
||||
radeon_emit(cs, 0); /* unused */
|
||||
radeon_emit(cs, va); /* src address lo */
|
||||
radeon_emit(cs, va >> 32); /* src address hi */
|
||||
|
||||
radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
|
||||
t[i]->buf_filled_size,
|
||||
RADEON_USAGE_READ,
|
||||
RADEON_PRIO_SO_FILLED_SIZE);
|
||||
} else {
|
||||
/* Start from the beginning. */
|
||||
radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
|
||||
radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
|
||||
STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
|
||||
radeon_emit(cs, 0); /* unused */
|
||||
radeon_emit(cs, 0); /* unused */
|
||||
radeon_emit(cs, t[i]->b.buffer_offset >> 2); /* buffer offset in DW */
|
||||
radeon_emit(cs, 0); /* unused */
|
||||
}
|
||||
}
|
||||
radeon_add_to_buffer_list(sctx, sctx->gfx_cs, t[i]->buf_filled_size, RADEON_USAGE_READ,
|
||||
RADEON_PRIO_SO_FILLED_SIZE);
|
||||
} else {
|
||||
/* Start from the beginning. */
|
||||
radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
|
||||
radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
|
||||
STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
|
||||
radeon_emit(cs, 0); /* unused */
|
||||
radeon_emit(cs, 0); /* unused */
|
||||
radeon_emit(cs, t[i]->b.buffer_offset >> 2); /* buffer offset in DW */
|
||||
radeon_emit(cs, 0); /* unused */
|
||||
}
|
||||
}
|
||||
|
||||
sctx->streamout.begin_emitted = true;
|
||||
sctx->streamout.begin_emitted = true;
|
||||
}
|
||||
|
||||
void si_emit_streamout_end(struct si_context *sctx)
|
||||
{
|
||||
if (sctx->screen->use_ngg_streamout) {
|
||||
gfx10_emit_streamout_end(sctx);
|
||||
return;
|
||||
}
|
||||
if (sctx->screen->use_ngg_streamout) {
|
||||
gfx10_emit_streamout_end(sctx);
|
||||
return;
|
||||
}
|
||||
|
||||
struct radeon_cmdbuf *cs = sctx->gfx_cs;
|
||||
struct si_streamout_target **t = sctx->streamout.targets;
|
||||
unsigned i;
|
||||
uint64_t va;
|
||||
struct radeon_cmdbuf *cs = sctx->gfx_cs;
|
||||
struct si_streamout_target **t = sctx->streamout.targets;
|
||||
unsigned i;
|
||||
uint64_t va;
|
||||
|
||||
si_flush_vgt_streamout(sctx);
|
||||
si_flush_vgt_streamout(sctx);
|
||||
|
||||
for (i = 0; i < sctx->streamout.num_targets; i++) {
|
||||
if (!t[i])
|
||||
continue;
|
||||
for (i = 0; i < sctx->streamout.num_targets; i++) {
|
||||
if (!t[i])
|
||||
continue;
|
||||
|
||||
va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
|
||||
radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
|
||||
radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
|
||||
STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
|
||||
STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
|
||||
radeon_emit(cs, va); /* dst address lo */
|
||||
radeon_emit(cs, va >> 32); /* dst address hi */
|
||||
radeon_emit(cs, 0); /* unused */
|
||||
radeon_emit(cs, 0); /* unused */
|
||||
va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
|
||||
radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
|
||||
radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
|
||||
STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
|
||||
radeon_emit(cs, va); /* dst address lo */
|
||||
radeon_emit(cs, va >> 32); /* dst address hi */
|
||||
radeon_emit(cs, 0); /* unused */
|
||||
radeon_emit(cs, 0); /* unused */
|
||||
|
||||
radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
|
||||
t[i]->buf_filled_size,
|
||||
RADEON_USAGE_WRITE,
|
||||
RADEON_PRIO_SO_FILLED_SIZE);
|
||||
radeon_add_to_buffer_list(sctx, sctx->gfx_cs, t[i]->buf_filled_size, RADEON_USAGE_WRITE,
|
||||
RADEON_PRIO_SO_FILLED_SIZE);
|
||||
|
||||
/* Zero the buffer size. The counters (primitives generated,
|
||||
* primitives emitted) may be enabled even if there is not
|
||||
* buffer bound. This ensures that the primitives-emitted query
|
||||
* won't increment. */
|
||||
radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 0);
|
||||
sctx->context_roll = true;
|
||||
/* Zero the buffer size. The counters (primitives generated,
|
||||
* primitives emitted) may be enabled even if there is not
|
||||
* buffer bound. This ensures that the primitives-emitted query
|
||||
* won't increment. */
|
||||
radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 0);
|
||||
sctx->context_roll = true;
|
||||
|
||||
t[i]->buf_filled_size_valid = true;
|
||||
}
|
||||
t[i]->buf_filled_size_valid = true;
|
||||
}
|
||||
|
||||
sctx->streamout.begin_emitted = false;
|
||||
sctx->streamout.begin_emitted = false;
|
||||
}
|
||||
|
||||
/* STREAMOUT CONFIG DERIVED STATE
|
||||
|
|
@ -423,71 +400,65 @@ void si_emit_streamout_end(struct si_context *sctx)
|
|||
|
||||
static void si_emit_streamout_enable(struct si_context *sctx)
|
||||
{
|
||||
assert(!sctx->screen->use_ngg_streamout);
|
||||
assert(!sctx->screen->use_ngg_streamout);
|
||||
|
||||
radeon_set_context_reg_seq(sctx->gfx_cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
|
||||
radeon_emit(sctx->gfx_cs,
|
||||
S_028B94_STREAMOUT_0_EN(si_get_strmout_en(sctx)) |
|
||||
S_028B94_RAST_STREAM(0) |
|
||||
S_028B94_STREAMOUT_1_EN(si_get_strmout_en(sctx)) |
|
||||
S_028B94_STREAMOUT_2_EN(si_get_strmout_en(sctx)) |
|
||||
S_028B94_STREAMOUT_3_EN(si_get_strmout_en(sctx)));
|
||||
radeon_emit(sctx->gfx_cs,
|
||||
sctx->streamout.hw_enabled_mask &
|
||||
sctx->streamout.enabled_stream_buffers_mask);
|
||||
radeon_set_context_reg_seq(sctx->gfx_cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
|
||||
radeon_emit(sctx->gfx_cs, S_028B94_STREAMOUT_0_EN(si_get_strmout_en(sctx)) |
|
||||
S_028B94_RAST_STREAM(0) |
|
||||
S_028B94_STREAMOUT_1_EN(si_get_strmout_en(sctx)) |
|
||||
S_028B94_STREAMOUT_2_EN(si_get_strmout_en(sctx)) |
|
||||
S_028B94_STREAMOUT_3_EN(si_get_strmout_en(sctx)));
|
||||
radeon_emit(sctx->gfx_cs,
|
||||
sctx->streamout.hw_enabled_mask & sctx->streamout.enabled_stream_buffers_mask);
|
||||
}
|
||||
|
||||
static void si_set_streamout_enable(struct si_context *sctx, bool enable)
|
||||
{
|
||||
bool old_strmout_en = si_get_strmout_en(sctx);
|
||||
unsigned old_hw_enabled_mask = sctx->streamout.hw_enabled_mask;
|
||||
bool old_strmout_en = si_get_strmout_en(sctx);
|
||||
unsigned old_hw_enabled_mask = sctx->streamout.hw_enabled_mask;
|
||||
|
||||
sctx->streamout.streamout_enabled = enable;
|
||||
sctx->streamout.streamout_enabled = enable;
|
||||
|
||||
sctx->streamout.hw_enabled_mask = sctx->streamout.enabled_mask |
|
||||
(sctx->streamout.enabled_mask << 4) |
|
||||
(sctx->streamout.enabled_mask << 8) |
|
||||
(sctx->streamout.enabled_mask << 12);
|
||||
sctx->streamout.hw_enabled_mask =
|
||||
sctx->streamout.enabled_mask | (sctx->streamout.enabled_mask << 4) |
|
||||
(sctx->streamout.enabled_mask << 8) | (sctx->streamout.enabled_mask << 12);
|
||||
|
||||
if (!sctx->screen->use_ngg_streamout &&
|
||||
((old_strmout_en != si_get_strmout_en(sctx)) ||
|
||||
(old_hw_enabled_mask != sctx->streamout.hw_enabled_mask)))
|
||||
si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
|
||||
if (!sctx->screen->use_ngg_streamout &&
|
||||
((old_strmout_en != si_get_strmout_en(sctx)) ||
|
||||
(old_hw_enabled_mask != sctx->streamout.hw_enabled_mask)))
|
||||
si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
|
||||
}
|
||||
|
||||
void si_update_prims_generated_query_state(struct si_context *sctx,
|
||||
unsigned type, int diff)
|
||||
void si_update_prims_generated_query_state(struct si_context *sctx, unsigned type, int diff)
|
||||
{
|
||||
if (!sctx->screen->use_ngg_streamout &&
|
||||
type == PIPE_QUERY_PRIMITIVES_GENERATED) {
|
||||
bool old_strmout_en = si_get_strmout_en(sctx);
|
||||
if (!sctx->screen->use_ngg_streamout && type == PIPE_QUERY_PRIMITIVES_GENERATED) {
|
||||
bool old_strmout_en = si_get_strmout_en(sctx);
|
||||
|
||||
sctx->streamout.num_prims_gen_queries += diff;
|
||||
assert(sctx->streamout.num_prims_gen_queries >= 0);
|
||||
sctx->streamout.num_prims_gen_queries += diff;
|
||||
assert(sctx->streamout.num_prims_gen_queries >= 0);
|
||||
|
||||
sctx->streamout.prims_gen_query_enabled =
|
||||
sctx->streamout.num_prims_gen_queries != 0;
|
||||
sctx->streamout.prims_gen_query_enabled = sctx->streamout.num_prims_gen_queries != 0;
|
||||
|
||||
if (old_strmout_en != si_get_strmout_en(sctx))
|
||||
si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
|
||||
if (old_strmout_en != si_get_strmout_en(sctx))
|
||||
si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
|
||||
|
||||
if (si_update_ngg(sctx)) {
|
||||
si_shader_change_notify(sctx);
|
||||
sctx->do_update_shaders = true;
|
||||
}
|
||||
}
|
||||
if (si_update_ngg(sctx)) {
|
||||
si_shader_change_notify(sctx);
|
||||
sctx->do_update_shaders = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void si_init_streamout_functions(struct si_context *sctx)
|
||||
{
|
||||
sctx->b.create_stream_output_target = si_create_so_target;
|
||||
sctx->b.stream_output_target_destroy = si_so_target_destroy;
|
||||
sctx->b.set_stream_output_targets = si_set_streamout_targets;
|
||||
sctx->b.create_stream_output_target = si_create_so_target;
|
||||
sctx->b.stream_output_target_destroy = si_so_target_destroy;
|
||||
sctx->b.set_stream_output_targets = si_set_streamout_targets;
|
||||
|
||||
if (sctx->screen->use_ngg_streamout) {
|
||||
sctx->atoms.s.streamout_begin.emit = gfx10_emit_streamout_begin;
|
||||
} else {
|
||||
sctx->atoms.s.streamout_begin.emit = si_emit_streamout_begin;
|
||||
sctx->atoms.s.streamout_enable.emit = si_emit_streamout_enable;
|
||||
}
|
||||
if (sctx->screen->use_ngg_streamout) {
|
||||
sctx->atoms.s.streamout_begin.emit = gfx10_emit_streamout_begin;
|
||||
} else {
|
||||
sctx->atoms.s.streamout_begin.emit = si_emit_streamout_begin;
|
||||
sctx->atoms.s.streamout_enable.emit = si_emit_streamout_enable;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff

@@ -26,8 +26,8 @@
/* This file implements randomized SDMA texture blit tests. */

#include "si_pipe.h"
#include "util/rand_xor.h"
#include "util/u_surface.h"

static uint64_t seed_xorshift128plus[2];

@@ -36,382 +36,356 @@ static uint64_t seed_xorshift128plus[2];

/* The GPU blits are emulated on the CPU using these CPU textures. */

struct cpu_texture {
   uint8_t *ptr;
   uint64_t size;
   uint64_t layer_stride;
   unsigned stride;
};

static void alloc_cpu_texture(struct cpu_texture *tex, struct pipe_resource *templ)
{
   tex->stride = align(util_format_get_stride(templ->format, templ->width0), RAND_NUM_SIZE);
   tex->layer_stride = (uint64_t)tex->stride * templ->height0;
   tex->size = tex->layer_stride * templ->array_size;
   tex->ptr = malloc(tex->size);
   assert(tex->ptr);
}

static void set_random_pixels(struct pipe_context *ctx, struct pipe_resource *tex,
                              struct cpu_texture *cpu)
{
   struct pipe_transfer *t;
   uint8_t *map;
   int x, y, z;

   map = pipe_transfer_map_3d(ctx, tex, 0, PIPE_TRANSFER_WRITE, 0, 0, 0, tex->width0, tex->height0,
                              tex->array_size, &t);
   assert(map);

   for (z = 0; z < tex->array_size; z++) {
      for (y = 0; y < tex->height0; y++) {
         uint64_t *ptr = (uint64_t *)(map + t->layer_stride * z + t->stride * y);
         uint64_t *ptr_cpu = (uint64_t *)(cpu->ptr + cpu->layer_stride * z + cpu->stride * y);
         unsigned size = cpu->stride / RAND_NUM_SIZE;

         assert(t->stride % RAND_NUM_SIZE == 0);
         assert(cpu->stride % RAND_NUM_SIZE == 0);

         for (x = 0; x < size; x++) {
            *ptr++ = *ptr_cpu++ = rand_xorshift128plus(seed_xorshift128plus);
         }
      }
   }

   pipe_transfer_unmap(ctx, t);
}

static bool compare_textures(struct pipe_context *ctx, struct pipe_resource *tex,
                             struct cpu_texture *cpu)
{
   struct pipe_transfer *t;
   uint8_t *map;
   int y, z;
   bool pass = true;
   unsigned stride = util_format_get_stride(tex->format, tex->width0);

   map = pipe_transfer_map_3d(ctx, tex, 0, PIPE_TRANSFER_READ, 0, 0, 0, tex->width0, tex->height0,
                              tex->array_size, &t);
   assert(map);

   for (z = 0; z < tex->array_size; z++) {
      for (y = 0; y < tex->height0; y++) {
         uint8_t *ptr = map + t->layer_stride * z + t->stride * y;
         uint8_t *cpu_ptr = cpu->ptr + cpu->layer_stride * z + cpu->stride * y;

         if (memcmp(ptr, cpu_ptr, stride)) {
            pass = false;
            goto done;
         }
      }
   }
done:
   pipe_transfer_unmap(ctx, t);
   return pass;
}

static enum pipe_format choose_format()
{
   enum pipe_format formats[] = {
      PIPE_FORMAT_R8_UINT,     PIPE_FORMAT_R16_UINT,          PIPE_FORMAT_R32_UINT,
      PIPE_FORMAT_R32G32_UINT, PIPE_FORMAT_R32G32B32A32_UINT, PIPE_FORMAT_G8R8_B8R8_UNORM,
   };
   return formats[rand() % ARRAY_SIZE(formats)];
}

static const char *array_mode_to_string(struct si_screen *sscreen, struct radeon_surf *surf)
{
   if (sscreen->info.chip_class >= GFX9) {
      switch (surf->u.gfx9.surf.swizzle_mode) {
      case 0:
         return " LINEAR";
      case 21:
         return " 4KB_S_X";
      case 22:
         return " 4KB_D_X";
      case 25:
         return "64KB_S_X";
      case 26:
         return "64KB_D_X";
      default:
         printf("Unhandled swizzle mode = %u\n", surf->u.gfx9.surf.swizzle_mode);
         return " UNKNOWN";
      }
   } else {
      switch (surf->u.legacy.level[0].mode) {
      case RADEON_SURF_MODE_LINEAR_ALIGNED:
         return "LINEAR_ALIGNED";
      case RADEON_SURF_MODE_1D:
         return "1D_TILED_THIN1";
      case RADEON_SURF_MODE_2D:
         return "2D_TILED_THIN1";
      default:
         assert(0);
         return " UNKNOWN";
      }
   }
}

static unsigned generate_max_tex_side(unsigned max_tex_side)
{
   switch (rand() % 4) {
   case 0:
      /* Try to hit large sizes in 1/4 of the cases. */
      return max_tex_side;
   case 1:
      /* Try to hit 1D tiling in 1/4 of the cases. */
      return 128;
   default:
      /* Try to hit common sizes in 2/4 of the cases. */
      return 2048;
   }
}

void si_test_dma(struct si_screen *sscreen)
{
   struct pipe_screen *screen = &sscreen->b;
   struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
   struct si_context *sctx = (struct si_context *)ctx;
   uint64_t max_alloc_size;
   unsigned i, iterations, num_partial_copies, max_tex_side;
   unsigned num_pass = 0, num_fail = 0;

   max_tex_side = screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_2D_SIZE);

   /* Max 128 MB allowed for both textures. */
   max_alloc_size = 128 * 1024 * 1024;

   /* the seed for random test parameters */
   srand(0x9b47d95b);
   /* the seed for random pixel data */
   s_rand_xorshift128plus(seed_xorshift128plus, false);

   iterations = 1000000000; /* just kill it when you are bored */
   num_partial_copies = 30;

   /* These parameters are randomly generated per test:
    * - whether to do one whole-surface copy or N partial copies per test
    * - which tiling modes to use (LINEAR_ALIGNED, 1D, 2D)
    * - which texture dimensions to use
    * - whether to use VRAM (all tiling modes) and GTT (staging, linear
    *   only) allocations
    * - random initial pixels in src
    * - generate random subrectangle copies for partial blits
    */
   for (i = 0; i < iterations; i++) {
      struct pipe_resource tsrc = {}, tdst = {}, *src, *dst;
      struct si_texture *sdst;
      struct si_texture *ssrc;
      struct cpu_texture src_cpu, dst_cpu;
      unsigned max_width, max_height, max_depth, j, num;
      unsigned gfx_blits = 0, dma_blits = 0, cs_blits = 0, max_tex_side_gen;
      unsigned max_tex_layers;
      bool pass;
      bool do_partial_copies = rand() & 1;

      /* generate a random test case */
      tsrc.target = tdst.target = PIPE_TEXTURE_2D_ARRAY;
      tsrc.depth0 = tdst.depth0 = 1;

      tsrc.format = tdst.format = choose_format();

      max_tex_side_gen = generate_max_tex_side(max_tex_side);
      max_tex_layers = rand() % 4 ? 1 : 5;

      tsrc.width0 = (rand() % max_tex_side_gen) + 1;
      tsrc.height0 = (rand() % max_tex_side_gen) + 1;
      tsrc.array_size = (rand() % max_tex_layers) + 1;

      if (tsrc.format == PIPE_FORMAT_G8R8_B8R8_UNORM)
         tsrc.width0 = align(tsrc.width0, 2);

      /* Have a 1/4 chance of getting power-of-two dimensions. */
      if (rand() % 4 == 0) {
         tsrc.width0 = util_next_power_of_two(tsrc.width0);
         tsrc.height0 = util_next_power_of_two(tsrc.height0);
      }

      if (!do_partial_copies) {
         /* whole-surface copies only, same dimensions */
         tdst = tsrc;
      } else {
         max_tex_side_gen = generate_max_tex_side(max_tex_side);
         max_tex_layers = rand() % 4 ? 1 : 5;

         /* many partial copies, dimensions can be different */
         tdst.width0 = (rand() % max_tex_side_gen) + 1;
         tdst.height0 = (rand() % max_tex_side_gen) + 1;
         tdst.array_size = (rand() % max_tex_layers) + 1;

         /* Have a 1/4 chance of getting power-of-two dimensions. */
         if (rand() % 4 == 0) {
            tdst.width0 = util_next_power_of_two(tdst.width0);
            tdst.height0 = util_next_power_of_two(tdst.height0);
         }
      }

      /* check texture sizes */
      if ((uint64_t)util_format_get_nblocks(tsrc.format, tsrc.width0, tsrc.height0) *
             tsrc.array_size * util_format_get_blocksize(tsrc.format) +
          (uint64_t)util_format_get_nblocks(tdst.format, tdst.width0, tdst.height0) *
             tdst.array_size * util_format_get_blocksize(tdst.format) >
          max_alloc_size) {
         /* too large, try again */
         i--;
         continue;
      }

      /* VRAM + the tiling mode depends on dimensions (3/4 of cases),
       * or GTT + linear only (1/4 of cases)
       */
      tsrc.usage = rand() % 4 ? PIPE_USAGE_DEFAULT : PIPE_USAGE_STAGING;
      tdst.usage = rand() % 4 ? PIPE_USAGE_DEFAULT : PIPE_USAGE_STAGING;

      /* Allocate textures (both the GPU and CPU copies).
       * The CPU will emulate what the GPU should be doing.
       */
      src = screen->resource_create(screen, &tsrc);
      dst = screen->resource_create(screen, &tdst);
      assert(src);
      assert(dst);
      sdst = (struct si_texture *)dst;
      ssrc = (struct si_texture *)src;
      alloc_cpu_texture(&src_cpu, &tsrc);
      alloc_cpu_texture(&dst_cpu, &tdst);

      printf("%4u: dst = (%5u x %5u x %u, %s), "
             " src = (%5u x %5u x %u, %s), format = %s, ",
             i, tdst.width0, tdst.height0, tdst.array_size,
             array_mode_to_string(sscreen, &sdst->surface), tsrc.width0, tsrc.height0,
             tsrc.array_size, array_mode_to_string(sscreen, &ssrc->surface),
             util_format_description(tsrc.format)->name);
      fflush(stdout);

      /* set src pixels */
      set_random_pixels(ctx, src, &src_cpu);

      /* clear dst pixels */
      uint32_t zero = 0;
      si_clear_buffer(sctx, dst, 0, sdst->surface.surf_size, &zero, 4, SI_COHERENCY_SHADER, false);
      memset(dst_cpu.ptr, 0, dst_cpu.layer_stride * tdst.array_size);

      /* preparation */
      max_width = MIN2(tsrc.width0, tdst.width0);
      max_height = MIN2(tsrc.height0, tdst.height0);
      max_depth = MIN2(tsrc.array_size, tdst.array_size);

      num = do_partial_copies ? num_partial_copies : 1;
      for (j = 0; j < num; j++) {
         int width, height, depth;
         int srcx, srcy, srcz, dstx, dsty, dstz;
         struct pipe_box box;
         unsigned old_num_draw_calls = sctx->num_draw_calls;
         unsigned old_num_dma_calls = sctx->num_dma_calls;
         unsigned old_num_cs_calls = sctx->num_compute_calls;

         if (!do_partial_copies) {
            /* copy whole src to dst */
            width = max_width;
            height = max_height;
            depth = max_depth;

            srcx = srcy = srcz = dstx = dsty = dstz = 0;
         } else {
            /* random sub-rectangle copies from src to dst */
            depth = (rand() % max_depth) + 1;
            srcz = rand() % (tsrc.array_size - depth + 1);
            dstz = rand() % (tdst.array_size - depth + 1);

            /* special code path to hit the tiled partial copies */
            if (!ssrc->surface.is_linear && !sdst->surface.is_linear && rand() & 1) {
               if (max_width < 8 || max_height < 8)
                  continue;
               width = ((rand() % (max_width / 8)) + 1) * 8;
               height = ((rand() % (max_height / 8)) + 1) * 8;

               srcx = rand() % (tsrc.width0 - width + 1) & ~0x7;
               srcy = rand() % (tsrc.height0 - height + 1) & ~0x7;

               dstx = rand() % (tdst.width0 - width + 1) & ~0x7;
               dsty = rand() % (tdst.height0 - height + 1) & ~0x7;
            } else {
               /* just make sure that it doesn't divide by zero */
               assert(max_width > 0 && max_height > 0);

               width = (rand() % max_width) + 1;
               height = (rand() % max_height) + 1;

               srcx = rand() % (tsrc.width0 - width + 1);
               srcy = rand() % (tsrc.height0 - height + 1);

               dstx = rand() % (tdst.width0 - width + 1);
               dsty = rand() % (tdst.height0 - height + 1);
            }

            /* special code path to hit out-of-bounds reads in L2T */
            if (ssrc->surface.is_linear && !sdst->surface.is_linear && rand() % 4 == 0) {
               srcx = 0;
               srcy = 0;
               srcz = 0;
            }
         }

         /* GPU copy */
         u_box_3d(srcx, srcy, srcz, width, height, depth, &box);
         sctx->dma_copy(ctx, dst, 0, dstx, dsty, dstz, src, 0, &box);

         /* See which engine was used. */
         gfx_blits += sctx->num_draw_calls > old_num_draw_calls;
         dma_blits += sctx->num_dma_calls > old_num_dma_calls;
         cs_blits += sctx->num_compute_calls > old_num_cs_calls;

         /* CPU copy */
         util_copy_box(dst_cpu.ptr, tdst.format, dst_cpu.stride, dst_cpu.layer_stride, dstx, dsty,
                       dstz, width, height, depth, src_cpu.ptr, src_cpu.stride,
                       src_cpu.layer_stride, srcx, srcy, srcz);
      }

      pass = compare_textures(ctx, dst, &dst_cpu);
      if (pass)
         num_pass++;
      else
         num_fail++;

      printf("BLITs: GFX = %2u, DMA = %2u, CS = %2u, %s [%u/%u]\n", gfx_blits, dma_blits, cs_blits,
             pass ? "pass" : "fail", num_pass, num_pass + num_fail);

      /* cleanup */
      pipe_resource_reference(&src, NULL);
      pipe_resource_reference(&dst, NULL);
      free(src_cpu.ptr);
      free(dst_cpu.ptr);
   }

   ctx->destroy(ctx);
   exit(0);
}
@@ -28,451 +28,444 @@
#include "si_pipe.h"
|
||||
#include "si_query.h"
|
||||
|
||||
#define MIN_SIZE 512
|
||||
#define MAX_SIZE (128 * 1024 * 1024)
|
||||
#define SIZE_SHIFT 1
|
||||
#define NUM_RUNS 128
|
||||
#define MIN_SIZE 512
|
||||
#define MAX_SIZE (128 * 1024 * 1024)
|
||||
#define SIZE_SHIFT 1
|
||||
#define NUM_RUNS 128
|
||||
|
||||
static double get_MBps_rate(unsigned num_bytes, unsigned ns)
|
||||
{
|
||||
return (num_bytes / (1024.0 * 1024.0)) / (ns / 1000000000.0);
|
||||
return (num_bytes / (1024.0 * 1024.0)) / (ns / 1000000000.0);
|
||||
}
|
||||
|
||||
void si_test_dma_perf(struct si_screen *sscreen)
|
||||
{
|
||||
struct pipe_screen *screen = &sscreen->b;
|
||||
struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
|
||||
struct si_context *sctx = (struct si_context*)ctx;
|
||||
const uint32_t clear_value = 0x12345678;
|
||||
static const unsigned cs_dwords_per_thread_list[] = {64, 32, 16, 8, 4, 2, 1};
|
||||
static const unsigned cs_waves_per_sh_list[] = {1, 2, 4, 8, 16, 0};
|
||||
struct pipe_screen *screen = &sscreen->b;
|
||||
struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
|
||||
struct si_context *sctx = (struct si_context *)ctx;
|
||||
const uint32_t clear_value = 0x12345678;
|
||||
static const unsigned cs_dwords_per_thread_list[] = {64, 32, 16, 8, 4, 2, 1};
|
||||
static const unsigned cs_waves_per_sh_list[] = {1, 2, 4, 8, 16, 0};
|
||||
|
||||
#define NUM_SHADERS ARRAY_SIZE(cs_dwords_per_thread_list)
|
||||
#define NUM_METHODS (4 + 2*NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list))
|
||||
#define NUM_METHODS (4 + 2 * NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list))
|
||||
|
||||
static const char *method_str[] = {
|
||||
"CP MC ",
|
||||
"CP L2 ",
|
||||
"CP L2 ",
|
||||
"SDMA ",
|
||||
};
|
||||
static const char *placement_str[] = {
|
||||
/* Clear */
|
||||
"fill->VRAM",
|
||||
"fill->GTT ",
|
||||
/* Copy */
|
||||
"VRAM->VRAM",
|
||||
"VRAM->GTT ",
|
||||
"GTT ->VRAM",
|
||||
};
|
||||
static const char *method_str[] = {
|
||||
"CP MC ",
|
||||
"CP L2 ",
|
||||
"CP L2 ",
|
||||
"SDMA ",
|
||||
};
|
||||
static const char *placement_str[] = {
|
||||
/* Clear */
|
||||
"fill->VRAM",
|
||||
"fill->GTT ",
|
||||
/* Copy */
|
||||
"VRAM->VRAM",
|
||||
"VRAM->GTT ",
|
||||
"GTT ->VRAM",
|
||||
};
|
||||
|
||||
printf("DMA rate is in MB/s for each size. Slow cases are skipped and print 0.\n");
|
||||
printf("Heap ,Method ,L2p,Wa,");
|
||||
for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
|
||||
if (size >= 1024)
|
||||
printf("%6uKB,", size / 1024);
|
||||
else
|
||||
printf(" %6uB,", size);
|
||||
}
|
||||
printf("\n");
|
||||
printf("DMA rate is in MB/s for each size. Slow cases are skipped and print 0.\n");
|
||||
printf("Heap ,Method ,L2p,Wa,");
|
||||
for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
|
||||
if (size >= 1024)
|
||||
printf("%6uKB,", size / 1024);
|
||||
else
|
||||
printf(" %6uB,", size);
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
/* results[log2(size)][placement][method][] */
|
||||
struct si_result {
|
||||
bool is_valid;
|
||||
bool is_cp;
|
||||
bool is_sdma;
|
||||
bool is_cs;
|
||||
unsigned cache_policy;
|
||||
unsigned dwords_per_thread;
|
||||
unsigned waves_per_sh;
|
||||
unsigned score;
|
||||
unsigned index; /* index in results[x][y][index] */
|
||||
} results[32][ARRAY_SIZE(placement_str)][NUM_METHODS] = {};
|
||||
/* results[log2(size)][placement][method][] */
|
||||
struct si_result {
|
||||
bool is_valid;
|
||||
bool is_cp;
|
||||
bool is_sdma;
|
||||
bool is_cs;
|
||||
unsigned cache_policy;
|
||||
unsigned dwords_per_thread;
|
||||
unsigned waves_per_sh;
|
||||
unsigned score;
|
||||
unsigned index; /* index in results[x][y][index] */
|
||||
} results[32][ARRAY_SIZE(placement_str)][NUM_METHODS] = {};
|
||||
|
||||
/* Run benchmarks. */
|
||||
for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
|
||||
bool is_copy = placement >= 2;
|
||||
/* Run benchmarks. */
|
||||
for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
|
||||
bool is_copy = placement >= 2;
|
||||
|
||||
printf("-----------,--------,---,--,");
|
||||
for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT)
|
||||
printf("--------,");
|
||||
printf("\n");
|
||||
printf("-----------,--------,---,--,");
|
||||
for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT)
|
||||
printf("--------,");
|
||||
printf("\n");
|
||||
|
||||
for (unsigned method = 0; method < NUM_METHODS; method++) {
|
||||
bool test_cp = method <= 2;
|
||||
bool test_sdma = method == 3;
|
||||
bool test_cs = method >= 4;
|
||||
unsigned cs_method = method - 4;
|
||||
STATIC_ASSERT(L2_STREAM + 1 == L2_LRU);
|
||||
unsigned cs_waves_per_sh =
|
||||
test_cs ? cs_waves_per_sh_list[cs_method / (2*NUM_SHADERS)] : 0;
|
||||
cs_method %= 2*NUM_SHADERS;
|
||||
unsigned cache_policy = test_cp ? method % 3 :
|
||||
test_cs ? L2_STREAM + (cs_method / NUM_SHADERS) : 0;
|
||||
unsigned cs_dwords_per_thread =
|
||||
test_cs ? cs_dwords_per_thread_list[cs_method % NUM_SHADERS] : 0;
|
||||
for (unsigned method = 0; method < NUM_METHODS; method++) {
|
||||
bool test_cp = method <= 2;
|
||||
bool test_sdma = method == 3;
|
||||
bool test_cs = method >= 4;
|
||||
unsigned cs_method = method - 4;
|
||||
STATIC_ASSERT(L2_STREAM + 1 == L2_LRU);
|
||||
unsigned cs_waves_per_sh =
|
||||
test_cs ? cs_waves_per_sh_list[cs_method / (2 * NUM_SHADERS)] : 0;
|
||||
cs_method %= 2 * NUM_SHADERS;
|
||||
unsigned cache_policy =
|
||||
test_cp ? method % 3 : test_cs ? L2_STREAM + (cs_method / NUM_SHADERS) : 0;
|
||||
unsigned cs_dwords_per_thread =
|
||||
test_cs ? cs_dwords_per_thread_list[cs_method % NUM_SHADERS] : 0;
|
||||
|
||||
if (test_sdma && !sctx->sdma_cs)
|
||||
continue;
|
||||
if (test_sdma && !sctx->sdma_cs)
|
||||
continue;
|
||||
|
||||
if (sctx->chip_class == GFX6) {
|
||||
/* GFX6 doesn't support CP DMA operations through L2. */
|
||||
if (test_cp && cache_policy != L2_BYPASS)
|
||||
continue;
|
||||
/* WAVES_PER_SH is in multiples of 16 on GFX6. */
|
||||
if (test_cs && cs_waves_per_sh % 16 != 0)
|
||||
continue;
|
||||
}
|
||||
if (sctx->chip_class == GFX6) {
|
||||
/* GFX6 doesn't support CP DMA operations through L2. */
|
||||
if (test_cp && cache_policy != L2_BYPASS)
|
||||
continue;
|
||||
/* WAVES_PER_SH is in multiples of 16 on GFX6. */
|
||||
if (test_cs && cs_waves_per_sh % 16 != 0)
|
||||
continue;
|
||||
}
|
||||
|
||||
printf("%s ,", placement_str[placement]);
|
||||
if (test_cs) {
|
||||
printf("CS x%-4u,%3s,", cs_dwords_per_thread,
|
||||
cache_policy == L2_LRU ? "LRU" :
|
||||
cache_policy == L2_STREAM ? "Str" : "");
|
||||
} else {
|
||||
printf("%s,%3s,", method_str[method],
|
||||
method == L2_LRU ? "LRU" :
|
||||
method == L2_STREAM ? "Str" : "");
|
||||
}
|
||||
if (test_cs && cs_waves_per_sh)
|
||||
printf("%2u,", cs_waves_per_sh);
|
||||
else
|
||||
printf(" ,");
|
||||
printf("%s ,", placement_str[placement]);
|
||||
if (test_cs) {
|
||||
printf("CS x%-4u,%3s,", cs_dwords_per_thread,
|
||||
cache_policy == L2_LRU ? "LRU" : cache_policy == L2_STREAM ? "Str" : "");
|
||||
} else {
|
||||
printf("%s,%3s,", method_str[method],
|
||||
method == L2_LRU ? "LRU" : method == L2_STREAM ? "Str" : "");
|
||||
}
|
||||
if (test_cs && cs_waves_per_sh)
|
||||
printf("%2u,", cs_waves_per_sh);
|
||||
else
|
||||
printf(" ,");
|
||||
|
||||
double score = 0;
|
||||
for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
|
||||
/* Don't test bigger sizes if it's too slow. Print 0. */
|
||||
if (size >= 512*1024 &&
|
||||
score < 400 * (size / (4*1024*1024))) {
|
||||
printf("%7.0f ,", 0.0);
|
||||
continue;
|
||||
}
|
||||
double score = 0;
|
||||
for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
|
||||
/* Don't test bigger sizes if it's too slow. Print 0. */
|
||||
if (size >= 512 * 1024 && score < 400 * (size / (4 * 1024 * 1024))) {
|
||||
printf("%7.0f ,", 0.0);
|
||||
continue;
|
||||
}
|
||||
|
||||
enum pipe_resource_usage dst_usage, src_usage;
|
||||
struct pipe_resource *dst, *src;
|
||||
struct pipe_query *q[NUM_RUNS];
|
||||
unsigned query_type = PIPE_QUERY_TIME_ELAPSED;
|
||||
enum pipe_resource_usage dst_usage, src_usage;
|
||||
struct pipe_resource *dst, *src;
|
||||
struct pipe_query *q[NUM_RUNS];
|
||||
unsigned query_type = PIPE_QUERY_TIME_ELAPSED;
|
||||
|
||||
if (test_sdma) {
|
||||
if (sctx->chip_class == GFX6)
|
||||
query_type = SI_QUERY_TIME_ELAPSED_SDMA_SI;
|
||||
else
|
||||
query_type = SI_QUERY_TIME_ELAPSED_SDMA;
|
||||
}
|
||||
if (test_sdma) {
|
||||
if (sctx->chip_class == GFX6)
|
||||
query_type = SI_QUERY_TIME_ELAPSED_SDMA_SI;
|
||||
else
|
||||
query_type = SI_QUERY_TIME_ELAPSED_SDMA;
|
||||
}
|
||||
|
||||
if (placement == 0 || placement == 2 || placement == 4)
|
||||
dst_usage = PIPE_USAGE_DEFAULT;
|
||||
else
|
||||
dst_usage = PIPE_USAGE_STREAM;
|
||||
if (placement == 0 || placement == 2 || placement == 4)
|
||||
dst_usage = PIPE_USAGE_DEFAULT;
|
||||
else
|
||||
dst_usage = PIPE_USAGE_STREAM;
|
||||
|
||||
if (placement == 2 || placement == 3)
|
||||
src_usage = PIPE_USAGE_DEFAULT;
|
||||
else
|
||||
src_usage = PIPE_USAGE_STREAM;
|
||||
if (placement == 2 || placement == 3)
|
||||
src_usage = PIPE_USAGE_DEFAULT;
|
||||
else
|
||||
src_usage = PIPE_USAGE_STREAM;
|
||||
|
||||
dst = pipe_buffer_create(screen, 0, dst_usage, size);
|
||||
src = is_copy ? pipe_buffer_create(screen, 0, src_usage, size) : NULL;
|
||||
dst = pipe_buffer_create(screen, 0, dst_usage, size);
|
||||
src = is_copy ? pipe_buffer_create(screen, 0, src_usage, size) : NULL;
|
||||
|
||||
/* Run tests. */
|
||||
for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
|
||||
q[iter] = ctx->create_query(ctx, query_type, 0);
|
||||
ctx->begin_query(ctx, q[iter]);
|
||||
/* Run tests. */
|
||||
for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
|
||||
q[iter] = ctx->create_query(ctx, query_type, 0);
|
||||
ctx->begin_query(ctx, q[iter]);
|
||||
|
||||
if (test_cp) {
|
||||
/* CP DMA */
|
||||
if (is_copy) {
|
||||
si_cp_dma_copy_buffer(sctx, dst, src, 0, 0, size, 0,
|
||||
SI_COHERENCY_NONE, cache_policy);
|
||||
} else {
|
||||
si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, 0, size,
|
||||
clear_value, 0,
|
||||
SI_COHERENCY_NONE, cache_policy);
|
||||
}
|
||||
} else if (test_sdma) {
|
||||
/* SDMA */
|
||||
if (is_copy) {
|
||||
si_sdma_copy_buffer(sctx, dst, src, 0, 0, size);
|
||||
} else {
|
||||
si_sdma_clear_buffer(sctx, dst, 0, size, clear_value);
|
||||
}
|
||||
} else {
|
||||
/* Compute */
|
||||
/* The memory accesses are coalesced, meaning that the 1st instruction writes
|
||||
* the 1st contiguous block of data for the whole wave, the 2nd instruction
|
||||
* writes the 2nd contiguous block of data, etc.
|
||||
*/
|
||||
unsigned instructions_per_thread = MAX2(1, cs_dwords_per_thread / 4);
|
||||
unsigned dwords_per_instruction = cs_dwords_per_thread / instructions_per_thread;
|
||||
unsigned dwords_per_wave = cs_dwords_per_thread * 64;
|
||||
if (test_cp) {
|
||||
/* CP DMA */
|
||||
if (is_copy) {
|
||||
si_cp_dma_copy_buffer(sctx, dst, src, 0, 0, size, 0, SI_COHERENCY_NONE,
|
||||
cache_policy);
|
||||
} else {
|
||||
si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, 0, size, clear_value, 0,
|
||||
SI_COHERENCY_NONE, cache_policy);
|
||||
}
|
||||
} else if (test_sdma) {
|
||||
/* SDMA */
|
||||
if (is_copy) {
|
||||
si_sdma_copy_buffer(sctx, dst, src, 0, 0, size);
|
||||
} else {
|
||||
si_sdma_clear_buffer(sctx, dst, 0, size, clear_value);
|
||||
}
|
||||
} else {
|
||||
/* Compute */
|
||||
/* The memory accesses are coalesced, meaning that the 1st instruction writes
|
||||
* the 1st contiguous block of data for the whole wave, the 2nd instruction
|
||||
* writes the 2nd contiguous block of data, etc.
|
||||
*/
|
||||
unsigned instructions_per_thread = MAX2(1, cs_dwords_per_thread / 4);
|
||||
unsigned dwords_per_instruction = cs_dwords_per_thread / instructions_per_thread;
|
||||
unsigned dwords_per_wave = cs_dwords_per_thread * 64;
|
                  unsigned num_dwords = size / 4;
                  unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);

                  void *cs = si_create_dma_compute_shader(ctx, cs_dwords_per_thread,
                                                          cache_policy == L2_STREAM, is_copy);

                  struct pipe_grid_info info = {};
                  info.block[0] = MIN2(64, num_instructions);
                  info.block[1] = 1;
                  info.block[2] = 1;
                  info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
                  info.grid[1] = 1;
                  info.grid[2] = 1;
                  struct pipe_shader_buffer sb[2] = {};
                  sb[0].buffer = dst;
                  sb[0].buffer_size = size;

                  if (is_copy) {
                     sb[1].buffer = src;
                     sb[1].buffer_size = size;
                  } else {
                     for (unsigned i = 0; i < 4; i++)
                        sctx->cs_user_data[i] = clear_value;
                  }

                  sctx->flags |= SI_CONTEXT_INV_VCACHE | SI_CONTEXT_INV_SCACHE;

                  ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, is_copy ? 2 : 1, sb, 0x1);
                  ctx->bind_compute_state(ctx, cs);
                  sctx->cs_max_waves_per_sh = cs_waves_per_sh;

                  ctx->launch_grid(ctx, &info);

                  ctx->bind_compute_state(ctx, NULL);
                  ctx->delete_compute_state(ctx, cs);
                  sctx->cs_max_waves_per_sh = 0; /* disable the limit */

                  sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
               }

               /* Flush L2, so that we don't just test L2 cache performance. */
               if (!test_sdma) {
                  sctx->flags |= SI_CONTEXT_WB_L2;
                  sctx->emit_cache_flush(sctx);
               }

               ctx->end_query(ctx, q[iter]);
               ctx->flush(ctx, NULL, PIPE_FLUSH_ASYNC);
            }
            pipe_resource_reference(&dst, NULL);
            pipe_resource_reference(&src, NULL);

            /* Get results. */
            uint64_t min = ~0ull, max = 0, total = 0;

            for (unsigned iter = 0; iter < NUM_RUNS; iter++) {
               union pipe_query_result result;

               ctx->get_query_result(ctx, q[iter], true, &result);
               ctx->destroy_query(ctx, q[iter]);

               min = MIN2(min, result.u64);
               max = MAX2(max, result.u64);
               total += result.u64;
            }

            score = get_MBps_rate(size, total / (double)NUM_RUNS);
            printf("%7.0f ,", score);
            fflush(stdout);

            struct si_result *r = &results[util_logbase2(size)][placement][method];
            r->is_valid = true;
            r->is_cp = test_cp;
            r->is_sdma = test_sdma;
            r->is_cs = test_cs;
            r->cache_policy = cache_policy;
            r->dwords_per_thread = cs_dwords_per_thread;
            r->waves_per_sh = cs_waves_per_sh;
            r->score = score;
            r->index = method;
         }
         puts("");
      }
   }
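/* [Editor's illustrative sketch -- not part of this diff.]  get_MBps_rate() is
 * defined earlier in this file and does not appear in this hunk; the hypothetical
 * stand-in below shows the kind of conversion involved, assuming the averaged
 * TIME_ELAPSED query result is in nanoseconds and decimal megabytes are reported.
 */
static double example_mbps_rate(unsigned bytes, double average_elapsed_ns)
{
   double seconds = average_elapsed_ns / 1000000000.0;
   return (bytes / 1000000.0) / seconds; /* MB moved per second for one run */
}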

   puts("");
   puts("static struct si_method");
   printf("get_best_clear_for_%s(enum radeon_bo_domain dst, uint64_t size64, bool async, bool "
          "cached)\n",
          sctx->screen->info.name);
   puts("{");
   puts("   unsigned size = MIN2(size64, UINT_MAX);\n");
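/* [Editor's note -- not part of this diff.]  The puts/printf calls above emit C
 * source on stdout.  For a hypothetical GPU named EXAMPLE, the start of the emitted
 * helper would look roughly like this (body elided; the actual contents depend on
 * the measured results):
 *
 *    static struct si_method
 *    get_best_clear_for_EXAMPLE(enum radeon_bo_domain dst, uint64_t size64,
 *                               bool async, bool cached)
 *    {
 *       unsigned size = MIN2(size64, UINT_MAX);
 *       ...
 *    }
 */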

   /* Analyze results and find the best methods. */
   for (unsigned placement = 0; placement < ARRAY_SIZE(placement_str); placement++) {
      if (placement == 0)
         puts("   if (dst == RADEON_DOMAIN_VRAM) {");
      else if (placement == 1)
         puts("   } else { /* GTT */");
      else if (placement == 2) {
         puts("}");
         puts("");
         puts("static struct si_method");
         printf("get_best_copy_for_%s(enum radeon_bo_domain dst, enum radeon_bo_domain src,\n",
                sctx->screen->info.name);
         printf(" uint64_t size64, bool async, bool cached)\n");
         puts("{");
         puts("   unsigned size = MIN2(size64, UINT_MAX);\n");
         puts("   if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_VRAM) {");
      } else if (placement == 3)
         puts("   } else if (src == RADEON_DOMAIN_VRAM && dst == RADEON_DOMAIN_GTT) {");
      else
         puts("   } else { /* GTT -> VRAM */");

      for (unsigned mode = 0; mode < 3; mode++) {
         bool async = mode == 0;
         bool cached = mode == 1;

         if (async)
            puts("      if (async) { /* SDMA or async compute */");
         else if (cached)
            puts("      if (cached) { /* gfx ring */");
         else
            puts("      } else { /* gfx ring - uncached */");

         /* The list of best chosen methods. */
         struct si_result *methods[32];
         unsigned method_max_size[32];
         unsigned num_methods = 0;

         for (unsigned size = MIN_SIZE; size <= MAX_SIZE; size <<= SIZE_SHIFT) {
            /* Find the best method. */
            struct si_result *best = NULL;

            for (unsigned i = 0; i < NUM_METHODS; i++) {
               struct si_result *r = &results[util_logbase2(size)][placement][i];

               if (!r->is_valid)
                  continue;

               /* Ban CP DMA clears via MC on <= GFX8. They are super slow
                * on GTT, which we can get due to BO evictions.
                */
               if (sctx->chip_class <= GFX8 && placement == 1 && r->is_cp &&
                   r->cache_policy == L2_BYPASS)
                  continue;

               if (async) {
                  /* The following constraints for compute IBs try to limit
                   * resource usage so as not to decrease the performance
                   * of gfx IBs too much.
                   */

                  /* Don't use CP DMA on asynchronous rings, because
                   * the engine is shared with gfx IBs.
                   */
                  if (r->is_cp)
                     continue;

                  /* Don't use L2 caching on asynchronous rings to minimize
                   * L2 usage.
                   */
                  if (r->cache_policy == L2_LRU)
                     continue;

                  /* Asynchronous compute recommends waves_per_sh != 0
                   * to limit CU usage. */
                  if (r->is_cs && r->waves_per_sh == 0)
                     continue;
               } else {
                  /* SDMA is always asynchronous */
                  if (r->is_sdma)
                     continue;

                  if (cached && r->cache_policy == L2_BYPASS)
                     continue;
                  if (!cached && r->cache_policy == L2_LRU)
                     continue;
               }

               if (!best) {
                  best = r;
                  continue;
               }

               /* Assume some measurement error. Earlier methods occupy fewer
                * resources, so the next method is always more greedy, and we
                * don't want to select it due to a measurement error.
                */
               double min_improvement = 1.03;

               if (best->score * min_improvement < r->score)
                  best = r;
            }
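/* [Editor's illustrative sketch -- not part of this diff.]  The 1.03 factor above
 * means a later (more resource-hungry) method only displaces the current best if it
 * wins by more than 3%.  Standalone restatement with hypothetical scores: a
 * candidate at 102 MB/s does not displace a best of 100 MB/s, but 104 MB/s does.
 */
static int example_should_replace_best(double best_score, double candidate_score)
{
   const double min_improvement = 1.03;
   return best_score * min_improvement < candidate_score; /* non-zero: take the candidate */
}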

            if (num_methods > 0) {
               unsigned prev_index = num_methods - 1;
               struct si_result *prev = methods[prev_index];
               struct si_result *prev_this_size =
                  &results[util_logbase2(size)][placement][prev->index];

               /* If the best one is also the best for the previous size,
                * just bump the size for the previous one.
                *
                * If there is no best, it means all methods were too slow
                * for this size and were not tested. Use the best one for
                * the previous size.
                */
               if (!best ||
                   /* If it's the same method as for the previous size: */
                   (prev->is_cp == best->is_cp && prev->is_sdma == best->is_sdma &&
                    prev->is_cs == best->is_cs && prev->cache_policy == best->cache_policy &&
                    prev->dwords_per_thread == best->dwords_per_thread &&
                    prev->waves_per_sh == best->waves_per_sh) ||
                   /* If the method for the previous size is also the best
                    * for this size: */
                   (prev_this_size->is_valid && prev_this_size->score * 1.03 > best->score)) {
                  method_max_size[prev_index] = size;
                  continue;
               }
            }

            /* Add it to the list. */
            assert(num_methods < ARRAY_SIZE(methods));
            methods[num_methods] = best;
            method_max_size[num_methods] = size;
            num_methods++;
         }

         for (unsigned i = 0; i < num_methods; i++) {
            struct si_result *best = methods[i];
            unsigned size = method_max_size[i];

            /* The size threshold is between the current benchmarked
             * size and the next benchmarked size. */
            if (i < num_methods - 1)
               printf("         if (size <= %9u) ", (size + (size << SIZE_SHIFT)) / 2);
            else if (i > 0)
               printf("         else ");
            else
               printf("         ");
            printf("return ");
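/* [Editor's illustrative sketch -- not part of this diff.]  The emitted threshold is
 * the arithmetic midpoint between the current benchmarked size and the next one
 * (size << SIZE_SHIFT).  SIZE_SHIFT is defined earlier in this file; assuming it is
 * 1 (sizes doubling), a 4096-byte step yields (4096 + 8192) / 2 = 6144.
 */
static unsigned example_size_threshold(unsigned size, unsigned size_shift)
{
   return (size + (size << size_shift)) / 2;
}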

            assert(best);
            if (best->is_cp) {
               printf("CP_DMA(%s);\n",
                      best->cache_policy == L2_BYPASS
                         ? "L2_BYPASS"
                         : best->cache_policy == L2_LRU ? "L2_LRU " : "L2_STREAM");
            }
            if (best->is_sdma)
               printf("SDMA;\n");
            if (best->is_cs) {
               printf("COMPUTE(%s, %u, %u);\n",
                      best->cache_policy == L2_LRU ? "L2_LRU " : "L2_STREAM",
                      best->dwords_per_thread, best->waves_per_sh);
            }
         }
      }
      puts("      }");
   }
   puts("   }");
   puts("}");

   ctx->destroy(ctx);
   exit(0);
}
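/* [Editor's illustrative sketch -- not part of this diff.]  The benchmark prints
 * complete get_best_clear_for_*() / get_best_copy_for_*() functions to stdout so
 * they can be pasted into the driver.  Neither struct si_method nor the
 * CP_DMA/SDMA/COMPUTE macros the emitted code expands to appear in this hunk, so
 * the caller below is purely hypothetical.
 */
#if 0 /* illustration only */
static void example_use_generated_helper(void)
{
   struct si_method best = get_best_clear_for_EXAMPLE(RADEON_DOMAIN_VRAM, /*size64=*/65536,
                                                      /*async=*/false, /*cached=*/true);
   /* ...issue the clear with the engine and cache policy chosen in `best`... */
   (void)best;
}
#endif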
File diff suppressed because it is too large
Load diff
@@ -25,79 +25,77 @@
 *
 **************************************************************************/

#include "si_pipe.h"
|
||||
#include "radeon/radeon_video.h"
|
||||
#include "radeon/radeon_uvd.h"
|
||||
#include "radeon/radeon_uvd_enc.h"
|
||||
#include "radeon/radeon_vce.h"
|
||||
#include "radeon/radeon_vcn_dec.h"
|
||||
#include "radeon/radeon_vcn_enc.h"
|
||||
#include "radeon/radeon_uvd_enc.h"
|
||||
#include "radeon/radeon_video.h"
|
||||
#include "si_pipe.h"
|
||||
#include "util/u_video.h"
|
||||
|
||||
/**
 * creates a video buffer with a UVD-compatible memory layout
 */
struct pipe_video_buffer *si_video_buffer_create(struct pipe_context *pipe,
                                                 const struct pipe_video_buffer *tmpl)
{
   struct pipe_video_buffer vidbuf = *tmpl;
   /* TODO: get tiling working */
   vidbuf.bind |= PIPE_BIND_LINEAR;

   return vl_video_buffer_create_as_resource(pipe, &vidbuf);
}

/* set the decoding target buffer offsets */
static struct pb_buffer *si_uvd_set_dtb(struct ruvd_msg *msg, struct vl_video_buffer *buf)
{
   struct si_screen *sscreen = (struct si_screen *)buf->base.context->screen;
   struct si_texture *luma = (struct si_texture *)buf->resources[0];
   struct si_texture *chroma = (struct si_texture *)buf->resources[1];
   enum ruvd_surface_type type =
      (sscreen->info.chip_class >= GFX9) ? RUVD_SURFACE_TYPE_GFX9 : RUVD_SURFACE_TYPE_LEGACY;

   msg->body.decode.dt_field_mode = buf->base.interlaced;

   si_uvd_set_dt_surfaces(msg, &luma->surface, (chroma) ? &chroma->surface : NULL, type);

   return luma->buffer.buf;
}

/* get the radeon resources for VCE */
static void si_vce_get_buffer(struct pipe_resource *resource, struct pb_buffer **handle,
                              struct radeon_surf **surface)
{
   struct si_texture *res = (struct si_texture *)resource;

   if (handle)
      *handle = res->buffer.buf;

   if (surface)
      *surface = &res->surface;
}

/**
 * creates a UVD-compatible decoder
 */
struct pipe_video_codec *si_uvd_create_decoder(struct pipe_context *context,
                                               const struct pipe_video_codec *templ)
{
   struct si_context *ctx = (struct si_context *)context;
   bool vcn = ctx->family >= CHIP_RAVEN;

   if (templ->entrypoint == PIPE_VIDEO_ENTRYPOINT_ENCODE) {
      if (vcn) {
         return radeon_create_encoder(context, templ, ctx->ws, si_vce_get_buffer);
      } else {
         if (u_reduce_video_profile(templ->profile) == PIPE_VIDEO_FORMAT_HEVC)
            return radeon_uvd_create_encoder(context, templ, ctx->ws, si_vce_get_buffer);
         else
            return si_vce_create_encoder(context, templ, ctx->ws, si_vce_get_buffer);
      }
   }

   return (vcn) ? radeon_create_decoder(context, templ)
                : si_common_uvd_create_decoder(context, templ, si_uvd_set_dtb);
}
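/* [Editor's illustrative sketch -- not part of this diff.]  si_uvd_create_decoder()
 * above picks the encode/decode backend from the chip family and the codec
 * entrypoint.  The standalone helper below restates that selection; the enum and
 * function names are hypothetical.
 */
enum example_video_backend {
   EXAMPLE_BACKEND_VCN,     /* radeon_create_encoder / radeon_create_decoder */
   EXAMPLE_BACKEND_UVD_ENC, /* radeon_uvd_create_encoder: HEVC encode before VCN */
   EXAMPLE_BACKEND_VCE,     /* si_vce_create_encoder: other encode before VCN */
   EXAMPLE_BACKEND_UVD_DEC, /* si_common_uvd_create_decoder: decode before VCN */
};

static enum example_video_backend example_select_backend(int family_is_vcn, int is_encode,
                                                         int is_hevc)
{
   if (family_is_vcn)
      return EXAMPLE_BACKEND_VCN;
   if (is_encode)
      return is_hevc ? EXAMPLE_BACKEND_UVD_ENC : EXAMPLE_BACKEND_VCE;
   return EXAMPLE_BACKEND_UVD_DEC;
}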