radv: emit geometry ring size and pointers via preamble (v2)

This uses the scratch infrastructure to handle the esgs
and gsvs rings.

(this replaces the old code that did this with patching).

v2: use correct ring sizes, reset sizes (Bas)

Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Signed-off-by: Dave Airlie <airlied@redhat.com>
This commit is contained in:
Dave Airlie 2017-01-20 11:06:52 +10:00
parent 8f41fe4389
commit 1fa5b755c2
3 changed files with 230 additions and 11 deletions

View file

@ -1457,12 +1457,17 @@ static void radv_reset_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
cmd_buffer->scratch_size_needed = 0;
cmd_buffer->compute_scratch_size_needed = 0;
cmd_buffer->esgs_ring_size_needed = 0;
cmd_buffer->gsvs_ring_size_needed = 0;
if (cmd_buffer->upload.upload_bo)
cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs,
cmd_buffer->upload.upload_bo, 8);
cmd_buffer->upload.offset = 0;
cmd_buffer->record_fail = false;
cmd_buffer->ring_offsets_idx = -1;
}
VkResult radv_ResetCommandBuffer(
@ -1649,6 +1654,7 @@ VkResult radv_EndCommandBuffer(
if (cmd_buffer->queue_family_index != RADV_QUEUE_TRANSFER)
si_emit_cache_flush(cmd_buffer);
if (!cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs) ||
cmd_buffer->record_fail)
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
@ -1735,6 +1741,20 @@ void radv_CmdBindPipeline(
radv_dynamic_state_copy(&cmd_buffer->state.dynamic,
&pipeline->dynamic_state,
pipeline->dynamic_state_mask);
if (pipeline->graphics.esgs_ring_size > cmd_buffer->esgs_ring_size_needed)
cmd_buffer->esgs_ring_size_needed = pipeline->graphics.esgs_ring_size;
if (pipeline->graphics.gsvs_ring_size > cmd_buffer->gsvs_ring_size_needed)
cmd_buffer->gsvs_ring_size_needed = pipeline->graphics.gsvs_ring_size;
if (radv_pipeline_has_gs(pipeline)) {
struct ac_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.pipeline, MESA_SHADER_GEOMETRY,
AC_UD_SCRATCH_RING_OFFSETS);
if (cmd_buffer->ring_offsets_idx == -1)
cmd_buffer->ring_offsets_idx = loc->sgpr_idx;
else if (loc->sgpr_idx != -1)
assert(loc->sgpr_idx != cmd_buffer->ring_offsets_idx);
}
break;
default:
assert(!"invalid bind point");
@ -1887,6 +1907,17 @@ void radv_CmdExecuteCommands(
primary->compute_scratch_size_needed = MAX2(primary->compute_scratch_size_needed,
secondary->compute_scratch_size_needed);
if (secondary->esgs_ring_size_needed > primary->esgs_ring_size_needed)
primary->esgs_ring_size_needed = secondary->esgs_ring_size_needed;
if (secondary->gsvs_ring_size_needed > primary->gsvs_ring_size_needed)
primary->gsvs_ring_size_needed = secondary->gsvs_ring_size_needed;
if (secondary->ring_offsets_idx != -1) {
if (primary->ring_offsets_idx == -1)
primary->ring_offsets_idx = secondary->ring_offsets_idx;
else
assert(secondary->ring_offsets_idx == primary->ring_offsets_idx);
}
primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs);
}

View file

@ -764,6 +764,10 @@ radv_queue_finish(struct radv_queue *queue)
queue->device->ws->buffer_destroy(queue->descriptor_bo);
if (queue->scratch_bo)
queue->device->ws->buffer_destroy(queue->scratch_bo);
if (queue->esgs_ring_bo)
queue->device->ws->buffer_destroy(queue->esgs_ring_bo);
if (queue->gsvs_ring_bo)
queue->device->ws->buffer_destroy(queue->gsvs_ring_bo);
if (queue->compute_scratch_bo)
queue->device->ws->buffer_destroy(queue->compute_scratch_bo);
}
@ -1046,24 +1050,118 @@ static void radv_dump_trace(struct radv_device *device,
fclose(f);
}
/**
 * Write the four geometry-ring buffer descriptors into a mapped descriptor
 * buffer.
 *
 * The descriptors are 4 dwords each and are written back to back starting at
 * dword 4 of \p map (dwords 4..19); dwords 0..3 are not touched here.
 * In order:
 *   1. ES->GS ring, swizzled, with ADD_TID enabled,
 *   2. ES->GS ring, linear  ("GS entry for ES->GS ring"),
 *   3. GS->VS ring, linear  ("VS entry for GS->VS ring"),
 *   4. GS->VS ring, swizzled, with ADD_TID enabled.
 *
 * \param queue           queue whose device winsys is used to resolve BO
 *                        virtual addresses.
 * \param map             CPU mapping of the descriptor buffer; must be large
 *                        enough to hold 20 dwords.
 * \param esgs_ring_size  value stored in the NUM_RECORDS dword of both ES->GS
 *                        descriptors.
 * \param esgs_ring_bo    ES->GS ring buffer, or NULL when that ring is absent.
 * \param gsvs_ring_size  value stored in the NUM_RECORDS dword of the linear
 *                        GS->VS descriptor.
 * \param gsvs_ring_bo    GS->VS ring buffer, or NULL when that ring is absent.
 *
 * Either BO may be NULL: the caller only guarantees that at least one of the
 * two rings exists. A missing ring gets a base address of 0 instead of
 * querying the winsys with a NULL buffer.
 */
static void
fill_geom_rings(struct radv_queue *queue,
		uint32_t *map,
		uint32_t esgs_ring_size,
		struct radeon_winsys_bo *esgs_ring_bo,
		uint32_t gsvs_ring_size,
		struct radeon_winsys_bo *gsvs_ring_bo)
{
	uint64_t esgs_va = 0, gsvs_va = 0;
	uint32_t *desc = &map[4];

	/* Only look up the address of rings that actually exist. */
	if (esgs_ring_bo)
		esgs_va = queue->device->ws->buffer_get_va(esgs_ring_bo);
	if (gsvs_ring_bo)
		gsvs_va = queue->device->ws->buffer_get_va(gsvs_ring_bo);

	/* stride 0, num records - size, add tid, swizzle, elsize4,
	   index stride 64 */
	desc[0] = esgs_va; /* low 32 bits of the address */
	desc[1] = S_008F04_BASE_ADDRESS_HI(esgs_va >> 32) |
		S_008F04_STRIDE(0) |
		S_008F04_SWIZZLE_ENABLE(true);
	desc[2] = esgs_ring_size;
	desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
		S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
		S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
		S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
		S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
		S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
		S_008F0C_ELEMENT_SIZE(1) |
		S_008F0C_INDEX_STRIDE(3) |
		S_008F0C_ADD_TID_ENABLE(true);

	desc += 4;
	/* GS entry for ES->GS ring */
	/* stride 0, num records - size, elsize0,
	   index stride 0 */
	desc[0] = esgs_va;
	desc[1] = S_008F04_BASE_ADDRESS_HI(esgs_va >> 32)|
		S_008F04_STRIDE(0) |
		S_008F04_SWIZZLE_ENABLE(false);
	desc[2] = esgs_ring_size;
	desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
		S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
		S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
		S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
		S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
		S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
		S_008F0C_ELEMENT_SIZE(0) |
		S_008F0C_INDEX_STRIDE(0) |
		S_008F0C_ADD_TID_ENABLE(false);

	desc += 4;
	/* VS entry for GS->VS ring */
	/* stride 0, num records - size, elsize0,
	   index stride 0 */
	desc[0] = gsvs_va;
	desc[1] = S_008F04_BASE_ADDRESS_HI(gsvs_va >> 32)|
		S_008F04_STRIDE(0) |
		S_008F04_SWIZZLE_ENABLE(false);
	desc[2] = gsvs_ring_size;
	desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
		S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
		S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
		S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
		S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
		S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
		S_008F0C_ELEMENT_SIZE(0) |
		S_008F0C_INDEX_STRIDE(0) |
		S_008F0C_ADD_TID_ENABLE(false);

	desc += 4;
	/* stride gsvs_itemsize, num records 64
	   elsize 4, index stride 16 */
	/* shader will patch stride and desc[2] */
	desc[0] = gsvs_va;
	desc[1] = S_008F04_BASE_ADDRESS_HI(gsvs_va >> 32)|
		S_008F04_STRIDE(0) |
		S_008F04_SWIZZLE_ENABLE(true);
	desc[2] = 0; /* placeholder; patched by the shader (see above) */
	desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
		S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
		S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
		S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
		S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
		S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) |
		S_008F0C_ELEMENT_SIZE(1) |
		S_008F0C_INDEX_STRIDE(1) |
		S_008F0C_ADD_TID_ENABLE(true);
}
static VkResult
radv_get_preamble_cs(struct radv_queue *queue,
uint32_t scratch_size,
uint32_t compute_scratch_size,
uint32_t esgs_ring_size,
uint32_t gsvs_ring_size,
struct radeon_winsys_cs **preamble_cs)
{
struct radeon_winsys_bo *scratch_bo = NULL;
struct radeon_winsys_bo *descriptor_bo = NULL;
struct radeon_winsys_bo *compute_scratch_bo = NULL;
struct radeon_winsys_bo *esgs_ring_bo = NULL;
struct radeon_winsys_bo *gsvs_ring_bo = NULL;
struct radeon_winsys_cs *cs = NULL;
if (!scratch_size && !compute_scratch_size) {
if (!scratch_size && !compute_scratch_size && !esgs_ring_size && !gsvs_ring_size) {
*preamble_cs = NULL;
return VK_SUCCESS;
}
if (scratch_size <= queue->scratch_size &&
compute_scratch_size <= queue->compute_scratch_size) {
compute_scratch_size <= queue->compute_scratch_size &&
esgs_ring_size <= queue->esgs_ring_size &&
gsvs_ring_size <= queue->gsvs_ring_size) {
*preamble_cs = queue->preamble_cs;
return VK_SUCCESS;
}
@ -1091,9 +1189,43 @@ radv_get_preamble_cs(struct radv_queue *queue,
} else
compute_scratch_bo = queue->compute_scratch_bo;
if (scratch_bo != queue->scratch_bo) {
if (esgs_ring_size > queue->esgs_ring_size) {
esgs_ring_bo = queue->device->ws->buffer_create(queue->device->ws,
esgs_ring_size,
4096,
RADEON_DOMAIN_VRAM,
RADEON_FLAG_NO_CPU_ACCESS);
if (!esgs_ring_bo)
goto fail;
} else {
esgs_ring_bo = queue->esgs_ring_bo;
esgs_ring_size = queue->esgs_ring_size;
}
if (gsvs_ring_size > queue->gsvs_ring_size) {
gsvs_ring_bo = queue->device->ws->buffer_create(queue->device->ws,
gsvs_ring_size,
4096,
RADEON_DOMAIN_VRAM,
RADEON_FLAG_NO_CPU_ACCESS);
if (!gsvs_ring_bo)
goto fail;
} else {
gsvs_ring_bo = queue->gsvs_ring_bo;
gsvs_ring_size = queue->gsvs_ring_size;
}
if (scratch_bo != queue->scratch_bo ||
esgs_ring_bo != queue->esgs_ring_bo ||
gsvs_ring_bo != queue->gsvs_ring_bo) {
uint32_t size = 0;
if (gsvs_ring_bo || esgs_ring_bo)
size = 80; /* 2 dword + 2 padding + 4 dword * 4 */
else if (scratch_bo)
size = 8; /* 2 dword */
descriptor_bo = queue->device->ws->buffer_create(queue->device->ws,
8,
size,
4096,
RADEON_DOMAIN_VRAM,
RADEON_FLAG_CPU_ACCESS);
@ -1111,22 +1243,49 @@ radv_get_preamble_cs(struct radv_queue *queue,
if (scratch_bo)
queue->device->ws->cs_add_buffer(cs, scratch_bo, 8);
if (esgs_ring_bo)
queue->device->ws->cs_add_buffer(cs, esgs_ring_bo, 8);
if (gsvs_ring_bo)
queue->device->ws->cs_add_buffer(cs, gsvs_ring_bo, 8);
if (descriptor_bo)
queue->device->ws->cs_add_buffer(cs, descriptor_bo, 8);
if (descriptor_bo != queue->descriptor_bo) {
uint64_t scratch_va = queue->device->ws->buffer_get_va(scratch_bo);
uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
S_008F04_SWIZZLE_ENABLE(1);
uint32_t *map = (uint32_t*)queue->device->ws->buffer_map(descriptor_bo);
map[0] = scratch_va;
map[1] = rsrc1;
if (scratch_bo) {
uint64_t scratch_va = queue->device->ws->buffer_get_va(scratch_bo);
uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
S_008F04_SWIZZLE_ENABLE(1);
map[0] = scratch_va;
map[1] = rsrc1;
}
if (esgs_ring_bo || gsvs_ring_bo)
fill_geom_rings(queue, map, esgs_ring_size, esgs_ring_bo, gsvs_ring_size, gsvs_ring_bo);
queue->device->ws->buffer_unmap(descriptor_bo);
}
if (esgs_ring_bo || gsvs_ring_bo) {
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_VS_PARTIAL_FLUSH) | EVENT_INDEX(4));
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
if (queue->device->physical_device->rad_info.chip_class >= CIK) {
radeon_set_uconfig_reg_seq(cs, R_030900_VGT_ESGS_RING_SIZE, 2);
radeon_emit(cs, esgs_ring_size >> 8);
radeon_emit(cs, gsvs_ring_size >> 8);
} else {
radeon_set_config_reg_seq(cs, R_0088C8_VGT_ESGS_RING_SIZE, 2);
radeon_emit(cs, esgs_ring_size >> 8);
radeon_emit(cs, gsvs_ring_size >> 8);
}
}
if (descriptor_bo) {
uint32_t regs[] = {R_00B030_SPI_SHADER_USER_DATA_PS_0,
R_00B130_SPI_SHADER_USER_DATA_VS_0,
@ -1178,6 +1337,20 @@ radv_get_preamble_cs(struct radv_queue *queue,
queue->compute_scratch_size = compute_scratch_size;
}
if (esgs_ring_bo != queue->esgs_ring_bo) {
if (queue->esgs_ring_bo)
queue->device->ws->buffer_destroy(queue->esgs_ring_bo);
queue->esgs_ring_bo = esgs_ring_bo;
queue->esgs_ring_size = esgs_ring_size;
}
if (gsvs_ring_bo != queue->gsvs_ring_bo) {
if (queue->gsvs_ring_bo)
queue->device->ws->buffer_destroy(queue->gsvs_ring_bo);
queue->gsvs_ring_bo = gsvs_ring_bo;
queue->gsvs_ring_size = gsvs_ring_size;
}
if (descriptor_bo != queue->descriptor_bo) {
if (queue->descriptor_bo)
queue->device->ws->buffer_destroy(queue->descriptor_bo);
@ -1196,6 +1369,10 @@ fail:
queue->device->ws->buffer_destroy(scratch_bo);
if (compute_scratch_bo && compute_scratch_bo != queue->compute_scratch_bo)
queue->device->ws->buffer_destroy(compute_scratch_bo);
if (esgs_ring_bo && esgs_ring_bo != queue->esgs_ring_bo)
queue->device->ws->buffer_destroy(esgs_ring_bo);
if (gsvs_ring_bo && gsvs_ring_bo != queue->gsvs_ring_bo)
queue->device->ws->buffer_destroy(gsvs_ring_bo);
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
}
@ -1213,6 +1390,7 @@ VkResult radv_QueueSubmit(
uint32_t max_cs_submission = queue->device->trace_bo ? 1 : UINT32_MAX;
uint32_t scratch_size = 0;
uint32_t compute_scratch_size = 0;
uint32_t esgs_ring_size = 0, gsvs_ring_size = 0;
struct radeon_winsys_cs *preamble_cs = NULL;
VkResult result;
@ -1226,10 +1404,12 @@ VkResult radv_QueueSubmit(
scratch_size = MAX2(scratch_size, cmd_buffer->scratch_size_needed);
compute_scratch_size = MAX2(compute_scratch_size,
cmd_buffer->compute_scratch_size_needed);
esgs_ring_size = MAX2(esgs_ring_size, cmd_buffer->esgs_ring_size_needed);
gsvs_ring_size = MAX2(gsvs_ring_size, cmd_buffer->gsvs_ring_size_needed);
}
}
result = radv_get_preamble_cs(queue, scratch_size, compute_scratch_size, &preamble_cs);
result = radv_get_preamble_cs(queue, scratch_size, compute_scratch_size, esgs_ring_size, gsvs_ring_size, &preamble_cs);
if (result != VK_SUCCESS)
return result;

View file

@ -470,10 +470,14 @@ struct radv_queue {
uint32_t scratch_size;
uint32_t compute_scratch_size;
uint32_t esgs_ring_size;
uint32_t gsvs_ring_size;
struct radeon_winsys_bo *scratch_bo;
struct radeon_winsys_bo *descriptor_bo;
struct radeon_winsys_bo *compute_scratch_bo;
struct radeon_winsys_bo *esgs_ring_bo;
struct radeon_winsys_bo *gsvs_ring_bo;
struct radeon_winsys_cs *preamble_cs;
};
@ -742,6 +746,10 @@ struct radv_cmd_buffer {
uint32_t scratch_size_needed;
uint32_t compute_scratch_size_needed;
uint32_t esgs_ring_size_needed;
uint32_t gsvs_ring_size_needed;
int ring_offsets_idx; /* just used for verification */
};
struct radv_image;