mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-01-09 06:10:12 +01:00
panvk: Use LD_VAR[_IMM] + ADs for varyings
The current implementation uses LD_VAR_BUF[_IMM] to look up varyings. That instruction takes an 8-bit offset, which limits the number of varying components to 64 (assuming F32). This falls short of maxVertexOutputComponents (128), so this change replaces the use of LD_VAR_BUF[_IMM] with LD_VAR[_IMM] + Attribute Descriptors, which do not have this limitation.

As allocating Attribute Descriptors is potentially expensive, this can be further optimized by falling back to LD_VAR_BUF[_IMM] in cases where we can ensure that no more than 64 varying components are used.

This change currently does not change behavior for gallium/panfrost, though that should be done as well.

Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32969>
This commit is contained in:
parent
7881d19d01
commit
6d5ae5b3af
7 changed files with 146 additions and 57 deletions
|
|
@ -132,6 +132,10 @@ panfrost_shader_compile(struct panfrost_screen *screen, const nir_shader *ir,
|
|||
.gpu_id = panfrost_device_gpu_id(dev),
|
||||
};
|
||||
|
||||
if (dev->arch >= 9)
|
||||
/* Use LD_VAR_BUF for varying lookups. */
|
||||
inputs.valhall.use_ld_var_buf = true;
|
||||
|
||||
/* Lower this early so the backends don't have to worry about it */
|
||||
if (s->info.stage == MESA_SHADER_FRAGMENT) {
|
||||
inputs.fixed_varying_mask = key->fs.fixed_varying_mask;
|
||||
|
|
|
|||
|
|
@ -273,19 +273,10 @@ dEQP-VK.api.device_init.create_device_global_priority_query_khr.basic,Fail
|
|||
dEQP-VK.renderpass.dedicated_allocation.attachment_allocation.input_output.63,Fail
|
||||
dEQP-VK.renderpass2.dedicated_allocation.attachment_allocation.input_output.63,Fail
|
||||
|
||||
dEQP-VK.glsl.limits.near_max.fragment_input.components_123,Fail
|
||||
dEQP-VK.glsl.limits.near_max.fragment_input.components_124,Fail
|
||||
|
||||
dEQP-VK.pipeline.monolithic.max_varyings.test_vertex_io_between_vertex_fragment,Fail
|
||||
|
||||
dEQP-VK.pipeline.pipeline_library.max_varyings.test_vertex_io_between_vertex_fragment,Fail
|
||||
|
||||
dEQP-VK.renderpass.suballocation.attachment_allocation.input_output.63,Fail
|
||||
dEQP-VK.renderpass.multiple_subpasses_multiple_command_buffers.test,Fail
|
||||
dEQP-VK.renderpass2.suballocation.attachment_allocation.input_output.63,Fail
|
||||
|
||||
dEQP-VK.pipeline.fast_linked_library.max_varyings.test_vertex_io_between_vertex_fragment,Fail
|
||||
|
||||
dEQP-VK.glsl.loops.special.do_while_dynamic_iterations.dowhile_trap_vertex,Crash
|
||||
|
||||
dEQP-VK.rasterization.rasterization_order_attachment_access.depth.samples_1.multi_draw_barriers,Crash
|
||||
|
|
|
|||
|
|
@ -589,45 +589,30 @@ bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)
|
|||
b->shader->info.bifrost->uses_flat_shading = true;
|
||||
}
|
||||
|
||||
enum bi_source_format source_format =
|
||||
smooth ? BI_SOURCE_FORMAT_F32 : BI_SOURCE_FORMAT_FLAT32;
|
||||
|
||||
nir_src *offset = nir_get_io_offset_src(instr);
|
||||
unsigned imm_index = 0;
|
||||
bool immediate = bi_is_imm_var_desc_handle(b, instr, &imm_index);
|
||||
unsigned base = nir_intrinsic_base(instr);
|
||||
|
||||
/* On Valhall, ensure the table and index are valid for usage with immediate
|
||||
* form when IDVS isn't used */
|
||||
if (b->shader->arch >= 9 && !b->shader->malloc_idvs)
|
||||
immediate &= va_is_valid_const_table(pan_res_handle_get_table(base)) &&
|
||||
pan_res_handle_get_index(base) < 256;
|
||||
/* LD_VAR_BUF[_IMM] takes an 8-bit offset, limiting its use to 64 or less
|
||||
* varying components, assuming F32.
|
||||
* Therefore, only use LD_VAR_BUF[_IMM] if explicitly told by the driver
|
||||
* through a compiler input value, falling back to LD_VAR[_IMM] +
|
||||
* Attribute Descriptors otherwise. */
|
||||
bool use_ld_var_buf =
|
||||
b->shader->malloc_idvs && b->shader->inputs->valhall.use_ld_var_buf;
|
||||
|
||||
if (b->shader->malloc_idvs && immediate) {
|
||||
/* Immediate index given in bytes. */
|
||||
bi_ld_var_buf_imm_to(b, sz, dest, src0, regfmt, sample, source_format,
|
||||
update, vecsize,
|
||||
bi_varying_offset(b->shader, instr));
|
||||
} else if (immediate) {
|
||||
bi_instr *I;
|
||||
if (use_ld_var_buf) {
|
||||
enum bi_source_format source_format =
|
||||
smooth ? BI_SOURCE_FORMAT_F32 : BI_SOURCE_FORMAT_FLAT32;
|
||||
|
||||
if (smooth) {
|
||||
I = bi_ld_var_imm_to(b, dest, src0, regfmt, sample, update, vecsize,
|
||||
pan_res_handle_get_index(imm_index));
|
||||
if (immediate) {
|
||||
/* Immediate index given in bytes. */
|
||||
bi_ld_var_buf_imm_to(b, sz, dest, src0, regfmt, sample, source_format,
|
||||
update, vecsize,
|
||||
bi_varying_offset(b->shader, instr));
|
||||
} else {
|
||||
I = bi_ld_var_flat_imm_to(b, dest, BI_FUNCTION_NONE, regfmt, vecsize,
|
||||
pan_res_handle_get_index(imm_index));
|
||||
}
|
||||
|
||||
/* Valhall usually uses machine-allocated IDVS. If this is disabled,
|
||||
* use a simple Midgard-style ABI.
|
||||
*/
|
||||
if (b->shader->arch >= 9)
|
||||
I->table = va_res_fold_table_idx(pan_res_handle_get_table(base));
|
||||
} else {
|
||||
bi_index idx = bi_src_index(offset);
|
||||
|
||||
if (b->shader->malloc_idvs) {
|
||||
bi_index idx = bi_src_index(offset);
|
||||
/* Index needs to be in bytes, but NIR gives the index
|
||||
* in slots. For now assume 16 bytes per element.
|
||||
*/
|
||||
|
|
@ -639,7 +624,33 @@ bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)
|
|||
|
||||
bi_ld_var_buf_to(b, sz, dest, src0, idx_bytes, regfmt, sample,
|
||||
source_format, update, vecsize);
|
||||
}
|
||||
} else {
|
||||
/* On Valhall, ensure the table and index are valid for usage with
|
||||
* immediate form when IDVS isn't used */
|
||||
if (b->shader->arch >= 9)
|
||||
immediate &= va_is_valid_const_table(pan_res_handle_get_table(base)) &&
|
||||
pan_res_handle_get_index(base) < 256;
|
||||
|
||||
if (immediate) {
|
||||
bi_instr *I;
|
||||
|
||||
if (smooth) {
|
||||
I = bi_ld_var_imm_to(b, dest, src0, regfmt, sample, update, vecsize,
|
||||
pan_res_handle_get_index(imm_index));
|
||||
} else {
|
||||
I =
|
||||
bi_ld_var_flat_imm_to(b, dest, BI_FUNCTION_NONE, regfmt, vecsize,
|
||||
pan_res_handle_get_index(imm_index));
|
||||
}
|
||||
|
||||
/* Valhall usually uses LD_VAR_BUF. If this is disabled, use a simple
|
||||
* Midgard-style ABI. */
|
||||
if (b->shader->arch >= 9)
|
||||
I->table = va_res_fold_table_idx(pan_res_handle_get_table(base));
|
||||
} else {
|
||||
bi_index idx = bi_src_index(offset);
|
||||
|
||||
if (base != 0)
|
||||
idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
|
||||
|
||||
|
|
|
|||
|
|
@ -121,6 +121,10 @@ struct panfrost_compile_inputs {
|
|||
struct {
|
||||
uint32_t rt_conv[8];
|
||||
} bifrost;
|
||||
struct {
|
||||
/* Use LD_VAR_BUF[_IMM] instead of LD_VAR[_IMM] to load varyings. */
|
||||
bool use_ld_var_buf;
|
||||
} valhall;
|
||||
};
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -165,6 +165,73 @@ prepare_vs_driver_set(struct panvk_cmd_buffer *cmdbuf)
|
|||
return VK_SUCCESS;
|
||||
}
|
||||
|
||||
static uint32_t
|
||||
get_varying_slots(const struct panvk_cmd_buffer *cmdbuf)
|
||||
{
|
||||
const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
|
||||
const struct panvk_shader *fs = get_fs(cmdbuf);
|
||||
uint32_t varying_slots = 0;
|
||||
|
||||
if (fs) {
|
||||
unsigned vs_vars = vs->info.varyings.output_count;
|
||||
unsigned fs_vars = fs->info.varyings.input_count;
|
||||
varying_slots = MAX2(vs_vars, fs_vars);
|
||||
}
|
||||
|
||||
return varying_slots;
|
||||
}
|
||||
|
||||
static void
|
||||
emit_varying_descs(const struct panvk_cmd_buffer *cmdbuf,
|
||||
struct mali_attribute_packed *descs)
|
||||
{
|
||||
uint32_t varying_slots = get_varying_slots(cmdbuf);
|
||||
/* Assumes 16 byte slots. We could do better. */
|
||||
uint32_t varying_size = varying_slots * 16;
|
||||
|
||||
const struct panvk_shader *fs = get_fs(cmdbuf);
|
||||
|
||||
for (uint32_t i = 0; i < varying_slots; i++) {
|
||||
const struct pan_shader_varying *var = &fs->info.varyings.input[i];
|
||||
/* Skip special varyings. */
|
||||
if (var->location < VARYING_SLOT_VAR0)
|
||||
continue;
|
||||
|
||||
/* We currently always write out F32 in the vertex shaders, so the format
|
||||
* needs to reflect this. */
|
||||
enum pipe_format f = var->format;
|
||||
switch (f) {
|
||||
case PIPE_FORMAT_R16_FLOAT:
|
||||
f = PIPE_FORMAT_R32_FLOAT;
|
||||
break;
|
||||
case PIPE_FORMAT_R16G16_FLOAT:
|
||||
f = PIPE_FORMAT_R32G32_FLOAT;
|
||||
break;
|
||||
case PIPE_FORMAT_R16G16B16_FLOAT:
|
||||
f = PIPE_FORMAT_R32G32B32_FLOAT;
|
||||
break;
|
||||
case PIPE_FORMAT_R16G16B16A16_FLOAT:
|
||||
f = PIPE_FORMAT_R32G32B32A32_FLOAT;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
uint32_t loc = var->location - VARYING_SLOT_VAR0;
|
||||
pan_pack(&descs[i], ATTRIBUTE, cfg) {
|
||||
cfg.attribute_type = MALI_ATTRIBUTE_TYPE_VERTEX_PACKET;
|
||||
cfg.offset_enable = false;
|
||||
cfg.format = GENX(panfrost_format_from_pipe_format)(f)->hw;
|
||||
cfg.table = 61;
|
||||
cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_VERTEX;
|
||||
cfg.offset = 1024 + (loc * 16);
|
||||
cfg.buffer_index = 0;
|
||||
cfg.attribute_stride = varying_size;
|
||||
cfg.packet_stride = varying_size + 16;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static VkResult
|
||||
prepare_fs_driver_set(struct panvk_cmd_buffer *cmdbuf)
|
||||
{
|
||||
|
|
@ -172,7 +239,7 @@ prepare_fs_driver_set(struct panvk_cmd_buffer *cmdbuf)
|
|||
const struct panvk_shader *fs = cmdbuf->state.gfx.fs.shader;
|
||||
const struct panvk_descriptor_state *desc_state =
|
||||
&cmdbuf->state.gfx.desc_state;
|
||||
uint32_t desc_count = fs->desc_info.dyn_bufs.count + 1;
|
||||
uint32_t desc_count = fs->desc_info.dyn_bufs.count + MAX_VARYING + 1;
|
||||
struct panfrost_ptr driver_set = panvk_cmd_alloc_dev_mem(
|
||||
cmdbuf, desc, desc_count * PANVK_DESCRIPTOR_SIZE, PANVK_DESCRIPTOR_SIZE);
|
||||
struct panvk_opaque_desc *descs = driver_set.cpu;
|
||||
|
|
@ -180,13 +247,15 @@ prepare_fs_driver_set(struct panvk_cmd_buffer *cmdbuf)
|
|||
if (desc_count && !driver_set.gpu)
|
||||
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
|
||||
|
||||
/* Dummy sampler always comes first. */
|
||||
pan_cast_and_pack(&descs[0], SAMPLER, cfg) {
|
||||
emit_varying_descs(cmdbuf, (struct mali_attribute_packed *)(&descs[0]));
|
||||
|
||||
/* Dummy sampler always comes right after the varyings. */
|
||||
pan_cast_and_pack(&descs[MAX_VARYING], SAMPLER, cfg) {
|
||||
cfg.clamp_integer_array_indices = false;
|
||||
}
|
||||
|
||||
panvk_per_arch(cmd_fill_dyn_bufs)(desc_state, fs,
|
||||
(struct mali_buffer_packed *)(&descs[1]));
|
||||
panvk_per_arch(cmd_fill_dyn_bufs)(
|
||||
desc_state, fs, (struct mali_buffer_packed *)(&descs[1 + MAX_VARYING]));
|
||||
|
||||
fs_desc_state->driver_set.dev_addr = driver_set.gpu;
|
||||
fs_desc_state->driver_set.size = desc_count * PANVK_DESCRIPTOR_SIZE;
|
||||
|
|
@ -1650,16 +1719,8 @@ prepare_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
|
|||
if (result != VK_SUCCESS)
|
||||
return result;
|
||||
|
||||
uint32_t varying_size = 0;
|
||||
|
||||
if (fs) {
|
||||
unsigned vs_vars = vs->info.varyings.output_count;
|
||||
unsigned fs_vars = fs->info.varyings.input_count;
|
||||
unsigned var_slots = MAX2(vs_vars, fs_vars);
|
||||
|
||||
/* Assumes 16 byte slots. We could do better. */
|
||||
varying_size = var_slots * 16;
|
||||
}
|
||||
/* Assumes 16 byte slots. We could do better. */
|
||||
uint32_t varying_size = get_varying_slots(cmdbuf) * 16;
|
||||
|
||||
cs_update_vt_ctx(b) {
|
||||
/* We don't use the resource dep system yet. */
|
||||
|
|
|
|||
|
|
@ -1025,8 +1025,22 @@ create_copy_table(nir_shader *nir, struct lower_desc_ctx *ctx)
|
|||
for (uint32_t i = 0; i < PANVK_BIFROST_DESC_TABLE_COUNT; i++)
|
||||
copy_count += desc_info->others[i].count;
|
||||
#else
|
||||
/* Dummy sampler comes after the vertex attributes. */
|
||||
uint32_t dummy_sampler_idx = nir->info.stage == MESA_SHADER_VERTEX ? 16 : 0;
|
||||
uint32_t dummy_sampler_idx;
|
||||
switch (nir->info.stage) {
|
||||
case MESA_SHADER_VERTEX:
|
||||
/* Dummy sampler comes after the vertex attributes. */
|
||||
dummy_sampler_idx = 16;
|
||||
break;
|
||||
case MESA_SHADER_FRAGMENT:
|
||||
/* Dummy sampler comes after the varyings. */
|
||||
dummy_sampler_idx = MAX_VARYING;
|
||||
break;
|
||||
case MESA_SHADER_COMPUTE:
|
||||
dummy_sampler_idx = 0;
|
||||
break;
|
||||
default:
|
||||
unreachable("unexpected stage");
|
||||
}
|
||||
desc_info->dummy_sampler_handle = pan_res_handle(0, dummy_sampler_idx);
|
||||
|
||||
copy_count = desc_info->dyn_bufs.count + desc_info->dyn_bufs.count;
|
||||
|
|
|
|||
|
|
@ -1041,6 +1041,10 @@ panvk_compile_shader(struct panvk_device *dev,
|
|||
.gpu_id = phys_dev->kmod.props.gpu_prod_id,
|
||||
.no_ubo_to_push = true,
|
||||
.view_mask = (state && state->rp) ? state->rp->view_mask : 0,
|
||||
#if PAN_ARCH >= 9
|
||||
/* LD_VAR_BUF does not support maxVertexOutputComponents (128) */
|
||||
.valhall.use_ld_var_buf = false,
|
||||
#endif
|
||||
};
|
||||
|
||||
if (info->stage == MESA_SHADER_FRAGMENT && state != NULL &&
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue