panvk: Extend the shader logic to support Valhall

Co-developed-by: Boris Brezillon <boris.brezillon@collabora.com>
Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Mary Guillemard <mary.guillemard@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30736>
This commit is contained in:
Mary Guillemard 2024-06-28 10:05:15 +02:00 committed by Marge Bot
parent e350c334b6
commit 27beadcbdb
3 changed files with 189 additions and 3 deletions

View file

@ -81,6 +81,7 @@ foreach arch : [6, 7, 9, 10]
'panvk_vX_descriptor_set.c',
'panvk_vX_descriptor_set_layout.c',
'panvk_vX_nir_lower_descriptors.c',
'panvk_vX_shader.c',
]
else
per_arch_files = common_per_arch_files

View file

@ -149,7 +149,19 @@ struct panvk_shader {
uint32_t bin_size;
struct panvk_priv_mem code_mem;
#if PAN_ARCH <= 7
struct panvk_priv_mem rsd;
#else
union {
struct panvk_priv_mem spd;
struct {
struct panvk_priv_mem pos_points;
struct panvk_priv_mem pos_triangles;
struct panvk_priv_mem var;
} spds;
};
#endif
const char *nir_str;
const char *asm_str;

View file

@ -29,7 +29,11 @@
#include "genxml/gen_macros.h"
/* FIXME: make the include statement unconditional when the CSF command buffer
* logic is implemented. */
#if PAN_ARCH <= 7
#include "panvk_cmd_buffer.h"
#endif
#include "panvk_device.h"
#include "panvk_instance.h"
#include "panvk_mempool.h"
@ -122,11 +126,13 @@ panvk_lower_sysvals(nir_builder *b, nir_instr *instr, void *data)
bit_size, num_comps);
break;
#if PAN_ARCH <= 7
case nir_intrinsic_load_layer_id:
assert(b->shader->info.stage == MESA_SHADER_FRAGMENT);
val = load_sysval_from_push_const(b, SYSVAL(graphics, layer_id), bit_size,
num_comps);
break;
#endif
default:
return false;
@ -138,6 +144,7 @@ panvk_lower_sysvals(nir_builder *b, nir_instr *instr, void *data)
return true;
}
#if PAN_ARCH <= 7
static bool
lower_gl_pos_layer_writes(nir_builder *b, nir_instr *instr, void *data)
{
@ -213,6 +220,7 @@ lower_layer_writes(nir_shader *nir)
nir, lower_gl_pos_layer_writes,
nir_metadata_block_index | nir_metadata_dominance, temp_layer_var);
}
#endif
static void
shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
@ -232,7 +240,8 @@ panvk_buffer_ubo_addr_format(VkPipelineRobustnessBufferBehaviorEXT robustness)
case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT:
case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_EXT:
case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT:
return nir_address_format_32bit_index_offset;
return PAN_ARCH <= 7 ? nir_address_format_32bit_index_offset
: nir_address_format_vec2_index_32bit_offset;
default:
unreachable("Invalid robust buffer access behavior");
}
@ -243,10 +252,12 @@ panvk_buffer_ssbo_addr_format(VkPipelineRobustnessBufferBehaviorEXT robustness)
{
switch (robustness) {
case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT:
return nir_address_format_64bit_global_32bit_offset;
return PAN_ARCH <= 7 ? nir_address_format_64bit_global_32bit_offset
: nir_address_format_vec2_index_32bit_offset;
case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_EXT:
case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT:
return nir_address_format_64bit_bounded_global;
return PAN_ARCH <= 7 ? nir_address_format_64bit_bounded_global
: nir_address_format_vec2_index_32bit_offset;
default:
unreachable("Invalid robust buffer access behavior");
}
@ -282,9 +293,11 @@ panvk_preprocess_nir(UNUSED struct vk_physical_device *vk_pdev, nir_shader *nir)
NIR_PASS_V(nir, nir_lower_io_to_temporaries, nir_shader_get_entrypoint(nir),
true, true);
#if PAN_ARCH <= 7
/* This needs to be done just after the io_to_temporaries pass, because we
* rely on in/out temporaries to collect the final layer_id value. */
NIR_PASS_V(nir, lower_layer_writes);
#endif
NIR_PASS_V(nir, nir_lower_indirect_derefs,
nir_var_shader_in | nir_var_shader_out, UINT32_MAX);
@ -361,6 +374,36 @@ panvk_hash_graphics_state(struct vk_physical_device *device,
_mesa_blake3_final(&blake3_ctx, blake3_out);
}
#if PAN_ARCH >= 9
/* NIR instruction callback: collapse the vec2 (table index, descriptor index)
 * buffer source of UBO/SSBO loads into the single 32-bit index the Valhall
 * backend expects. Returns true when the instruction was rewritten. */
static bool
valhall_pack_buf_idx(nir_builder *b, nir_instr *instr, UNUSED void *data)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   const bool is_buf_load = intrin->intrinsic == nir_intrinsic_load_ubo ||
                            intrin->intrinsic == nir_intrinsic_load_ssbo;
   if (!is_buf_load)
      return false;

   b->cursor = nir_before_instr(&intrin->instr);

   /* The valhall backend expects nir_address_format_32bit_index_offset,
    * but address mode is nir_address_format_vec2_index_32bit_offset to allow
    * us to store the array size, set and index without losing information
    * while walking the descriptor deref chain (needed to do a bound check on
    * the array index when we reach the end of the chain).
    * Turn it back to nir_address_format_32bit_index_offset after IOs
    * have been lowered. */
   nir_def *idx = intrin->src[0].ssa;
   nir_def *packed =
      nir_iadd(b, nir_channel(b, idx, 0), nir_channel(b, idx, 1));
   nir_src_rewrite(&intrin->src[0], packed);

   return true;
}
#endif
static void
panvk_lower_nir(struct panvk_device *dev, nir_shader *nir,
uint32_t set_layout_count,
@ -388,6 +431,11 @@ panvk_lower_nir(struct panvk_device *dev, nir_shader *nir,
NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_global,
nir_address_format_64bit_global);
#if PAN_ARCH >= 9
NIR_PASS_V(nir, nir_shader_instructions_pass, valhall_pack_buf_idx,
nir_metadata_block_index | nir_metadata_dominance, NULL);
#endif
if (gl_shader_stage_uses_workgroup(stage)) {
if (!nir->info.shared_memory_explicit_layout) {
NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, nir_var_mem_shared,
@ -491,6 +539,7 @@ panvk_compile_nir(struct panvk_device *dev, nir_shader *nir,
shader->asm_str = asm_str;
}
#if PAN_ARCH <= 7
/* Patch the descriptor count */
shader->info.ubo_count =
shader->desc_info.others.count[PANVK_BIFROST_DESC_TABLE_UBO] +
@ -527,6 +576,7 @@ panvk_compile_nir(struct panvk_device *dev, nir_shader *nir,
shader->info.attribute_count =
shader->desc_info.others.count[PANVK_BIFROST_DESC_TABLE_IMG] +
(nir->info.stage == MESA_SHADER_VERTEX ? MAX_VS_ATTRIBS : 0);
#endif
shader->local_size.x = nir->info.workgroup_size[0];
shader->local_size.y = nir->info.workgroup_size[1];
@ -535,12 +585,36 @@ panvk_compile_nir(struct panvk_device *dev, nir_shader *nir,
return VK_SUCCESS;
}
#if PAN_ARCH >= 9
static enum mali_flush_to_zero_mode
shader_ftz_mode(struct panvk_shader *shader)
{
if (shader->info.ftz_fp32) {
if (shader->info.ftz_fp16)
return MALI_FLUSH_TO_ZERO_MODE_ALWAYS;
else
return MALI_FLUSH_TO_ZERO_MODE_DX11;
} else {
/* We don't have a "flush FP16, preserve FP32" mode, but APIs
* should not be able to generate that.
*/
assert(!shader->info.ftz_fp16 && !shader->info.ftz_fp32);
return MALI_FLUSH_TO_ZERO_MODE_PRESERVE_SUBNORMALS;
}
}
#endif
static VkResult
panvk_shader_upload(struct panvk_device *dev, struct panvk_shader *shader,
const VkAllocationCallbacks *pAllocator)
{
shader->code_mem = (struct panvk_priv_mem){0};
#if PAN_ARCH <= 7
shader->rsd = (struct panvk_priv_mem){0};
#else
shader->spd = (struct panvk_priv_mem){0};
#endif
if (!shader->bin_size)
return VK_SUCCESS;
@ -548,6 +622,7 @@ panvk_shader_upload(struct panvk_device *dev, struct panvk_shader *shader,
shader->code_mem = panvk_pool_upload_aligned(
&dev->mempools.exec, shader->bin_ptr, shader->bin_size, 128);
#if PAN_ARCH <= 7
if (shader->info.stage == MESA_SHADER_FRAGMENT)
return VK_SUCCESS;
@ -557,6 +632,73 @@ panvk_shader_upload(struct panvk_device *dev, struct panvk_shader *shader,
pan_shader_prepare_rsd(&shader->info, panvk_shader_get_dev_addr(shader),
&cfg);
}
#else
if (shader->info.stage != MESA_SHADER_VERTEX) {
shader->spd = panvk_pool_alloc_desc(&dev->mempools.rw, SHADER_PROGRAM);
pan_pack(panvk_priv_mem_host_addr(shader->spd), SHADER_PROGRAM, cfg) {
cfg.stage = pan_shader_stage(&shader->info);
if (cfg.stage == MALI_SHADER_STAGE_FRAGMENT)
cfg.fragment_coverage_bitmask_type = MALI_COVERAGE_BITMASK_TYPE_GL;
else if (cfg.stage == MALI_SHADER_STAGE_VERTEX)
cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF;
cfg.register_allocation =
pan_register_allocation(shader->info.work_reg_count);
cfg.binary = panvk_shader_get_dev_addr(shader);
cfg.preload.r48_r63 = (shader->info.preload >> 48);
cfg.flush_to_zero_mode = shader_ftz_mode(shader);
if (cfg.stage == MALI_SHADER_STAGE_FRAGMENT)
cfg.requires_helper_threads = shader->info.contains_barrier;
}
} else {
shader->spds.pos_points =
panvk_pool_alloc_desc(&dev->mempools.rw, SHADER_PROGRAM);
pan_pack(panvk_priv_mem_host_addr(shader->spds.pos_points),
SHADER_PROGRAM, cfg) {
cfg.stage = pan_shader_stage(&shader->info);
cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF;
cfg.register_allocation =
pan_register_allocation(shader->info.work_reg_count);
cfg.binary = panvk_shader_get_dev_addr(shader);
cfg.preload.r48_r63 = (shader->info.preload >> 48);
cfg.flush_to_zero_mode = shader_ftz_mode(shader);
}
shader->spds.pos_triangles =
panvk_pool_alloc_desc(&dev->mempools.rw, SHADER_PROGRAM);
pan_pack(panvk_priv_mem_host_addr(shader->spds.pos_triangles),
SHADER_PROGRAM, cfg) {
cfg.stage = pan_shader_stage(&shader->info);
cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF;
cfg.register_allocation =
pan_register_allocation(shader->info.work_reg_count);
cfg.binary =
panvk_shader_get_dev_addr(shader) + shader->info.vs.no_psiz_offset;
cfg.preload.r48_r63 = (shader->info.preload >> 48);
cfg.flush_to_zero_mode = shader_ftz_mode(shader);
}
if (shader->info.vs.secondary_enable) {
shader->spds.var =
panvk_pool_alloc_desc(&dev->mempools.rw, SHADER_PROGRAM);
pan_pack(panvk_priv_mem_host_addr(shader->spds.var), SHADER_PROGRAM,
cfg) {
unsigned work_count = shader->info.vs.secondary_work_reg_count;
cfg.stage = pan_shader_stage(&shader->info);
cfg.vertex_warp_limit = MALI_WARP_LIMIT_FULL;
cfg.register_allocation = pan_register_allocation(work_count);
cfg.binary = panvk_shader_get_dev_addr(shader) +
shader->info.vs.secondary_offset;
cfg.preload.r48_r63 = (shader->info.vs.secondary_preload >> 48);
cfg.flush_to_zero_mode = shader_ftz_mode(shader);
}
}
}
#endif
return VK_SUCCESS;
}
@ -573,8 +715,13 @@ panvk_shader_destroy(struct vk_device *vk_dev, struct vk_shader *vk_shader,
ralloc_free((void *)shader->nir_str);
panvk_pool_free_mem(&dev->mempools.exec, shader->code_mem);
#if PAN_ARCH <= 7
panvk_pool_free_mem(&dev->mempools.exec, shader->rsd);
panvk_pool_free_mem(&dev->mempools.exec, shader->desc_info.others.map);
#else
panvk_pool_free_mem(&dev->mempools.exec, shader->spd);
#endif
free((void *)shader->bin_ptr);
vk_shader_free(&dev->vk, pAllocator, &shader->vk);
@ -677,6 +824,8 @@ shader_desc_info_deserialize(struct blob_reader *blob,
struct panvk_shader *shader)
{
shader->desc_info.used_set_mask = blob_read_uint32(blob);
#if PAN_ARCH <= 7
shader->desc_info.dyn_ubos.count = blob_read_uint32(blob);
blob_copy_bytes(blob, shader->desc_info.dyn_ubos.map,
shader->desc_info.dyn_ubos.count);
@ -706,6 +855,11 @@ shader_desc_info_deserialize(struct blob_reader *blob,
blob_copy_bytes(blob, copy_table, others_count * sizeof(*copy_table));
}
#else
shader->desc_info.dyn_bufs.count = blob_read_uint32(blob);
blob_copy_bytes(blob, shader->desc_info.dyn_bufs.map,
shader->desc_info.dyn_bufs.count);
#endif
return VK_SUCCESS;
}
@ -776,6 +930,8 @@ static void
shader_desc_info_serialize(struct blob *blob, const struct panvk_shader *shader)
{
blob_write_uint32(blob, shader->desc_info.used_set_mask);
#if PAN_ARCH <= 7
blob_write_uint32(blob, shader->desc_info.dyn_ubos.count);
blob_write_bytes(blob, shader->desc_info.dyn_ubos.map,
sizeof(*shader->desc_info.dyn_ubos.map) *
@ -794,6 +950,12 @@ shader_desc_info_serialize(struct blob *blob, const struct panvk_shader *shader)
blob_write_bytes(blob,
panvk_priv_mem_host_addr(shader->desc_info.others.map),
sizeof(uint32_t) * others_count);
#else
blob_write_uint32(blob, shader->desc_info.dyn_bufs.count);
blob_write_bytes(blob, shader->desc_info.dyn_bufs.map,
sizeof(*shader->desc_info.dyn_bufs.map) *
shader->desc_info.dyn_bufs.count);
#endif
}
static bool
@ -1070,6 +1232,12 @@ panvk_per_arch(link_shaders)(struct panvk_pool *desc_pool,
assert(vs);
assert(vs->info.stage == MESA_SHADER_VERTEX);
if (PAN_ARCH >= 9) {
link->buf_strides[PANVK_VARY_BUF_GENERAL] =
MAX2(fs->info.varyings.input_count, vs->info.varyings.output_count);
return;
}
collect_varyings_info(vs->info.varyings.output,
vs->info.varyings.output_count, &out_vars);
@ -1157,5 +1325,10 @@ const struct vk_device_shader_ops panvk_per_arch(device_shader_ops) = {
.compile = panvk_compile_shaders,
.deserialize = panvk_deserialize_shader,
.cmd_set_dynamic_graphics_state = vk_cmd_set_dynamic_graphics_state,
/* FIXME: make the assignment unconditional when the CSF command buffer logic is
* implemented. */
#if PAN_ARCH <= 7
.cmd_bind_shaders = panvk_per_arch(cmd_bind_shaders),
#endif
};