diff --git a/src/panfrost/vulkan/meson.build b/src/panfrost/vulkan/meson.build
index b6b69d00cfd..1c4c7e81bb1 100644
--- a/src/panfrost/vulkan/meson.build
+++ b/src/panfrost/vulkan/meson.build
@@ -81,6 +81,7 @@ foreach arch : [6, 7, 9, 10]
       'panvk_vX_descriptor_set.c',
       'panvk_vX_descriptor_set_layout.c',
       'panvk_vX_nir_lower_descriptors.c',
+      'panvk_vX_shader.c',
     ]
   else
     per_arch_files = common_per_arch_files
diff --git a/src/panfrost/vulkan/panvk_shader.h b/src/panfrost/vulkan/panvk_shader.h
index 7f9c6ca18a9..65724b2de83 100644
--- a/src/panfrost/vulkan/panvk_shader.h
+++ b/src/panfrost/vulkan/panvk_shader.h
@@ -149,7 +149,19 @@ struct panvk_shader {
    uint32_t bin_size;
 
    struct panvk_priv_mem code_mem;
+
+#if PAN_ARCH <= 7
    struct panvk_priv_mem rsd;
+#else
+   union {
+      struct panvk_priv_mem spd;
+      struct {
+         struct panvk_priv_mem pos_points;
+         struct panvk_priv_mem pos_triangles;
+         struct panvk_priv_mem var;
+      } spds;
+   };
+#endif
 
    const char *nir_str;
    const char *asm_str;
diff --git a/src/panfrost/vulkan/panvk_vX_shader.c b/src/panfrost/vulkan/panvk_vX_shader.c
index 92dd7e36e33..0c794996e47 100644
--- a/src/panfrost/vulkan/panvk_vX_shader.c
+++ b/src/panfrost/vulkan/panvk_vX_shader.c
@@ -29,7 +29,11 @@
 
 #include "genxml/gen_macros.h"
 
+/* FIXME: make the include statement unconditional when the CSF command buffer
+ * logic is implemented. */
+#if PAN_ARCH <= 7
 #include "panvk_cmd_buffer.h"
+#endif
 #include "panvk_device.h"
 #include "panvk_instance.h"
 #include "panvk_mempool.h"
@@ -122,11 +126,13 @@ panvk_lower_sysvals(nir_builder *b, nir_instr *instr, void *data)
                                       bit_size, num_comps);
       break;
 
+#if PAN_ARCH <= 7
    case nir_intrinsic_load_layer_id:
       assert(b->shader->info.stage == MESA_SHADER_FRAGMENT);
       val = load_sysval_from_push_const(b, SYSVAL(graphics, layer_id),
                                         bit_size, num_comps);
       break;
+#endif
 
    default:
       return false;
@@ -138,6 +144,7 @@ panvk_lower_sysvals(nir_builder *b, nir_instr *instr, void *data)
    return true;
 }
 
+#if PAN_ARCH <= 7
 static bool
 lower_gl_pos_layer_writes(nir_builder *b, nir_instr *instr, void *data)
 {
@@ -213,6 +220,7 @@ lower_layer_writes(nir_shader *nir)
       nir, lower_gl_pos_layer_writes,
       nir_metadata_block_index | nir_metadata_dominance, temp_layer_var);
 }
+#endif
 
 static void
 shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align)
@@ -232,7 +240,8 @@ panvk_buffer_ubo_addr_format(VkPipelineRobustnessBufferBehaviorEXT robustness)
    case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT:
    case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_EXT:
    case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT:
-      return nir_address_format_32bit_index_offset;
+      return PAN_ARCH <= 7 ? nir_address_format_32bit_index_offset
+                           : nir_address_format_vec2_index_32bit_offset;
    default:
       unreachable("Invalid robust buffer access behavior");
    }
@@ -243,10 +252,12 @@ panvk_buffer_ssbo_addr_format(VkPipelineRobustnessBufferBehaviorEXT robustness)
 {
    switch (robustness) {
    case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT:
-      return nir_address_format_64bit_global_32bit_offset;
+      return PAN_ARCH <= 7 ? nir_address_format_64bit_global_32bit_offset
+                           : nir_address_format_vec2_index_32bit_offset;
    case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_EXT:
    case VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_ROBUST_BUFFER_ACCESS_2_EXT:
-      return nir_address_format_64bit_bounded_global;
+      return PAN_ARCH <= 7 ? nir_address_format_64bit_bounded_global
+                           : nir_address_format_vec2_index_32bit_offset;
    default:
       unreachable("Invalid robust buffer access behavior");
    }
@@ -282,9 +293,11 @@ panvk_preprocess_nir(UNUSED struct vk_physical_device *vk_pdev, nir_shader *nir)
    NIR_PASS_V(nir, nir_lower_io_to_temporaries,
               nir_shader_get_entrypoint(nir), true, true);
 
+#if PAN_ARCH <= 7
    /* This needs to be done just after the io_to_temporaries pass, because we
    * rely on in/out temporaries to collect the final layer_id value. */
    NIR_PASS_V(nir, lower_layer_writes);
+#endif
 
    NIR_PASS_V(nir, nir_lower_indirect_derefs,
               nir_var_shader_in | nir_var_shader_out, UINT32_MAX);
@@ -361,6 +374,36 @@ panvk_hash_graphics_state(struct vk_physical_device *device,
    _mesa_blake3_final(&blake3_ctx, blake3_out);
 }
 
+#if PAN_ARCH >= 9
+static bool
+valhall_pack_buf_idx(nir_builder *b, nir_instr *instr, UNUSED void *data)
+{
+   if (instr->type != nir_instr_type_intrinsic)
+      return false;
+
+   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+
+   if (intrin->intrinsic != nir_intrinsic_load_ubo &&
+       intrin->intrinsic != nir_intrinsic_load_ssbo)
+      return false;
+
+   b->cursor = nir_before_instr(&intrin->instr);
+   nir_def *index = intrin->src[0].ssa;
+
+   /* The Valhall backend expects nir_address_format_32bit_index_offset,
+    * but we use nir_address_format_vec2_index_32bit_offset so we can keep
+    * the array size, set and index around while walking the descriptor
+    * deref chain (we need them to bound-check the array index when we
+    * reach the end of the chain). Turn the vec2 index back into the
+    * nir_address_format_32bit_index_offset form expected by the backend
+    * once IOs have been lowered. */
+   nir_def *packed_index =
+      nir_iadd(b, nir_channel(b, index, 0), nir_channel(b, index, 1));
+   nir_src_rewrite(&intrin->src[0], packed_index);
+   return true;
+}
+#endif
+
 static void
 panvk_lower_nir(struct panvk_device *dev, nir_shader *nir,
                 uint32_t set_layout_count,
@@ -388,6 +431,11 @@ panvk_lower_nir(struct panvk_device *dev, nir_shader *nir,
    NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_global,
               nir_address_format_64bit_global);
 
+#if PAN_ARCH >= 9
+   NIR_PASS_V(nir, nir_shader_instructions_pass, valhall_pack_buf_idx,
+              nir_metadata_block_index | nir_metadata_dominance, NULL);
+#endif
+
    if (gl_shader_stage_uses_workgroup(stage)) {
       if (!nir->info.shared_memory_explicit_layout) {
          NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, nir_var_mem_shared,
@@ -491,6 +539,7 @@ panvk_compile_nir(struct panvk_device *dev, nir_shader *nir,
       shader->asm_str = asm_str;
    }
 
+#if PAN_ARCH <= 7
    /* Patch the descriptor count */
    shader->info.ubo_count =
       shader->desc_info.others.count[PANVK_BIFROST_DESC_TABLE_UBO] +
@@ -527,6 +576,7 @@ panvk_compile_nir(struct panvk_device *dev, nir_shader *nir,
    shader->info.attribute_count =
       shader->desc_info.others.count[PANVK_BIFROST_DESC_TABLE_IMG] +
       (nir->info.stage == MESA_SHADER_VERTEX ? MAX_VS_ATTRIBS : 0);
+#endif
 
    shader->local_size.x = nir->info.workgroup_size[0];
    shader->local_size.y = nir->info.workgroup_size[1];
@@ -535,12 +585,36 @@ panvk_compile_nir(struct panvk_device *dev, nir_shader *nir,
    return VK_SUCCESS;
 }
 
+#if PAN_ARCH >= 9
+static enum mali_flush_to_zero_mode
+shader_ftz_mode(struct panvk_shader *shader)
+{
+   if (shader->info.ftz_fp32) {
+      if (shader->info.ftz_fp16)
+         return MALI_FLUSH_TO_ZERO_MODE_ALWAYS;
+      else
+         return MALI_FLUSH_TO_ZERO_MODE_DX11;
+   } else {
+      /* We don't have a "flush FP16, preserve FP32" mode, but APIs
+       * should not be able to generate that.
+       */
+      assert(!shader->info.ftz_fp16 && !shader->info.ftz_fp32);
+      return MALI_FLUSH_TO_ZERO_MODE_PRESERVE_SUBNORMALS;
+   }
+}
+#endif
+
 static VkResult
 panvk_shader_upload(struct panvk_device *dev, struct panvk_shader *shader,
                     const VkAllocationCallbacks *pAllocator)
 {
    shader->code_mem = (struct panvk_priv_mem){0};
+
+#if PAN_ARCH <= 7
    shader->rsd = (struct panvk_priv_mem){0};
+#else
+   shader->spd = (struct panvk_priv_mem){0};
+#endif
 
    if (!shader->bin_size)
       return VK_SUCCESS;
@@ -548,6 +622,7 @@ panvk_shader_upload(struct panvk_device *dev, struct panvk_shader *shader,
    shader->code_mem = panvk_pool_upload_aligned(
       &dev->mempools.exec, shader->bin_ptr, shader->bin_size, 128);
 
+#if PAN_ARCH <= 7
    if (shader->info.stage == MESA_SHADER_FRAGMENT)
       return VK_SUCCESS;
 
@@ -557,6 +632,73 @@ panvk_shader_upload(struct panvk_device *dev, struct panvk_shader *shader,
       pan_shader_prepare_rsd(&shader->info, panvk_shader_get_dev_addr(shader),
                              &cfg);
    }
+#else
+   if (shader->info.stage != MESA_SHADER_VERTEX) {
+      shader->spd = panvk_pool_alloc_desc(&dev->mempools.rw, SHADER_PROGRAM);
+
+      pan_pack(panvk_priv_mem_host_addr(shader->spd), SHADER_PROGRAM, cfg) {
+         cfg.stage = pan_shader_stage(&shader->info);
+
+         if (cfg.stage == MALI_SHADER_STAGE_FRAGMENT)
+            cfg.fragment_coverage_bitmask_type = MALI_COVERAGE_BITMASK_TYPE_GL;
+         else if (cfg.stage == MALI_SHADER_STAGE_VERTEX)
+            cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF;
+
+         cfg.register_allocation =
+            pan_register_allocation(shader->info.work_reg_count);
+         cfg.binary = panvk_shader_get_dev_addr(shader);
+         cfg.preload.r48_r63 = (shader->info.preload >> 48);
+         cfg.flush_to_zero_mode = shader_ftz_mode(shader);
+
+         if (cfg.stage == MALI_SHADER_STAGE_FRAGMENT)
+            cfg.requires_helper_threads = shader->info.contains_barrier;
+      }
+   } else {
+      shader->spds.pos_points =
+         panvk_pool_alloc_desc(&dev->mempools.rw, SHADER_PROGRAM);
+      pan_pack(panvk_priv_mem_host_addr(shader->spds.pos_points),
+               SHADER_PROGRAM, cfg) {
+         cfg.stage = pan_shader_stage(&shader->info);
+         cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF;
+         cfg.register_allocation =
+            pan_register_allocation(shader->info.work_reg_count);
+         cfg.binary = panvk_shader_get_dev_addr(shader);
+         cfg.preload.r48_r63 = (shader->info.preload >> 48);
+         cfg.flush_to_zero_mode = shader_ftz_mode(shader);
+      }
+
+      shader->spds.pos_triangles =
+         panvk_pool_alloc_desc(&dev->mempools.rw, SHADER_PROGRAM);
+      pan_pack(panvk_priv_mem_host_addr(shader->spds.pos_triangles),
+               SHADER_PROGRAM, cfg) {
+         cfg.stage = pan_shader_stage(&shader->info);
+         cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF;
+         cfg.register_allocation =
+            pan_register_allocation(shader->info.work_reg_count);
+         cfg.binary =
+            panvk_shader_get_dev_addr(shader) + shader->info.vs.no_psiz_offset;
+         cfg.preload.r48_r63 = (shader->info.preload >> 48);
+         cfg.flush_to_zero_mode = shader_ftz_mode(shader);
+      }
+
+      if (shader->info.vs.secondary_enable) {
+         shader->spds.var =
+            panvk_pool_alloc_desc(&dev->mempools.rw, SHADER_PROGRAM);
+         pan_pack(panvk_priv_mem_host_addr(shader->spds.var), SHADER_PROGRAM,
+                  cfg) {
+            unsigned work_count = shader->info.vs.secondary_work_reg_count;
+
+            cfg.stage = pan_shader_stage(&shader->info);
+            cfg.vertex_warp_limit = MALI_WARP_LIMIT_FULL;
+            cfg.register_allocation = pan_register_allocation(work_count);
+            cfg.binary = panvk_shader_get_dev_addr(shader) +
+                         shader->info.vs.secondary_offset;
+            cfg.preload.r48_r63 = (shader->info.vs.secondary_preload >> 48);
+            cfg.flush_to_zero_mode = shader_ftz_mode(shader);
+         }
+      }
+   }
+#endif
 
    return VK_SUCCESS;
 }
@@ -573,8 +715,19 @@ panvk_shader_destroy(struct vk_device *vk_dev, struct vk_shader *vk_shader,
    ralloc_free((void *)shader->nir_str);
 
    panvk_pool_free_mem(&dev->mempools.exec, shader->code_mem);
+
+#if PAN_ARCH <= 7
    panvk_pool_free_mem(&dev->mempools.exec, shader->rsd);
    panvk_pool_free_mem(&dev->mempools.exec, shader->desc_info.others.map);
+#else
+   if (shader->info.stage == MESA_SHADER_VERTEX) {
+      panvk_pool_free_mem(&dev->mempools.exec, shader->spds.pos_points);
+      panvk_pool_free_mem(&dev->mempools.exec, shader->spds.pos_triangles);
+      panvk_pool_free_mem(&dev->mempools.exec, shader->spds.var);
+   } else {
+      panvk_pool_free_mem(&dev->mempools.exec, shader->spd);
+   }
+#endif
 
    free((void *)shader->bin_ptr);
    vk_shader_free(&dev->vk, pAllocator, &shader->vk);
@@ -677,6 +830,8 @@ shader_desc_info_deserialize(struct blob_reader *blob,
                              struct panvk_shader *shader)
 {
    shader->desc_info.used_set_mask = blob_read_uint32(blob);
+
+#if PAN_ARCH <= 7
    shader->desc_info.dyn_ubos.count = blob_read_uint32(blob);
    blob_copy_bytes(blob, shader->desc_info.dyn_ubos.map,
                    sizeof(*shader->desc_info.dyn_ubos.map) * shader->desc_info.dyn_ubos.count);
@@ -706,6 +861,11 @@ shader_desc_info_deserialize(struct blob_reader *blob,
       blob_copy_bytes(blob, copy_table, others_count * sizeof(*copy_table));
    }
 
+#else
+   shader->desc_info.dyn_bufs.count = blob_read_uint32(blob);
+   blob_copy_bytes(blob, shader->desc_info.dyn_bufs.map,
+                   sizeof(*shader->desc_info.dyn_bufs.map) * shader->desc_info.dyn_bufs.count);
+#endif
 
    return VK_SUCCESS;
 }
@@ -776,6 +936,8 @@ static void
 shader_desc_info_serialize(struct blob *blob, const struct panvk_shader *shader)
 {
    blob_write_uint32(blob, shader->desc_info.used_set_mask);
+
+#if PAN_ARCH <= 7
    blob_write_uint32(blob, shader->desc_info.dyn_ubos.count);
    blob_write_bytes(blob, shader->desc_info.dyn_ubos.map,
                     sizeof(*shader->desc_info.dyn_ubos.map) * shader->desc_info.dyn_ubos.count);
@@ -794,6 +956,12 @@ shader_desc_info_serialize(struct blob *blob, const struct panvk_shader *shader)
    blob_write_bytes(blob,
                     panvk_priv_mem_host_addr(shader->desc_info.others.map),
                     sizeof(uint32_t) * others_count);
+#else
+   blob_write_uint32(blob, shader->desc_info.dyn_bufs.count);
+   blob_write_bytes(blob, shader->desc_info.dyn_bufs.map,
+                    sizeof(*shader->desc_info.dyn_bufs.map) *
+                       shader->desc_info.dyn_bufs.count);
+#endif
 }
 
 static bool
@@ -1070,6 +1238,12 @@ panvk_per_arch(link_shaders)(struct panvk_pool *desc_pool,
    assert(vs);
    assert(vs->info.stage == MESA_SHADER_VERTEX);
 
+   if (PAN_ARCH >= 9) {
+      link->buf_strides[PANVK_VARY_BUF_GENERAL] =
+         MAX2(fs->info.varyings.input_count, vs->info.varyings.output_count);
+      return;
+   }
+
    collect_varyings_info(vs->info.varyings.output,
                          vs->info.varyings.output_count, &out_vars);
 
@@ -1157,5 +1331,10 @@ const struct vk_device_shader_ops panvk_per_arch(device_shader_ops) = {
    .compile = panvk_compile_shaders,
    .deserialize = panvk_deserialize_shader,
    .cmd_set_dynamic_graphics_state = vk_cmd_set_dynamic_graphics_state,
+
+/* FIXME: make the assignment unconditional when the CSF command buffer logic
+ * is implemented. */
+#if PAN_ARCH <= 7
    .cmd_bind_shaders = panvk_per_arch(cmd_bind_shaders),
+#endif
 };
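
The index packing performed by valhall_pack_buf_idx is easier to see outside of NIR. The sketch below is illustrative only: the struct and helper names are made up, not panvk or NIR API, and the exact meaning of the two vec2 components is an assumption for illustration. What it demonstrates is the mechanism the pass relies on: the vec2 form carries enough information for a late bound check on the array index, and once that check has been done, collapsing back to the single 32-bit index the Valhall backend expects is a plain add of the two components, matching the nir_iadd() of channels 0 and 1 in the pass.

/* Illustrative sketch only; mirrors valhall_pack_buf_idx with made-up types. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct vec2_buf_index {
   uint32_t base;        /* channel 0: hypothetical base slot of the binding */
   uint32_t array_index; /* channel 1: hypothetical element within the binding */
};

static uint32_t
pack_buf_index(struct vec2_buf_index idx, uint32_t array_size)
{
   /* The vec2 form exists so this check can still happen after the
    * descriptor deref chain has been fully walked. */
   assert(idx.array_index < array_size);

   /* Equivalent of nir_iadd(b, nir_channel(b, index, 0),
    *                           nir_channel(b, index, 1)). */
   return idx.base + idx.array_index;
}

int main(void)
{
   struct vec2_buf_index idx = {.base = 16, .array_index = 3};

   printf("packed index: %u\n", pack_buf_index(idx, 8)); /* prints 19 */
   return 0;
}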
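As a summary of the new v9+ upload paths in panvk_shader_upload: non-vertex stages get a single shader program descriptor (spd), while vertex shaders get up to three, all pointing into the same uploaded binary at different entry points. Field names below are taken from the diff; the reading of each variant is an interpretation of those names:

   spds.pos_points     binary = base                            position variant for point primitives
   spds.pos_triangles  binary = base + info.vs.no_psiz_offset   position variant for triangles (no point size)
   spds.var            binary = base + info.vs.secondary_offset varying shader, only if info.vs.secondary_enable

The varying variant also switches to MALI_WARP_LIMIT_FULL and uses its own secondary_work_reg_count and secondary_preload values rather than the primary shader's.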