diff --git a/src/gallium/drivers/panfrost/pan_shader.c b/src/gallium/drivers/panfrost/pan_shader.c
index d9de7e191e9..f0ad06fee4c 100644
--- a/src/gallium/drivers/panfrost/pan_shader.c
+++ b/src/gallium/drivers/panfrost/pan_shader.c
@@ -132,6 +132,10 @@ panfrost_shader_compile(struct panfrost_screen *screen, const nir_shader *ir,
       .gpu_id = panfrost_device_gpu_id(dev),
    };
 
+   /* Use LD_VAR_BUF for varying lookups. */
+   if (dev->arch >= 9)
+      inputs.valhall.use_ld_var_buf = true;
+
    /* Lower this early so the backends don't have to worry about it */
    if (s->info.stage == MESA_SHADER_FRAGMENT) {
       inputs.fixed_varying_mask = key->fs.fixed_varying_mask;
diff --git a/src/panfrost/ci/panfrost-g610-fails.txt b/src/panfrost/ci/panfrost-g610-fails.txt
index f3ef3c2f4a1..0fbf80fc4f8 100644
--- a/src/panfrost/ci/panfrost-g610-fails.txt
+++ b/src/panfrost/ci/panfrost-g610-fails.txt
@@ -273,19 +273,10 @@ dEQP-VK.api.device_init.create_device_global_priority_query_khr.basic,Fail
 dEQP-VK.renderpass.dedicated_allocation.attachment_allocation.input_output.63,Fail
 dEQP-VK.renderpass2.dedicated_allocation.attachment_allocation.input_output.63,Fail
 
-dEQP-VK.glsl.limits.near_max.fragment_input.components_123,Fail
-dEQP-VK.glsl.limits.near_max.fragment_input.components_124,Fail
-
-dEQP-VK.pipeline.monolithic.max_varyings.test_vertex_io_between_vertex_fragment,Fail
-
-dEQP-VK.pipeline.pipeline_library.max_varyings.test_vertex_io_between_vertex_fragment,Fail
-
 dEQP-VK.renderpass.suballocation.attachment_allocation.input_output.63,Fail
 dEQP-VK.renderpass.multiple_subpasses_multiple_command_buffers.test,Fail
 dEQP-VK.renderpass2.suballocation.attachment_allocation.input_output.63,Fail
 
-dEQP-VK.pipeline.fast_linked_library.max_varyings.test_vertex_io_between_vertex_fragment,Fail
-
 dEQP-VK.glsl.loops.special.do_while_dynamic_iterations.dowhile_trap_vertex,Crash
 
 dEQP-VK.rasterization.rasterization_order_attachment_access.depth.samples_1.multi_draw_barriers,Crash
diff --git a/src/panfrost/compiler/bifrost_compile.c b/src/panfrost/compiler/bifrost_compile.c
index 6d55771b7c0..cee274b1918 100644
--- a/src/panfrost/compiler/bifrost_compile.c
+++ b/src/panfrost/compiler/bifrost_compile.c
@@ -589,45 +589,30 @@ bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)
       b->shader->info.bifrost->uses_flat_shading = true;
    }
 
-   enum bi_source_format source_format =
-      smooth ? BI_SOURCE_FORMAT_F32 : BI_SOURCE_FORMAT_FLAT32;
-
    nir_src *offset = nir_get_io_offset_src(instr);
    unsigned imm_index = 0;
    bool immediate = bi_is_imm_var_desc_handle(b, instr, &imm_index);
    unsigned base = nir_intrinsic_base(instr);
 
-   /* On Valhall, ensure the table and index are valid for usage with immediate
-    * form when IDVS isn't used */
-   if (b->shader->arch >= 9 && !b->shader->malloc_idvs)
-      immediate &= va_is_valid_const_table(pan_res_handle_get_table(base)) &&
-                   pan_res_handle_get_index(base) < 256;
+   /* LD_VAR_BUF[_IMM] takes an 8-bit offset, limiting its use to 64 or fewer
+    * varying components, assuming F32.
+    * Therefore, only use LD_VAR_BUF[_IMM] if explicitly told by the driver
+    * through a compiler input value, falling back to LD_VAR[_IMM] +
+    * Attribute Descriptors otherwise. */
+   bool use_ld_var_buf =
+      b->shader->malloc_idvs && b->shader->inputs->valhall.use_ld_var_buf;
 
-   if (b->shader->malloc_idvs && immediate) {
-      /* Immediate index given in bytes. */
-      bi_ld_var_buf_imm_to(b, sz, dest, src0, regfmt, sample, source_format,
-                           update, vecsize,
-                           bi_varying_offset(b->shader, instr));
-   } else if (immediate) {
-      bi_instr *I;
+   if (use_ld_var_buf) {
+      enum bi_source_format source_format =
+         smooth ? BI_SOURCE_FORMAT_F32 : BI_SOURCE_FORMAT_FLAT32;
 
-      if (smooth) {
-         I = bi_ld_var_imm_to(b, dest, src0, regfmt, sample, update, vecsize,
-                              pan_res_handle_get_index(imm_index));
+      if (immediate) {
+         /* Immediate index given in bytes. */
+         bi_ld_var_buf_imm_to(b, sz, dest, src0, regfmt, sample, source_format,
+                              update, vecsize,
+                              bi_varying_offset(b->shader, instr));
       } else {
-         I = bi_ld_var_flat_imm_to(b, dest, BI_FUNCTION_NONE, regfmt, vecsize,
-                                   pan_res_handle_get_index(imm_index));
-      }
-
-      /* Valhall usually uses machine-allocated IDVS. If this is disabled,
-       * use a simple Midgard-style ABI.
-       */
-      if (b->shader->arch >= 9)
-         I->table = va_res_fold_table_idx(pan_res_handle_get_table(base));
-   } else {
-      bi_index idx = bi_src_index(offset);
-
-      if (b->shader->malloc_idvs) {
+         bi_index idx = bi_src_index(offset);
          /* Index needs to be in bytes, but NIR gives the index
           * in slots. For now assume 16 bytes per element.
           */
@@ -639,7 +624,33 @@ bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)
 
          bi_ld_var_buf_to(b, sz, dest, src0, idx_bytes, regfmt, sample,
                           source_format, update, vecsize);
+      }
+   } else {
+      /* On Valhall, ensure the table and index are valid for usage with
+       * immediate form when LD_VAR_BUF isn't used */
+      if (b->shader->arch >= 9)
+         immediate &= va_is_valid_const_table(pan_res_handle_get_table(base)) &&
+                      pan_res_handle_get_index(base) < 256;
+
+      if (immediate) {
+         bi_instr *I;
+
+         if (smooth) {
+            I = bi_ld_var_imm_to(b, dest, src0, regfmt, sample, update, vecsize,
+                                 pan_res_handle_get_index(imm_index));
+         } else {
+            I =
+               bi_ld_var_flat_imm_to(b, dest, BI_FUNCTION_NONE, regfmt, vecsize,
+                                     pan_res_handle_get_index(imm_index));
+         }
+
+         /* Valhall usually uses LD_VAR_BUF. If this is disabled, use a simple
+          * Midgard-style ABI. */
+         if (b->shader->arch >= 9)
+            I->table = va_res_fold_table_idx(pan_res_handle_get_table(base));
       } else {
+         bi_index idx = bi_src_index(offset);
+
          if (base != 0)
             idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);
 
diff --git a/src/panfrost/util/pan_ir.h b/src/panfrost/util/pan_ir.h
index 6ec4a2807c5..32ce7515797 100644
--- a/src/panfrost/util/pan_ir.h
+++ b/src/panfrost/util/pan_ir.h
@@ -121,6 +121,10 @@ struct panfrost_compile_inputs {
       struct {
          uint32_t rt_conv[8];
       } bifrost;
+      struct {
+         /* Use LD_VAR_BUF[_IMM] instead of LD_VAR[_IMM] to load varyings. */
+         bool use_ld_var_buf;
+      } valhall;
    };
 };
 
diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c
index 18acab6e13b..8df4f0b2f2c 100644
--- a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c
+++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c
@@ -165,6 +165,79 @@ prepare_vs_driver_set(struct panvk_cmd_buffer *cmdbuf)
    return VK_SUCCESS;
 }
 
+static uint32_t
+get_varying_slots(const struct panvk_cmd_buffer *cmdbuf)
+{
+   const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
+   const struct panvk_shader *fs = get_fs(cmdbuf);
+   uint32_t varying_slots = 0;
+
+   if (fs) {
+      unsigned vs_vars = vs->info.varyings.output_count;
+      unsigned fs_vars = fs->info.varyings.input_count;
+      varying_slots = MAX2(vs_vars, fs_vars);
+   }
+
+   return varying_slots;
+}
+
+static void
+emit_varying_descs(const struct panvk_cmd_buffer *cmdbuf,
+                   struct mali_attribute_packed *descs)
+{
+   const struct panvk_shader *fs = get_fs(cmdbuf);
+
+   /* Without a fragment shader there are no varyings to describe. */
+   if (!fs)
+      return;
+
+   /* Assumes 16 byte slots. We could do better. */
+   uint32_t varying_size = get_varying_slots(cmdbuf) * 16;
+
+   /* Walk only the inputs the fragment shader declares. The slot count used
+    * for the stride can be larger when the vertex shader writes more outputs
+    * than the fragment shader reads. */
+   for (uint32_t i = 0; i < fs->info.varyings.input_count; i++) {
+      const struct pan_shader_varying *var = &fs->info.varyings.input[i];
+      /* Skip special varyings. */
+      if (var->location < VARYING_SLOT_VAR0)
+         continue;
+
+      /* We currently always write out F32 in the vertex shaders, so the format
+       * needs to reflect this. */
+      enum pipe_format f = var->format;
+      switch (f) {
+      case PIPE_FORMAT_R16_FLOAT:
+         f = PIPE_FORMAT_R32_FLOAT;
+         break;
+      case PIPE_FORMAT_R16G16_FLOAT:
+         f = PIPE_FORMAT_R32G32_FLOAT;
+         break;
+      case PIPE_FORMAT_R16G16B16_FLOAT:
+         f = PIPE_FORMAT_R32G32B32_FLOAT;
+         break;
+      case PIPE_FORMAT_R16G16B16A16_FLOAT:
+         f = PIPE_FORMAT_R32G32B32A32_FLOAT;
+         break;
+      default:
+         break;
+      }
+
+      uint32_t loc = var->location - VARYING_SLOT_VAR0;
+      pan_pack(&descs[i], ATTRIBUTE, cfg) {
+         cfg.attribute_type = MALI_ATTRIBUTE_TYPE_VERTEX_PACKET;
+         cfg.offset_enable = false;
+         cfg.format = GENX(panfrost_format_from_pipe_format)(f)->hw;
+         cfg.table = 61;
+         cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_VERTEX;
+         cfg.offset = 1024 + (loc * 16);
+         cfg.buffer_index = 0;
+         cfg.attribute_stride = varying_size;
+         cfg.packet_stride = varying_size + 16;
+      }
+   }
+}
+
 static VkResult
 prepare_fs_driver_set(struct panvk_cmd_buffer *cmdbuf)
 {
@@ -172,7 +245,7 @@ prepare_fs_driver_set(struct panvk_cmd_buffer *cmdbuf)
    const struct panvk_shader *fs = cmdbuf->state.gfx.fs.shader;
    const struct panvk_descriptor_state *desc_state =
       &cmdbuf->state.gfx.desc_state;
-   uint32_t desc_count = fs->desc_info.dyn_bufs.count + 1;
+   uint32_t desc_count = fs->desc_info.dyn_bufs.count + MAX_VARYING + 1;
    struct panfrost_ptr driver_set = panvk_cmd_alloc_dev_mem(
       cmdbuf, desc, desc_count * PANVK_DESCRIPTOR_SIZE, PANVK_DESCRIPTOR_SIZE);
    struct panvk_opaque_desc *descs = driver_set.cpu;
@@ -180,13 +253,15 @@ prepare_fs_driver_set(struct panvk_cmd_buffer *cmdbuf)
    if (desc_count && !driver_set.gpu)
       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
 
-   /* Dummy sampler always comes first. */
-   pan_cast_and_pack(&descs[0], SAMPLER, cfg) {
+   emit_varying_descs(cmdbuf, (struct mali_attribute_packed *)(&descs[0]));
+
+   /* Dummy sampler always comes right after the varyings. */
+   pan_cast_and_pack(&descs[MAX_VARYING], SAMPLER, cfg) {
       cfg.clamp_integer_array_indices = false;
    }
 
-   panvk_per_arch(cmd_fill_dyn_bufs)(desc_state, fs,
-                                     (struct mali_buffer_packed *)(&descs[1]));
+   panvk_per_arch(cmd_fill_dyn_bufs)(
+      desc_state, fs, (struct mali_buffer_packed *)(&descs[1 + MAX_VARYING]));
 
    fs_desc_state->driver_set.dev_addr = driver_set.gpu;
    fs_desc_state->driver_set.size = desc_count * PANVK_DESCRIPTOR_SIZE;
@@ -1650,16 +1725,8 @@ prepare_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
    if (result != VK_SUCCESS)
       return result;
 
-   uint32_t varying_size = 0;
-
-   if (fs) {
-      unsigned vs_vars = vs->info.varyings.output_count;
-      unsigned fs_vars = fs->info.varyings.input_count;
-      unsigned var_slots = MAX2(vs_vars, fs_vars);
-
-      /* Assumes 16 byte slots. We could do better. */
-      varying_size = var_slots * 16;
-   }
+   /* Assumes 16 byte slots. We could do better. */
+   uint32_t varying_size = get_varying_slots(cmdbuf) * 16;
 
    cs_update_vt_ctx(b) {
       /* We don't use the resource dep system yet. */
diff --git a/src/panfrost/vulkan/panvk_vX_nir_lower_descriptors.c b/src/panfrost/vulkan/panvk_vX_nir_lower_descriptors.c
index 1f155864fc7..aa733635f0f 100644
--- a/src/panfrost/vulkan/panvk_vX_nir_lower_descriptors.c
+++ b/src/panfrost/vulkan/panvk_vX_nir_lower_descriptors.c
@@ -1025,8 +1025,22 @@ create_copy_table(nir_shader *nir, struct lower_desc_ctx *ctx)
    for (uint32_t i = 0; i < PANVK_BIFROST_DESC_TABLE_COUNT; i++)
       copy_count += desc_info->others[i].count;
 #else
-   /* Dummy sampler comes after the vertex attributes. */
-   uint32_t dummy_sampler_idx = nir->info.stage == MESA_SHADER_VERTEX ? 16 : 0;
+   uint32_t dummy_sampler_idx;
+   switch (nir->info.stage) {
+   case MESA_SHADER_VERTEX:
+      /* Dummy sampler comes after the vertex attributes. */
+      dummy_sampler_idx = 16;
+      break;
+   case MESA_SHADER_FRAGMENT:
+      /* Dummy sampler comes after the varyings. */
+      dummy_sampler_idx = MAX_VARYING;
+      break;
+   case MESA_SHADER_COMPUTE:
+      dummy_sampler_idx = 0;
+      break;
+   default:
+      unreachable("unexpected stage");
+   }
    desc_info->dummy_sampler_handle = pan_res_handle(0, dummy_sampler_idx);
 
    copy_count = desc_info->dyn_bufs.count + desc_info->dyn_bufs.count;
diff --git a/src/panfrost/vulkan/panvk_vX_shader.c b/src/panfrost/vulkan/panvk_vX_shader.c
index 0007f799f06..ee5b96c918b 100644
--- a/src/panfrost/vulkan/panvk_vX_shader.c
+++ b/src/panfrost/vulkan/panvk_vX_shader.c
@@ -1041,6 +1041,10 @@ panvk_compile_shader(struct panvk_device *dev,
       .gpu_id = phys_dev->kmod.props.gpu_prod_id,
       .no_ubo_to_push = true,
       .view_mask = (state && state->rp) ? state->rp->view_mask : 0,
+#if PAN_ARCH >= 9
+      /* LD_VAR_BUF does not support maxVertexOutputComponents (128) */
+      .valhall.use_ld_var_buf = false,
+#endif
    };
 
    if (info->stage == MESA_SHADER_FRAGMENT && state != NULL &&