diff --git a/src/gallium/drivers/panfrost/pan_shader.c b/src/gallium/drivers/panfrost/pan_shader.c index e3935651219..08b56762324 100644 --- a/src/gallium/drivers/panfrost/pan_shader.c +++ b/src/gallium/drivers/panfrost/pan_shader.c @@ -77,6 +77,15 @@ panfrost_shader_compile(struct panfrost_screen *screen, const nir_shader *ir, nir_shader *s = nir_shader_clone(NULL, ir); + /* While graphics shaders are preprocessed at CSO create time, compute + * kernels are not preprocessed until they're cloned since the driver does + * not get ownership of the NIR from compute CSOs. Do this preprocessing now. + * Compute CSOs call this function during create time, so preprocessing + * happens at CSO create time regardless. + */ + if (gl_shader_stage_is_compute(s->info.stage)) + pan_shader_preprocess(s, dev->gpu_id); + struct panfrost_compile_inputs inputs = { .debug = dbg, .gpu_id = dev->gpu_id, @@ -109,6 +118,14 @@ panfrost_shader_compile(struct panfrost_screen *screen, const nir_shader *ir, } util_dynarray_init(&out->binary, NULL); + pan_shader_preprocess(s, inputs.gpu_id); + + if (dev->arch <= 5 && s->info.stage == MESA_SHADER_FRAGMENT) { + NIR_PASS_V(s, pan_lower_framebuffer, key->fs.rt_formats, + pan_raw_format_mask_midgard(key->fs.rt_formats), false, + dev->gpu_id < 0x700); + } + screen->vtbl.compile_shader(s, &inputs, &out->binary, &out->info); assert(req_local_mem >= out->info.wls_size); diff --git a/src/panfrost/compiler/bifrost_compile.c b/src/panfrost/compiler/bifrost_compile.c index e34172abacb..3edba4e94aa 100644 --- a/src/panfrost/compiler/bifrost_compile.c +++ b/src/panfrost/compiler/bifrost_compile.c @@ -4716,7 +4716,7 @@ bi_lower_sample_mask_writes(nir_builder *b, nir_instr *instr, void *data) } static bool -bi_lower_load_output(nir_builder *b, nir_instr *instr, void *data) +bi_lower_load_output(nir_builder *b, nir_instr *instr, UNUSED void *data) { if (instr->type != nir_instr_type_intrinsic) return false; @@ -4734,15 +4734,6 @@ bi_lower_load_output(nir_builder *b, nir_instr *instr, void *data) nir_ssa_def *conversion = nir_load_rt_conversion_pan( b, .base = rt, .src_type = nir_intrinsic_dest_type(intr)); - /* TODO: This should be optimized/lowered by the driver */ - const struct panfrost_compile_inputs *inputs = data; - - if (inputs->is_blend) { - conversion = nir_imm_int(b, inputs->blend.bifrost_blend_desc >> 32); - } else if (inputs->bifrost.static_rt_conv) { - conversion = nir_imm_int(b, inputs->bifrost.rt_conv[rt]); - } - nir_ssa_def *lowered = nir_load_converted_output_pan( b, nir_dest_num_components(intr->dest), nir_dest_bit_size(intr->dest), conversion, .dest_type = nir_intrinsic_dest_type(intr), @@ -4753,8 +4744,7 @@ bi_lower_load_output(nir_builder *b, nir_instr *instr, void *data) } void -bifrost_preprocess_nir(nir_shader *nir, - const struct panfrost_compile_inputs *inputs) +bifrost_preprocess_nir(nir_shader *nir, unsigned gpu_id) { /* Lower gl_Position pre-optimisation, but after lowering vars to ssa * (so we don't accidentally duplicate the epilogue since mesa/st has @@ -4781,7 +4771,7 @@ bifrost_preprocess_nir(nir_shader *nir, * (currently unconditional for Valhall), we force vec4 alignment for * scratch access. */ - bool packed_tls = (inputs->gpu_id >= 0x9000); + bool packed_tls = (gpu_id >= 0x9000); /* Lower large arrays to scratch and small arrays to bcsel */ NIR_PASS_V(nir, nir_lower_vars_to_scratch, nir_var_function_temp, 256, @@ -4810,10 +4800,9 @@ bifrost_preprocess_nir(nir_shader *nir, nir_metadata_block_index | nir_metadata_dominance, NULL); NIR_PASS_V(nir, nir_shader_instructions_pass, bi_lower_load_output, - nir_metadata_block_index | nir_metadata_dominance, - (void *)inputs); + nir_metadata_block_index | nir_metadata_dominance, NULL); } else if (nir->info.stage == MESA_SHADER_VERTEX) { - if (inputs->gpu_id >= 0x9000) { + if (gpu_id >= 0x9000) { NIR_PASS_V(nir, nir_lower_mediump_io, nir_var_shader_out, BITFIELD64_BIT(VARYING_SLOT_PSIZ), false); } @@ -5251,7 +5240,6 @@ bifrost_compile_shader_nir(nir_shader *nir, { bifrost_debug = debug_get_option_bifrost_debug(); - bifrost_preprocess_nir(nir, inputs); bi_optimize_nir(nir, inputs->gpu_id, inputs->is_blend); struct hash_table_u64 *sysval_to_id = diff --git a/src/panfrost/compiler/bifrost_compile.h b/src/panfrost/compiler/bifrost_compile.h index a746703d372..dca1abade6b 100644 --- a/src/panfrost/compiler/bifrost_compile.h +++ b/src/panfrost/compiler/bifrost_compile.h @@ -28,8 +28,7 @@ #include "panfrost/util/pan_ir.h" #include "util/u_dynarray.h" -void bifrost_preprocess_nir(nir_shader *nir, - const struct panfrost_compile_inputs *inputs); +void bifrost_preprocess_nir(nir_shader *nir, unsigned gpu_id); void bifrost_compile_shader_nir(nir_shader *nir, const struct panfrost_compile_inputs *inputs, diff --git a/src/panfrost/lib/pan_blend.c b/src/panfrost/lib/pan_blend.c index 9967590e59a..e36506a90e9 100644 --- a/src/panfrost/lib/pan_blend.c +++ b/src/panfrost/lib/pan_blend.c @@ -763,6 +763,42 @@ GENX(pan_blend_get_internal_desc)(const struct panfrost_device *dev, return res; } + +struct rt_conversion_inputs { + const struct panfrost_device *dev; + enum pipe_format *formats; +}; + +static bool +inline_rt_conversion(nir_builder *b, nir_instr *instr, void *data) +{ + if (instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_load_rt_conversion_pan) + return false; + + struct rt_conversion_inputs *inputs = data; + unsigned rt = nir_intrinsic_base(intr); + unsigned size = nir_alu_type_get_type_size(nir_intrinsic_src_type(intr)); + uint64_t conversion = GENX(pan_blend_get_internal_desc)( + inputs->dev, inputs->formats[rt], rt, size, false); + + b->cursor = nir_after_instr(instr); + nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_imm_int(b, conversion >> 32)); + return true; +} + +bool +GENX(pan_inline_rt_conversion)(nir_shader *s, const struct panfrost_device *dev, + enum pipe_format *formats) +{ + return nir_shader_instructions_pass( + s, inline_rt_conversion, + nir_metadata_block_index | nir_metadata_dominance, + &(struct rt_conversion_inputs){.dev = dev, .formats = formats}); +} #endif struct pan_blend_shader_variant * @@ -843,6 +879,11 @@ GENX(pan_blend_get_shader_locked)(const struct panfrost_device *dev, #endif struct pan_shader_info info; + pan_shader_preprocess(nir, inputs.gpu_id); + +#if PAN_ARCH >= 6 + NIR_PASS_V(nir, GENX(pan_inline_rt_conversion), dev, inputs.rt_formats); +#endif GENX(pan_shader_compile)(nir, &inputs, &variant->binary, &info); diff --git a/src/panfrost/lib/pan_blend.h b/src/panfrost/lib/pan_blend.h index 8b826d41b42..914b9a1e1ea 100644 --- a/src/panfrost/lib/pan_blend.h +++ b/src/panfrost/lib/pan_blend.h @@ -161,6 +161,10 @@ nir_shader *GENX(pan_blend_create_shader)(const struct panfrost_device *dev, uint64_t GENX(pan_blend_get_internal_desc)(const struct panfrost_device *dev, enum pipe_format fmt, unsigned rt, unsigned force_size, bool dithered); + +bool GENX(pan_inline_rt_conversion)(nir_shader *s, + const struct panfrost_device *dev, + enum pipe_format *formats); #endif /* Take blend_shaders.lock before calling this function and release it when diff --git a/src/panfrost/lib/pan_blitter.c b/src/panfrost/lib/pan_blitter.c index 44e59af61f2..205b67eb9d0 100644 --- a/src/panfrost/lib/pan_blitter.c +++ b/src/panfrost/lib/pan_blitter.c @@ -631,6 +631,8 @@ pan_blitter_get_blit_shader(struct panfrost_device *dev, for (unsigned i = 0; i < active_count; ++i) BITSET_SET(b.shader->info.textures_used, i); + pan_shader_preprocess(b.shader, inputs.gpu_id); + if (PAN_ARCH == 4) { NIR_PASS_V(b.shader, nir_shader_instructions_pass, lower_sampler_parameters, diff --git a/src/panfrost/lib/pan_indirect_dispatch.c b/src/panfrost/lib/pan_indirect_dispatch.c index 8f7e75e50ba..019db47a980 100644 --- a/src/panfrost/lib/pan_indirect_dispatch.c +++ b/src/panfrost/lib/pan_indirect_dispatch.c @@ -130,6 +130,7 @@ pan_indirect_dispatch_init(struct panfrost_device *dev) struct util_dynarray binary; util_dynarray_init(&binary, NULL); + pan_shader_preprocess(b.shader, inputs.gpu_id); GENX(pan_shader_compile)(b.shader, &inputs, &binary, &shader_info); ralloc_free(b.shader); diff --git a/src/panfrost/lib/pan_shader.c b/src/panfrost/lib/pan_shader.c index eb683de1f3e..afe1ccfae4f 100644 --- a/src/panfrost/lib/pan_shader.c +++ b/src/panfrost/lib/pan_shader.c @@ -83,6 +83,27 @@ GENX(pan_fixup_blend_type)(nir_alu_type T_size, enum pipe_format format) #endif #endif +/* This is only needed on Midgard. It's the same on both v4 and v5, so only + * compile once to avoid the GenXML dependency for calls. + */ +#if PAN_ARCH == 5 +uint8_t +pan_raw_format_mask_midgard(enum pipe_format *formats) +{ + uint8_t out = 0; + + for (unsigned i = 0; i < 8; i++) { + enum pipe_format fmt = formats[i]; + unsigned wb_fmt = panfrost_blendable_formats_v6[fmt].writeback; + + if (wb_fmt < MALI_COLOR_FORMAT_R8) + out |= BITFIELD_BIT(i); + } + + return out; +} +#endif + void GENX(pan_shader_compile)(nir_shader *s, struct panfrost_compile_inputs *inputs, struct util_dynarray *binary, @@ -93,14 +114,6 @@ GENX(pan_shader_compile)(nir_shader *s, struct panfrost_compile_inputs *inputs, #if PAN_ARCH >= 6 bifrost_compile_shader_nir(s, inputs, binary, info); #else - for (unsigned i = 0; i < ARRAY_SIZE(inputs->rt_formats); i++) { - enum pipe_format fmt = inputs->rt_formats[i]; - unsigned wb_fmt = panfrost_blendable_formats_v6[fmt].writeback; - - if (wb_fmt < MALI_COLOR_FORMAT_R8) - inputs->raw_fmt_mask |= BITFIELD_BIT(i); - } - midgard_compile_shader_nir(s, inputs, binary, info); #endif diff --git a/src/panfrost/lib/pan_shader.h b/src/panfrost/lib/pan_shader.h index 406db3d37ce..df955affaf6 100644 --- a/src/panfrost/lib/pan_shader.h +++ b/src/panfrost/lib/pan_shader.h @@ -34,6 +34,20 @@ struct panfrost_device; +void bifrost_preprocess_nir(nir_shader *nir, unsigned gpu_id); +void midgard_preprocess_nir(nir_shader *nir, unsigned gpu_id); + +static inline void +pan_shader_preprocess(nir_shader *nir, unsigned gpu_id) +{ + if (pan_arch(gpu_id) >= 6) + bifrost_preprocess_nir(nir, gpu_id); + else + midgard_preprocess_nir(nir, gpu_id); +} + +uint8_t pan_raw_format_mask_midgard(enum pipe_format *formats); + #ifdef PAN_ARCH const nir_shader_compiler_options *GENX(pan_shader_get_compiler_options)(void); diff --git a/src/panfrost/midgard/midgard_compile.c b/src/panfrost/midgard/midgard_compile.c index 35e40ac204f..2081c40f1f1 100644 --- a/src/panfrost/midgard/midgard_compile.c +++ b/src/panfrost/midgard/midgard_compile.c @@ -40,7 +40,6 @@ #include "util/u_dynarray.h" #include "util/u_math.h" -#include "panfrost/util/pan_lower_framebuffer.h" #include "compiler.h" #include "helpers.h" #include "midgard.h" @@ -330,10 +329,9 @@ midgard_vectorize_filter(const nir_instr *instr, const void *data) } void -midgard_preprocess_nir(nir_shader *nir, - const struct panfrost_compile_inputs *inputs) +midgard_preprocess_nir(nir_shader *nir, unsigned gpu_id) { - unsigned quirks = midgard_get_quirks(inputs->gpu_id); + unsigned quirks = midgard_get_quirks(gpu_id); /* Lower gl_Position pre-optimisation, but after lowering vars to ssa * (so we don't accidentally duplicate the epilogue since mesa/st has @@ -391,10 +389,9 @@ midgard_preprocess_nir(nir_shader *nir, NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options); /* TEX_GRAD fails to apply sampler descriptor settings on some - * implementations, requiring a lowering. However, blit shaders do not - * use the affected settings and should skip the workaround. + * implementations, requiring a lowering. */ - if ((quirks & MIDGARD_BROKEN_LOD) && !inputs->is_blit) + if (quirks & MIDGARD_BROKEN_LOD) NIR_PASS_V(nir, midgard_nir_lod_errata); /* Midgard image ops coordinates are 16-bit instead of 32-bit */ @@ -417,12 +414,6 @@ midgard_preprocess_nir(nir_shader *nir, NIR_PASS_V(nir, nir_lower_alu_to_scalar, mdg_should_scalarize, NULL); NIR_PASS_V(nir, nir_lower_flrp, 16 | 32 | 64, false /* always_precise */); NIR_PASS_V(nir, nir_lower_var_copies); - - if (nir->info.stage == MESA_SHADER_FRAGMENT) { - NIR_PASS_V(nir, pan_lower_framebuffer, inputs->rt_formats, - inputs->raw_fmt_mask, inputs->is_blend, - quirks & MIDGARD_BROKEN_BLEND_LOADS); - } } static void @@ -3177,8 +3168,6 @@ midgard_compile_shader_nir(nir_shader *nir, ctx->ssa_constants = _mesa_hash_table_u64_create(ctx); - midgard_preprocess_nir(nir, inputs); - /* Collect varyings after lowering I/O */ pan_nir_collect_varyings(nir, info); diff --git a/src/panfrost/midgard/midgard_compile.h b/src/panfrost/midgard/midgard_compile.h index 0ea6eadc304..d2a1cd03931 100644 --- a/src/panfrost/midgard/midgard_compile.h +++ b/src/panfrost/midgard/midgard_compile.h @@ -29,8 +29,7 @@ #include "panfrost/util/pan_ir.h" #include "util/u_dynarray.h" -void midgard_preprocess_nir(nir_shader *nir, - const struct panfrost_compile_inputs *inputs); +void midgard_preprocess_nir(nir_shader *nir, unsigned gpu_id); void midgard_compile_shader_nir(nir_shader *nir, const struct panfrost_compile_inputs *inputs, diff --git a/src/panfrost/util/pan_ir.h b/src/panfrost/util/pan_ir.h index 41fdcd15a36..97df6ebb843 100644 --- a/src/panfrost/util/pan_ir.h +++ b/src/panfrost/util/pan_ir.h @@ -184,7 +184,6 @@ struct panfrost_compile_inputs { bool no_ubo_to_push; enum pipe_format rt_formats[8]; - uint8_t raw_fmt_mask; /* Used on Valhall. * @@ -198,7 +197,6 @@ struct panfrost_compile_inputs { union { struct { - bool static_rt_conv; uint32_t rt_conv[8]; } bifrost; }; diff --git a/src/panfrost/vulkan/panvk_vX_meta_clear.c b/src/panfrost/vulkan/panvk_vX_meta_clear.c index cc2874823e9..32f8e64d2a3 100644 --- a/src/panfrost/vulkan/panvk_vX_meta_clear.c +++ b/src/panfrost/vulkan/panvk_vX_meta_clear.c @@ -61,6 +61,7 @@ panvk_meta_clear_color_attachment_shader(struct panfrost_device *pdev, struct util_dynarray binary; util_dynarray_init(&binary, NULL); + pan_shader_preprocess(b.shader, inputs.gpu_id); GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info); shader_info->push.count = 4; diff --git a/src/panfrost/vulkan/panvk_vX_meta_copy.c b/src/panfrost/vulkan/panvk_vX_meta_copy.c index 3bb2218df87..3aec1f92654 100644 --- a/src/panfrost/vulkan/panvk_vX_meta_copy.c +++ b/src/panfrost/vulkan/panvk_vX_meta_copy.c @@ -449,17 +449,11 @@ panvk_meta_copy_img2img_shader(struct panfrost_device *pdev, .no_ubo_to_push = true, }; - pan_pack(&inputs.bifrost.rt_conv[0], INTERNAL_CONVERSION, cfg) { - cfg.memory_format = (dstcompsz == 2 ? MALI_RG16UI : MALI_RG32UI) << 12; - cfg.register_format = dstcompsz == 2 ? - MALI_REGISTER_FILE_FORMAT_U16 : - MALI_REGISTER_FILE_FORMAT_U32; - } - inputs.bifrost.static_rt_conv = true; - struct util_dynarray binary; util_dynarray_init(&binary, NULL); + pan_shader_preprocess(b.shader, inputs.gpu_id); + NIR_PASS_V(b.shader, GENX(pan_inline_rt_conversion), pdev, &dstfmt); GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info); shader_info->fs.sample_shading = is_ms; @@ -984,17 +978,14 @@ panvk_meta_copy_buf2img_shader(struct panfrost_device *pdev, .no_ubo_to_push = true, }; - pan_pack(&inputs.bifrost.rt_conv[0], INTERNAL_CONVERSION, cfg) { - cfg.memory_format = (imgcompsz == 2 ? MALI_RG16UI : MALI_RG32UI) << 12; - cfg.register_format = imgcompsz == 2 ? - MALI_REGISTER_FILE_FORMAT_U16 : - MALI_REGISTER_FILE_FORMAT_U32; - } - inputs.bifrost.static_rt_conv = true; - struct util_dynarray binary; util_dynarray_init(&binary, NULL); + pan_shader_preprocess(b.shader, inputs.gpu_id); + + enum pipe_format rt_formats[8] = {key.imgfmt}; + NIR_PASS_V(b.shader, GENX(pan_inline_rt_conversion), pdev, rt_formats); + GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info); shader_info->push.count = DIV_ROUND_UP(sizeof(struct panvk_meta_copy_buf2img_info), 4); @@ -1434,6 +1425,7 @@ panvk_meta_copy_img2buf_shader(struct panfrost_device *pdev, struct util_dynarray binary; util_dynarray_init(&binary, NULL); + pan_shader_preprocess(b.shader, inputs.gpu_id); GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info); shader_info->push.count = DIV_ROUND_UP(sizeof(struct panvk_meta_copy_img2buf_info), 4); @@ -1662,6 +1654,7 @@ panvk_meta_copy_buf2buf_shader(struct panfrost_device *pdev, struct util_dynarray binary; util_dynarray_init(&binary, NULL); + pan_shader_preprocess(b.shader, inputs.gpu_id); GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info); shader_info->push.count = DIV_ROUND_UP(sizeof(struct panvk_meta_copy_buf2buf_info), 4); @@ -1791,6 +1784,7 @@ panvk_meta_fill_buf_shader(struct panfrost_device *pdev, struct util_dynarray binary; util_dynarray_init(&binary, NULL); + pan_shader_preprocess(b.shader, inputs.gpu_id); GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info); shader_info->push.count = DIV_ROUND_UP(sizeof(struct panvk_meta_fill_buf_info), 4); diff --git a/src/panfrost/vulkan/panvk_vX_shader.c b/src/panfrost/vulkan/panvk_vX_shader.c index fde8445ca98..6061cd1ed2a 100644 --- a/src/panfrost/vulkan/panvk_vX_shader.c +++ b/src/panfrost/vulkan/panvk_vX_shader.c @@ -153,10 +153,6 @@ panvk_lower_blend(struct panfrost_device *pdev, rt_state->equation.alpha_dst_factor = BLEND_FACTOR_ZERO; rt_state->equation.alpha_invert_dst_factor = false; lower_blend = true; - - inputs->bifrost.static_rt_conv = true; - inputs->bifrost.rt_conv[rt] = - GENX(pan_blend_get_internal_desc)(pdev, fmt, rt, 32, false) >> 32; } if (lower_blend) { @@ -371,6 +367,17 @@ panvk_per_arch(shader_create)(struct panvk_device *dev, nir_print_shader(nir, stderr); } + pan_shader_preprocess(nir, inputs.gpu_id); + + if (stage == MESA_SHADER_FRAGMENT) { + enum pipe_format rt_formats[MAX_RTS] = {PIPE_FORMAT_NONE}; + + for (unsigned rt = 0; rt < MAX_RTS; ++rt) + rt_formats[rt] = blend_state->rts[rt].format; + + NIR_PASS_V(nir, GENX(pan_inline_rt_conversion), pdev, rt_formats); + } + GENX(pan_shader_compile)(nir, &inputs, &shader->binary, &shader->info); /* System values shouldn't have changed */