From ca2042f3592e2c292e7c2423c628ac2728925f72 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Mon, 6 Feb 2023 17:23:19 -0500 Subject: [PATCH] panfrost: Preprocess shaders in the driver This is a flag-day change to how we compile. We split preprocessing NIR into a separate step from compiling, giving the driver a chance to apply its own lowerings on the preprocessed NIR before the final optimization loop. During that time, the different producers of NIR (panfrost, panvk, blend shaders, blit shaders...) will be able to (differently) lower system values. Signed-off-by: Alyssa Rosenzweig Reviewed-by: Boris Brezillon Part-of: --- src/gallium/drivers/panfrost/pan_shader.c | 17 ++++++++++ src/panfrost/compiler/bifrost_compile.c | 22 +++--------- src/panfrost/compiler/bifrost_compile.h | 3 +- src/panfrost/lib/pan_blend.c | 41 +++++++++++++++++++++++ src/panfrost/lib/pan_blend.h | 4 +++ src/panfrost/lib/pan_blitter.c | 2 ++ src/panfrost/lib/pan_indirect_dispatch.c | 1 + src/panfrost/lib/pan_shader.c | 29 +++++++++++----- src/panfrost/lib/pan_shader.h | 14 ++++++++ src/panfrost/midgard/midgard_compile.c | 19 +++-------- src/panfrost/midgard/midgard_compile.h | 3 +- src/panfrost/util/pan_ir.h | 2 -- src/panfrost/vulkan/panvk_vX_meta_clear.c | 1 + src/panfrost/vulkan/panvk_vX_meta_copy.c | 26 ++++++-------- src/panfrost/vulkan/panvk_vX_shader.c | 15 ++++++--- 15 files changed, 133 insertions(+), 66 deletions(-) diff --git a/src/gallium/drivers/panfrost/pan_shader.c b/src/gallium/drivers/panfrost/pan_shader.c index e3935651219..08b56762324 100644 --- a/src/gallium/drivers/panfrost/pan_shader.c +++ b/src/gallium/drivers/panfrost/pan_shader.c @@ -77,6 +77,15 @@ panfrost_shader_compile(struct panfrost_screen *screen, const nir_shader *ir, nir_shader *s = nir_shader_clone(NULL, ir); + /* While graphics shaders are preprocessed at CSO create time, compute + * kernels are not preprocessed until they're cloned since the driver does + * not get ownership of the NIR from compute CSOs. Do this preprocessing now. + * Compute CSOs call this function during create time, so preprocessing + * happens at CSO create time regardless. + */ + if (gl_shader_stage_is_compute(s->info.stage)) + pan_shader_preprocess(s, dev->gpu_id); + struct panfrost_compile_inputs inputs = { .debug = dbg, .gpu_id = dev->gpu_id, @@ -109,6 +118,14 @@ panfrost_shader_compile(struct panfrost_screen *screen, const nir_shader *ir, } util_dynarray_init(&out->binary, NULL); + pan_shader_preprocess(s, inputs.gpu_id); + + if (dev->arch <= 5 && s->info.stage == MESA_SHADER_FRAGMENT) { + NIR_PASS_V(s, pan_lower_framebuffer, key->fs.rt_formats, + pan_raw_format_mask_midgard(key->fs.rt_formats), false, + dev->gpu_id < 0x700); + } + screen->vtbl.compile_shader(s, &inputs, &out->binary, &out->info); assert(req_local_mem >= out->info.wls_size); diff --git a/src/panfrost/compiler/bifrost_compile.c b/src/panfrost/compiler/bifrost_compile.c index e34172abacb..3edba4e94aa 100644 --- a/src/panfrost/compiler/bifrost_compile.c +++ b/src/panfrost/compiler/bifrost_compile.c @@ -4716,7 +4716,7 @@ bi_lower_sample_mask_writes(nir_builder *b, nir_instr *instr, void *data) } static bool -bi_lower_load_output(nir_builder *b, nir_instr *instr, void *data) +bi_lower_load_output(nir_builder *b, nir_instr *instr, UNUSED void *data) { if (instr->type != nir_instr_type_intrinsic) return false; @@ -4734,15 +4734,6 @@ bi_lower_load_output(nir_builder *b, nir_instr *instr, void *data) nir_ssa_def *conversion = nir_load_rt_conversion_pan( b, .base = rt, .src_type = nir_intrinsic_dest_type(intr)); - /* TODO: This should be optimized/lowered by the driver */ - const struct panfrost_compile_inputs *inputs = data; - - if (inputs->is_blend) { - conversion = nir_imm_int(b, inputs->blend.bifrost_blend_desc >> 32); - } else if (inputs->bifrost.static_rt_conv) { - conversion = nir_imm_int(b, inputs->bifrost.rt_conv[rt]); - } - nir_ssa_def *lowered = nir_load_converted_output_pan( b, nir_dest_num_components(intr->dest), nir_dest_bit_size(intr->dest), conversion, .dest_type = nir_intrinsic_dest_type(intr), @@ -4753,8 +4744,7 @@ bi_lower_load_output(nir_builder *b, nir_instr *instr, void *data) } void -bifrost_preprocess_nir(nir_shader *nir, - const struct panfrost_compile_inputs *inputs) +bifrost_preprocess_nir(nir_shader *nir, unsigned gpu_id) { /* Lower gl_Position pre-optimisation, but after lowering vars to ssa * (so we don't accidentally duplicate the epilogue since mesa/st has @@ -4781,7 +4771,7 @@ bifrost_preprocess_nir(nir_shader *nir, * (currently unconditional for Valhall), we force vec4 alignment for * scratch access. */ - bool packed_tls = (inputs->gpu_id >= 0x9000); + bool packed_tls = (gpu_id >= 0x9000); /* Lower large arrays to scratch and small arrays to bcsel */ NIR_PASS_V(nir, nir_lower_vars_to_scratch, nir_var_function_temp, 256, @@ -4810,10 +4800,9 @@ bifrost_preprocess_nir(nir_shader *nir, nir_metadata_block_index | nir_metadata_dominance, NULL); NIR_PASS_V(nir, nir_shader_instructions_pass, bi_lower_load_output, - nir_metadata_block_index | nir_metadata_dominance, - (void *)inputs); + nir_metadata_block_index | nir_metadata_dominance, NULL); } else if (nir->info.stage == MESA_SHADER_VERTEX) { - if (inputs->gpu_id >= 0x9000) { + if (gpu_id >= 0x9000) { NIR_PASS_V(nir, nir_lower_mediump_io, nir_var_shader_out, BITFIELD64_BIT(VARYING_SLOT_PSIZ), false); } @@ -5251,7 +5240,6 @@ bifrost_compile_shader_nir(nir_shader *nir, { bifrost_debug = debug_get_option_bifrost_debug(); - bifrost_preprocess_nir(nir, inputs); bi_optimize_nir(nir, inputs->gpu_id, inputs->is_blend); struct hash_table_u64 *sysval_to_id = diff --git a/src/panfrost/compiler/bifrost_compile.h b/src/panfrost/compiler/bifrost_compile.h index a746703d372..dca1abade6b 100644 --- a/src/panfrost/compiler/bifrost_compile.h +++ b/src/panfrost/compiler/bifrost_compile.h @@ -28,8 +28,7 @@ #include "panfrost/util/pan_ir.h" #include "util/u_dynarray.h" -void bifrost_preprocess_nir(nir_shader *nir, - const struct panfrost_compile_inputs *inputs); +void bifrost_preprocess_nir(nir_shader *nir, unsigned gpu_id); void bifrost_compile_shader_nir(nir_shader *nir, const struct panfrost_compile_inputs *inputs, diff --git a/src/panfrost/lib/pan_blend.c b/src/panfrost/lib/pan_blend.c index 9967590e59a..e36506a90e9 100644 --- a/src/panfrost/lib/pan_blend.c +++ b/src/panfrost/lib/pan_blend.c @@ -763,6 +763,42 @@ GENX(pan_blend_get_internal_desc)(const struct panfrost_device *dev, return res; } + +struct rt_conversion_inputs { + const struct panfrost_device *dev; + enum pipe_format *formats; +}; + +static bool +inline_rt_conversion(nir_builder *b, nir_instr *instr, void *data) +{ + if (instr->type != nir_instr_type_intrinsic) + return false; + + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_load_rt_conversion_pan) + return false; + + struct rt_conversion_inputs *inputs = data; + unsigned rt = nir_intrinsic_base(intr); + unsigned size = nir_alu_type_get_type_size(nir_intrinsic_src_type(intr)); + uint64_t conversion = GENX(pan_blend_get_internal_desc)( + inputs->dev, inputs->formats[rt], rt, size, false); + + b->cursor = nir_after_instr(instr); + nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_imm_int(b, conversion >> 32)); + return true; +} + +bool +GENX(pan_inline_rt_conversion)(nir_shader *s, const struct panfrost_device *dev, + enum pipe_format *formats) +{ + return nir_shader_instructions_pass( + s, inline_rt_conversion, + nir_metadata_block_index | nir_metadata_dominance, + &(struct rt_conversion_inputs){.dev = dev, .formats = formats}); +} #endif struct pan_blend_shader_variant * @@ -843,6 +879,11 @@ GENX(pan_blend_get_shader_locked)(const struct panfrost_device *dev, #endif struct pan_shader_info info; + pan_shader_preprocess(nir, inputs.gpu_id); + +#if PAN_ARCH >= 6 + NIR_PASS_V(nir, GENX(pan_inline_rt_conversion), dev, inputs.rt_formats); +#endif GENX(pan_shader_compile)(nir, &inputs, &variant->binary, &info); diff --git a/src/panfrost/lib/pan_blend.h b/src/panfrost/lib/pan_blend.h index 8b826d41b42..914b9a1e1ea 100644 --- a/src/panfrost/lib/pan_blend.h +++ b/src/panfrost/lib/pan_blend.h @@ -161,6 +161,10 @@ nir_shader *GENX(pan_blend_create_shader)(const struct panfrost_device *dev, uint64_t GENX(pan_blend_get_internal_desc)(const struct panfrost_device *dev, enum pipe_format fmt, unsigned rt, unsigned force_size, bool dithered); + +bool GENX(pan_inline_rt_conversion)(nir_shader *s, + const struct panfrost_device *dev, + enum pipe_format *formats); #endif /* Take blend_shaders.lock before calling this function and release it when diff --git a/src/panfrost/lib/pan_blitter.c b/src/panfrost/lib/pan_blitter.c index 44e59af61f2..205b67eb9d0 100644 --- a/src/panfrost/lib/pan_blitter.c +++ b/src/panfrost/lib/pan_blitter.c @@ -631,6 +631,8 @@ pan_blitter_get_blit_shader(struct panfrost_device *dev, for (unsigned i = 0; i < active_count; ++i) BITSET_SET(b.shader->info.textures_used, i); + pan_shader_preprocess(b.shader, inputs.gpu_id); + if (PAN_ARCH == 4) { NIR_PASS_V(b.shader, nir_shader_instructions_pass, lower_sampler_parameters, diff --git a/src/panfrost/lib/pan_indirect_dispatch.c b/src/panfrost/lib/pan_indirect_dispatch.c index 8f7e75e50ba..019db47a980 100644 --- a/src/panfrost/lib/pan_indirect_dispatch.c +++ b/src/panfrost/lib/pan_indirect_dispatch.c @@ -130,6 +130,7 @@ pan_indirect_dispatch_init(struct panfrost_device *dev) struct util_dynarray binary; util_dynarray_init(&binary, NULL); + pan_shader_preprocess(b.shader, inputs.gpu_id); GENX(pan_shader_compile)(b.shader, &inputs, &binary, &shader_info); ralloc_free(b.shader); diff --git a/src/panfrost/lib/pan_shader.c b/src/panfrost/lib/pan_shader.c index eb683de1f3e..afe1ccfae4f 100644 --- a/src/panfrost/lib/pan_shader.c +++ b/src/panfrost/lib/pan_shader.c @@ -83,6 +83,27 @@ GENX(pan_fixup_blend_type)(nir_alu_type T_size, enum pipe_format format) #endif #endif +/* This is only needed on Midgard. It's the same on both v4 and v5, so only + * compile once to avoid the GenXML dependency for calls. + */ +#if PAN_ARCH == 5 +uint8_t +pan_raw_format_mask_midgard(enum pipe_format *formats) +{ + uint8_t out = 0; + + for (unsigned i = 0; i < 8; i++) { + enum pipe_format fmt = formats[i]; + unsigned wb_fmt = panfrost_blendable_formats_v6[fmt].writeback; + + if (wb_fmt < MALI_COLOR_FORMAT_R8) + out |= BITFIELD_BIT(i); + } + + return out; +} +#endif + void GENX(pan_shader_compile)(nir_shader *s, struct panfrost_compile_inputs *inputs, struct util_dynarray *binary, @@ -93,14 +114,6 @@ GENX(pan_shader_compile)(nir_shader *s, struct panfrost_compile_inputs *inputs, #if PAN_ARCH >= 6 bifrost_compile_shader_nir(s, inputs, binary, info); #else - for (unsigned i = 0; i < ARRAY_SIZE(inputs->rt_formats); i++) { - enum pipe_format fmt = inputs->rt_formats[i]; - unsigned wb_fmt = panfrost_blendable_formats_v6[fmt].writeback; - - if (wb_fmt < MALI_COLOR_FORMAT_R8) - inputs->raw_fmt_mask |= BITFIELD_BIT(i); - } - midgard_compile_shader_nir(s, inputs, binary, info); #endif diff --git a/src/panfrost/lib/pan_shader.h b/src/panfrost/lib/pan_shader.h index 406db3d37ce..df955affaf6 100644 --- a/src/panfrost/lib/pan_shader.h +++ b/src/panfrost/lib/pan_shader.h @@ -34,6 +34,20 @@ struct panfrost_device; +void bifrost_preprocess_nir(nir_shader *nir, unsigned gpu_id); +void midgard_preprocess_nir(nir_shader *nir, unsigned gpu_id); + +static inline void +pan_shader_preprocess(nir_shader *nir, unsigned gpu_id) +{ + if (pan_arch(gpu_id) >= 6) + bifrost_preprocess_nir(nir, gpu_id); + else + midgard_preprocess_nir(nir, gpu_id); +} + +uint8_t pan_raw_format_mask_midgard(enum pipe_format *formats); + #ifdef PAN_ARCH const nir_shader_compiler_options *GENX(pan_shader_get_compiler_options)(void); diff --git a/src/panfrost/midgard/midgard_compile.c b/src/panfrost/midgard/midgard_compile.c index 35e40ac204f..2081c40f1f1 100644 --- a/src/panfrost/midgard/midgard_compile.c +++ b/src/panfrost/midgard/midgard_compile.c @@ -40,7 +40,6 @@ #include "util/u_dynarray.h" #include "util/u_math.h" -#include "panfrost/util/pan_lower_framebuffer.h" #include "compiler.h" #include "helpers.h" #include "midgard.h" @@ -330,10 +329,9 @@ midgard_vectorize_filter(const nir_instr *instr, const void *data) } void -midgard_preprocess_nir(nir_shader *nir, - const struct panfrost_compile_inputs *inputs) +midgard_preprocess_nir(nir_shader *nir, unsigned gpu_id) { - unsigned quirks = midgard_get_quirks(inputs->gpu_id); + unsigned quirks = midgard_get_quirks(gpu_id); /* Lower gl_Position pre-optimisation, but after lowering vars to ssa * (so we don't accidentally duplicate the epilogue since mesa/st has @@ -391,10 +389,9 @@ midgard_preprocess_nir(nir_shader *nir, NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options); /* TEX_GRAD fails to apply sampler descriptor settings on some - * implementations, requiring a lowering. However, blit shaders do not - * use the affected settings and should skip the workaround. + * implementations, requiring a lowering. */ - if ((quirks & MIDGARD_BROKEN_LOD) && !inputs->is_blit) + if (quirks & MIDGARD_BROKEN_LOD) NIR_PASS_V(nir, midgard_nir_lod_errata); /* Midgard image ops coordinates are 16-bit instead of 32-bit */ @@ -417,12 +414,6 @@ midgard_preprocess_nir(nir_shader *nir, NIR_PASS_V(nir, nir_lower_alu_to_scalar, mdg_should_scalarize, NULL); NIR_PASS_V(nir, nir_lower_flrp, 16 | 32 | 64, false /* always_precise */); NIR_PASS_V(nir, nir_lower_var_copies); - - if (nir->info.stage == MESA_SHADER_FRAGMENT) { - NIR_PASS_V(nir, pan_lower_framebuffer, inputs->rt_formats, - inputs->raw_fmt_mask, inputs->is_blend, - quirks & MIDGARD_BROKEN_BLEND_LOADS); - } } static void @@ -3177,8 +3168,6 @@ midgard_compile_shader_nir(nir_shader *nir, ctx->ssa_constants = _mesa_hash_table_u64_create(ctx); - midgard_preprocess_nir(nir, inputs); - /* Collect varyings after lowering I/O */ pan_nir_collect_varyings(nir, info); diff --git a/src/panfrost/midgard/midgard_compile.h b/src/panfrost/midgard/midgard_compile.h index 0ea6eadc304..d2a1cd03931 100644 --- a/src/panfrost/midgard/midgard_compile.h +++ b/src/panfrost/midgard/midgard_compile.h @@ -29,8 +29,7 @@ #include "panfrost/util/pan_ir.h" #include "util/u_dynarray.h" -void midgard_preprocess_nir(nir_shader *nir, - const struct panfrost_compile_inputs *inputs); +void midgard_preprocess_nir(nir_shader *nir, unsigned gpu_id); void midgard_compile_shader_nir(nir_shader *nir, const struct panfrost_compile_inputs *inputs, diff --git a/src/panfrost/util/pan_ir.h b/src/panfrost/util/pan_ir.h index 41fdcd15a36..97df6ebb843 100644 --- a/src/panfrost/util/pan_ir.h +++ b/src/panfrost/util/pan_ir.h @@ -184,7 +184,6 @@ struct panfrost_compile_inputs { bool no_ubo_to_push; enum pipe_format rt_formats[8]; - uint8_t raw_fmt_mask; /* Used on Valhall. * @@ -198,7 +197,6 @@ struct panfrost_compile_inputs { union { struct { - bool static_rt_conv; uint32_t rt_conv[8]; } bifrost; }; diff --git a/src/panfrost/vulkan/panvk_vX_meta_clear.c b/src/panfrost/vulkan/panvk_vX_meta_clear.c index cc2874823e9..32f8e64d2a3 100644 --- a/src/panfrost/vulkan/panvk_vX_meta_clear.c +++ b/src/panfrost/vulkan/panvk_vX_meta_clear.c @@ -61,6 +61,7 @@ panvk_meta_clear_color_attachment_shader(struct panfrost_device *pdev, struct util_dynarray binary; util_dynarray_init(&binary, NULL); + pan_shader_preprocess(b.shader, inputs.gpu_id); GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info); shader_info->push.count = 4; diff --git a/src/panfrost/vulkan/panvk_vX_meta_copy.c b/src/panfrost/vulkan/panvk_vX_meta_copy.c index 3bb2218df87..3aec1f92654 100644 --- a/src/panfrost/vulkan/panvk_vX_meta_copy.c +++ b/src/panfrost/vulkan/panvk_vX_meta_copy.c @@ -449,17 +449,11 @@ panvk_meta_copy_img2img_shader(struct panfrost_device *pdev, .no_ubo_to_push = true, }; - pan_pack(&inputs.bifrost.rt_conv[0], INTERNAL_CONVERSION, cfg) { - cfg.memory_format = (dstcompsz == 2 ? MALI_RG16UI : MALI_RG32UI) << 12; - cfg.register_format = dstcompsz == 2 ? - MALI_REGISTER_FILE_FORMAT_U16 : - MALI_REGISTER_FILE_FORMAT_U32; - } - inputs.bifrost.static_rt_conv = true; - struct util_dynarray binary; util_dynarray_init(&binary, NULL); + pan_shader_preprocess(b.shader, inputs.gpu_id); + NIR_PASS_V(b.shader, GENX(pan_inline_rt_conversion), pdev, &dstfmt); GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info); shader_info->fs.sample_shading = is_ms; @@ -984,17 +978,14 @@ panvk_meta_copy_buf2img_shader(struct panfrost_device *pdev, .no_ubo_to_push = true, }; - pan_pack(&inputs.bifrost.rt_conv[0], INTERNAL_CONVERSION, cfg) { - cfg.memory_format = (imgcompsz == 2 ? MALI_RG16UI : MALI_RG32UI) << 12; - cfg.register_format = imgcompsz == 2 ? - MALI_REGISTER_FILE_FORMAT_U16 : - MALI_REGISTER_FILE_FORMAT_U32; - } - inputs.bifrost.static_rt_conv = true; - struct util_dynarray binary; util_dynarray_init(&binary, NULL); + pan_shader_preprocess(b.shader, inputs.gpu_id); + + enum pipe_format rt_formats[8] = {key.imgfmt}; + NIR_PASS_V(b.shader, GENX(pan_inline_rt_conversion), pdev, rt_formats); + GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info); shader_info->push.count = DIV_ROUND_UP(sizeof(struct panvk_meta_copy_buf2img_info), 4); @@ -1434,6 +1425,7 @@ panvk_meta_copy_img2buf_shader(struct panfrost_device *pdev, struct util_dynarray binary; util_dynarray_init(&binary, NULL); + pan_shader_preprocess(b.shader, inputs.gpu_id); GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info); shader_info->push.count = DIV_ROUND_UP(sizeof(struct panvk_meta_copy_img2buf_info), 4); @@ -1662,6 +1654,7 @@ panvk_meta_copy_buf2buf_shader(struct panfrost_device *pdev, struct util_dynarray binary; util_dynarray_init(&binary, NULL); + pan_shader_preprocess(b.shader, inputs.gpu_id); GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info); shader_info->push.count = DIV_ROUND_UP(sizeof(struct panvk_meta_copy_buf2buf_info), 4); @@ -1791,6 +1784,7 @@ panvk_meta_fill_buf_shader(struct panfrost_device *pdev, struct util_dynarray binary; util_dynarray_init(&binary, NULL); + pan_shader_preprocess(b.shader, inputs.gpu_id); GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info); shader_info->push.count = DIV_ROUND_UP(sizeof(struct panvk_meta_fill_buf_info), 4); diff --git a/src/panfrost/vulkan/panvk_vX_shader.c b/src/panfrost/vulkan/panvk_vX_shader.c index fde8445ca98..6061cd1ed2a 100644 --- a/src/panfrost/vulkan/panvk_vX_shader.c +++ b/src/panfrost/vulkan/panvk_vX_shader.c @@ -153,10 +153,6 @@ panvk_lower_blend(struct panfrost_device *pdev, rt_state->equation.alpha_dst_factor = BLEND_FACTOR_ZERO; rt_state->equation.alpha_invert_dst_factor = false; lower_blend = true; - - inputs->bifrost.static_rt_conv = true; - inputs->bifrost.rt_conv[rt] = - GENX(pan_blend_get_internal_desc)(pdev, fmt, rt, 32, false) >> 32; } if (lower_blend) { @@ -371,6 +367,17 @@ panvk_per_arch(shader_create)(struct panvk_device *dev, nir_print_shader(nir, stderr); } + pan_shader_preprocess(nir, inputs.gpu_id); + + if (stage == MESA_SHADER_FRAGMENT) { + enum pipe_format rt_formats[MAX_RTS] = {PIPE_FORMAT_NONE}; + + for (unsigned rt = 0; rt < MAX_RTS; ++rt) + rt_formats[rt] = blend_state->rts[rt].format; + + NIR_PASS_V(nir, GENX(pan_inline_rt_conversion), pdev, rt_formats); + } + GENX(pan_shader_compile)(nir, &inputs, &shader->binary, &shader->info); /* System values shouldn't have changed */