panfrost: Preprocess shaders in the driver

This is a flag-day change to how we compile. We split preprocessing NIR into a
separate step from compiling, giving the driver a chance to apply its own
lowerings on the preprocessed NIR before the final optimization loop. During
that time, the different producers of NIR (panfrost, panvk, blend shaders, blit
shaders...) will be able to (differently) lower system values.

Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com>
Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20906>
This commit is contained in:
Alyssa Rosenzweig 2023-02-06 17:23:19 -05:00 committed by Marge Bot
parent 2a356cefba
commit ca2042f359
15 changed files with 133 additions and 66 deletions

View file

@ -77,6 +77,15 @@ panfrost_shader_compile(struct panfrost_screen *screen, const nir_shader *ir,
nir_shader *s = nir_shader_clone(NULL, ir);
/* While graphics shaders are preprocessed at CSO create time, compute
* kernels are not preprocessed until they're cloned since the driver does
* not get ownership of the NIR from compute CSOs. Do this preprocessing now.
* Compute CSOs call this function during create time, so preprocessing
* happens at CSO create time regardless.
*/
if (gl_shader_stage_is_compute(s->info.stage))
pan_shader_preprocess(s, dev->gpu_id);
struct panfrost_compile_inputs inputs = {
.debug = dbg,
.gpu_id = dev->gpu_id,
@ -109,6 +118,14 @@ panfrost_shader_compile(struct panfrost_screen *screen, const nir_shader *ir,
}
util_dynarray_init(&out->binary, NULL);
pan_shader_preprocess(s, inputs.gpu_id);
if (dev->arch <= 5 && s->info.stage == MESA_SHADER_FRAGMENT) {
NIR_PASS_V(s, pan_lower_framebuffer, key->fs.rt_formats,
pan_raw_format_mask_midgard(key->fs.rt_formats), false,
dev->gpu_id < 0x700);
}
screen->vtbl.compile_shader(s, &inputs, &out->binary, &out->info);
assert(req_local_mem >= out->info.wls_size);

View file

@ -4716,7 +4716,7 @@ bi_lower_sample_mask_writes(nir_builder *b, nir_instr *instr, void *data)
}
static bool
bi_lower_load_output(nir_builder *b, nir_instr *instr, void *data)
bi_lower_load_output(nir_builder *b, nir_instr *instr, UNUSED void *data)
{
if (instr->type != nir_instr_type_intrinsic)
return false;
@ -4734,15 +4734,6 @@ bi_lower_load_output(nir_builder *b, nir_instr *instr, void *data)
nir_ssa_def *conversion = nir_load_rt_conversion_pan(
b, .base = rt, .src_type = nir_intrinsic_dest_type(intr));
/* TODO: This should be optimized/lowered by the driver */
const struct panfrost_compile_inputs *inputs = data;
if (inputs->is_blend) {
conversion = nir_imm_int(b, inputs->blend.bifrost_blend_desc >> 32);
} else if (inputs->bifrost.static_rt_conv) {
conversion = nir_imm_int(b, inputs->bifrost.rt_conv[rt]);
}
nir_ssa_def *lowered = nir_load_converted_output_pan(
b, nir_dest_num_components(intr->dest), nir_dest_bit_size(intr->dest),
conversion, .dest_type = nir_intrinsic_dest_type(intr),
@ -4753,8 +4744,7 @@ bi_lower_load_output(nir_builder *b, nir_instr *instr, void *data)
}
void
bifrost_preprocess_nir(nir_shader *nir,
const struct panfrost_compile_inputs *inputs)
bifrost_preprocess_nir(nir_shader *nir, unsigned gpu_id)
{
/* Lower gl_Position pre-optimisation, but after lowering vars to ssa
* (so we don't accidentally duplicate the epilogue since mesa/st has
@ -4781,7 +4771,7 @@ bifrost_preprocess_nir(nir_shader *nir,
* (currently unconditional for Valhall), we force vec4 alignment for
* scratch access.
*/
bool packed_tls = (inputs->gpu_id >= 0x9000);
bool packed_tls = (gpu_id >= 0x9000);
/* Lower large arrays to scratch and small arrays to bcsel */
NIR_PASS_V(nir, nir_lower_vars_to_scratch, nir_var_function_temp, 256,
@ -4810,10 +4800,9 @@ bifrost_preprocess_nir(nir_shader *nir,
nir_metadata_block_index | nir_metadata_dominance, NULL);
NIR_PASS_V(nir, nir_shader_instructions_pass, bi_lower_load_output,
nir_metadata_block_index | nir_metadata_dominance,
(void *)inputs);
nir_metadata_block_index | nir_metadata_dominance, NULL);
} else if (nir->info.stage == MESA_SHADER_VERTEX) {
if (inputs->gpu_id >= 0x9000) {
if (gpu_id >= 0x9000) {
NIR_PASS_V(nir, nir_lower_mediump_io, nir_var_shader_out,
BITFIELD64_BIT(VARYING_SLOT_PSIZ), false);
}
@ -5251,7 +5240,6 @@ bifrost_compile_shader_nir(nir_shader *nir,
{
bifrost_debug = debug_get_option_bifrost_debug();
bifrost_preprocess_nir(nir, inputs);
bi_optimize_nir(nir, inputs->gpu_id, inputs->is_blend);
struct hash_table_u64 *sysval_to_id =

View file

@ -28,8 +28,7 @@
#include "panfrost/util/pan_ir.h"
#include "util/u_dynarray.h"
void bifrost_preprocess_nir(nir_shader *nir,
const struct panfrost_compile_inputs *inputs);
void bifrost_preprocess_nir(nir_shader *nir, unsigned gpu_id);
void bifrost_compile_shader_nir(nir_shader *nir,
const struct panfrost_compile_inputs *inputs,

View file

@ -763,6 +763,42 @@ GENX(pan_blend_get_internal_desc)(const struct panfrost_device *dev,
return res;
}
/* Callback context for inline_rt_conversion: the device used to query blend
 * descriptors, and the per-render-target pixel formats to inline against.
 */
struct rt_conversion_inputs {
const struct panfrost_device *dev;
enum pipe_format *formats;
};
/* nir_shader_instructions_pass callback: replaces each
 * load_rt_conversion_pan intrinsic with an immediate built from the blend
 * internal descriptor for that render target's format. Returns true when the
 * instruction was rewritten.
 */
static bool
inline_rt_conversion(nir_builder *b, nir_instr *instr, void *data)
{
if (instr->type != nir_instr_type_intrinsic)
return false;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
if (intr->intrinsic != nir_intrinsic_load_rt_conversion_pan)
return false;
struct rt_conversion_inputs *inputs = data;
/* The intrinsic's base selects the render target; the source type's size
 * selects the register format used for the conversion. */
unsigned rt = nir_intrinsic_base(intr);
unsigned size = nir_alu_type_get_type_size(nir_intrinsic_src_type(intr));
uint64_t conversion = GENX(pan_blend_get_internal_desc)(
inputs->dev, inputs->formats[rt], rt, size, false);
/* Emit the immediate after the intrinsic, then redirect all users to it.
 * Only the high 32 bits of the internal descriptor are consumed here. */
b->cursor = nir_after_instr(instr);
nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_imm_int(b, conversion >> 32));
return true;
}
/* Inline render-target conversion descriptors into a shader: every
 * load_rt_conversion_pan intrinsic is replaced by an immediate derived from
 * the corresponding entry of formats[]. Returns true if the shader changed.
 */
bool
GENX(pan_inline_rt_conversion)(nir_shader *s, const struct panfrost_device *dev,
                               enum pipe_format *formats)
{
   struct rt_conversion_inputs pass_data = {
      .dev = dev,
      .formats = formats,
   };

   return nir_shader_instructions_pass(s, inline_rt_conversion,
                                       nir_metadata_block_index |
                                          nir_metadata_dominance,
                                       &pass_data);
}
#endif
struct pan_blend_shader_variant *
@ -843,6 +879,11 @@ GENX(pan_blend_get_shader_locked)(const struct panfrost_device *dev,
#endif
struct pan_shader_info info;
pan_shader_preprocess(nir, inputs.gpu_id);
#if PAN_ARCH >= 6
NIR_PASS_V(nir, GENX(pan_inline_rt_conversion), dev, inputs.rt_formats);
#endif
GENX(pan_shader_compile)(nir, &inputs, &variant->binary, &info);

View file

@ -161,6 +161,10 @@ nir_shader *GENX(pan_blend_create_shader)(const struct panfrost_device *dev,
uint64_t GENX(pan_blend_get_internal_desc)(const struct panfrost_device *dev,
enum pipe_format fmt, unsigned rt,
unsigned force_size, bool dithered);
bool GENX(pan_inline_rt_conversion)(nir_shader *s,
const struct panfrost_device *dev,
enum pipe_format *formats);
#endif
/* Take blend_shaders.lock before calling this function and release it when

View file

@ -631,6 +631,8 @@ pan_blitter_get_blit_shader(struct panfrost_device *dev,
for (unsigned i = 0; i < active_count; ++i)
BITSET_SET(b.shader->info.textures_used, i);
pan_shader_preprocess(b.shader, inputs.gpu_id);
if (PAN_ARCH == 4) {
NIR_PASS_V(b.shader, nir_shader_instructions_pass,
lower_sampler_parameters,

View file

@ -130,6 +130,7 @@ pan_indirect_dispatch_init(struct panfrost_device *dev)
struct util_dynarray binary;
util_dynarray_init(&binary, NULL);
pan_shader_preprocess(b.shader, inputs.gpu_id);
GENX(pan_shader_compile)(b.shader, &inputs, &binary, &shader_info);
ralloc_free(b.shader);

View file

@ -83,6 +83,27 @@ GENX(pan_fixup_blend_type)(nir_alu_type T_size, enum pipe_format format)
#endif
#endif
/* Only Midgard needs this mask, and the logic is identical on v4 and v5, so
 * compile it once (in the v5 build) to spare call sites a GenXML dependency.
 */
#if PAN_ARCH == 5
uint8_t
pan_raw_format_mask_midgard(enum pipe_format *formats)
{
   uint8_t raw_mask = 0;
   unsigned rt = 0;

   /* A render target counts as "raw" when its writeback encoding sorts
    * below MALI_COLOR_FORMAT_R8. */
   while (rt < 8) {
      enum pipe_format fmt = formats[rt];

      if (panfrost_blendable_formats_v6[fmt].writeback < MALI_COLOR_FORMAT_R8)
         raw_mask |= BITFIELD_BIT(rt);

      ++rt;
   }

   return raw_mask;
}
#endif
void
GENX(pan_shader_compile)(nir_shader *s, struct panfrost_compile_inputs *inputs,
struct util_dynarray *binary,
@ -93,14 +114,6 @@ GENX(pan_shader_compile)(nir_shader *s, struct panfrost_compile_inputs *inputs,
#if PAN_ARCH >= 6
bifrost_compile_shader_nir(s, inputs, binary, info);
#else
for (unsigned i = 0; i < ARRAY_SIZE(inputs->rt_formats); i++) {
enum pipe_format fmt = inputs->rt_formats[i];
unsigned wb_fmt = panfrost_blendable_formats_v6[fmt].writeback;
if (wb_fmt < MALI_COLOR_FORMAT_R8)
inputs->raw_fmt_mask |= BITFIELD_BIT(i);
}
midgard_compile_shader_nir(s, inputs, binary, info);
#endif

View file

@ -34,6 +34,20 @@
struct panfrost_device;
void bifrost_preprocess_nir(nir_shader *nir, unsigned gpu_id);
void midgard_preprocess_nir(nir_shader *nir, unsigned gpu_id);
/* Dispatch NIR preprocessing to the backend matching the GPU architecture:
 * Midgard for arch <= 5, Bifrost/Valhall otherwise. NIR producers run this
 * before applying their own lowerings and the final compile.
 */
static inline void
pan_shader_preprocess(nir_shader *nir, unsigned gpu_id)
{
   if (pan_arch(gpu_id) <= 5)
      midgard_preprocess_nir(nir, gpu_id);
   else
      bifrost_preprocess_nir(nir, gpu_id);
}
uint8_t pan_raw_format_mask_midgard(enum pipe_format *formats);
#ifdef PAN_ARCH
const nir_shader_compiler_options *GENX(pan_shader_get_compiler_options)(void);

View file

@ -40,7 +40,6 @@
#include "util/u_dynarray.h"
#include "util/u_math.h"
#include "panfrost/util/pan_lower_framebuffer.h"
#include "compiler.h"
#include "helpers.h"
#include "midgard.h"
@ -330,10 +329,9 @@ midgard_vectorize_filter(const nir_instr *instr, const void *data)
}
void
midgard_preprocess_nir(nir_shader *nir,
const struct panfrost_compile_inputs *inputs)
midgard_preprocess_nir(nir_shader *nir, unsigned gpu_id)
{
unsigned quirks = midgard_get_quirks(inputs->gpu_id);
unsigned quirks = midgard_get_quirks(gpu_id);
/* Lower gl_Position pre-optimisation, but after lowering vars to ssa
* (so we don't accidentally duplicate the epilogue since mesa/st has
@ -391,10 +389,9 @@ midgard_preprocess_nir(nir_shader *nir,
NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options);
/* TEX_GRAD fails to apply sampler descriptor settings on some
* implementations, requiring a lowering. However, blit shaders do not
* use the affected settings and should skip the workaround.
* implementations, requiring a lowering.
*/
if ((quirks & MIDGARD_BROKEN_LOD) && !inputs->is_blit)
if (quirks & MIDGARD_BROKEN_LOD)
NIR_PASS_V(nir, midgard_nir_lod_errata);
/* Midgard image ops coordinates are 16-bit instead of 32-bit */
@ -417,12 +414,6 @@ midgard_preprocess_nir(nir_shader *nir,
NIR_PASS_V(nir, nir_lower_alu_to_scalar, mdg_should_scalarize, NULL);
NIR_PASS_V(nir, nir_lower_flrp, 16 | 32 | 64, false /* always_precise */);
NIR_PASS_V(nir, nir_lower_var_copies);
if (nir->info.stage == MESA_SHADER_FRAGMENT) {
NIR_PASS_V(nir, pan_lower_framebuffer, inputs->rt_formats,
inputs->raw_fmt_mask, inputs->is_blend,
quirks & MIDGARD_BROKEN_BLEND_LOADS);
}
}
static void
@ -3177,8 +3168,6 @@ midgard_compile_shader_nir(nir_shader *nir,
ctx->ssa_constants = _mesa_hash_table_u64_create(ctx);
midgard_preprocess_nir(nir, inputs);
/* Collect varyings after lowering I/O */
pan_nir_collect_varyings(nir, info);

View file

@ -29,8 +29,7 @@
#include "panfrost/util/pan_ir.h"
#include "util/u_dynarray.h"
void midgard_preprocess_nir(nir_shader *nir,
const struct panfrost_compile_inputs *inputs);
void midgard_preprocess_nir(nir_shader *nir, unsigned gpu_id);
void midgard_compile_shader_nir(nir_shader *nir,
const struct panfrost_compile_inputs *inputs,

View file

@ -184,7 +184,6 @@ struct panfrost_compile_inputs {
bool no_ubo_to_push;
enum pipe_format rt_formats[8];
uint8_t raw_fmt_mask;
/* Used on Valhall.
*
@ -198,7 +197,6 @@ struct panfrost_compile_inputs {
union {
struct {
bool static_rt_conv;
uint32_t rt_conv[8];
} bifrost;
};

View file

@ -61,6 +61,7 @@ panvk_meta_clear_color_attachment_shader(struct panfrost_device *pdev,
struct util_dynarray binary;
util_dynarray_init(&binary, NULL);
pan_shader_preprocess(b.shader, inputs.gpu_id);
GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info);
shader_info->push.count = 4;

View file

@ -449,17 +449,11 @@ panvk_meta_copy_img2img_shader(struct panfrost_device *pdev,
.no_ubo_to_push = true,
};
pan_pack(&inputs.bifrost.rt_conv[0], INTERNAL_CONVERSION, cfg) {
cfg.memory_format = (dstcompsz == 2 ? MALI_RG16UI : MALI_RG32UI) << 12;
cfg.register_format = dstcompsz == 2 ?
MALI_REGISTER_FILE_FORMAT_U16 :
MALI_REGISTER_FILE_FORMAT_U32;
}
inputs.bifrost.static_rt_conv = true;
struct util_dynarray binary;
util_dynarray_init(&binary, NULL);
pan_shader_preprocess(b.shader, inputs.gpu_id);
NIR_PASS_V(b.shader, GENX(pan_inline_rt_conversion), pdev, &dstfmt);
GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info);
shader_info->fs.sample_shading = is_ms;
@ -984,17 +978,14 @@ panvk_meta_copy_buf2img_shader(struct panfrost_device *pdev,
.no_ubo_to_push = true,
};
pan_pack(&inputs.bifrost.rt_conv[0], INTERNAL_CONVERSION, cfg) {
cfg.memory_format = (imgcompsz == 2 ? MALI_RG16UI : MALI_RG32UI) << 12;
cfg.register_format = imgcompsz == 2 ?
MALI_REGISTER_FILE_FORMAT_U16 :
MALI_REGISTER_FILE_FORMAT_U32;
}
inputs.bifrost.static_rt_conv = true;
struct util_dynarray binary;
util_dynarray_init(&binary, NULL);
pan_shader_preprocess(b.shader, inputs.gpu_id);
enum pipe_format rt_formats[8] = {key.imgfmt};
NIR_PASS_V(b.shader, GENX(pan_inline_rt_conversion), pdev, rt_formats);
GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info);
shader_info->push.count = DIV_ROUND_UP(sizeof(struct panvk_meta_copy_buf2img_info), 4);
@ -1434,6 +1425,7 @@ panvk_meta_copy_img2buf_shader(struct panfrost_device *pdev,
struct util_dynarray binary;
util_dynarray_init(&binary, NULL);
pan_shader_preprocess(b.shader, inputs.gpu_id);
GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info);
shader_info->push.count = DIV_ROUND_UP(sizeof(struct panvk_meta_copy_img2buf_info), 4);
@ -1662,6 +1654,7 @@ panvk_meta_copy_buf2buf_shader(struct panfrost_device *pdev,
struct util_dynarray binary;
util_dynarray_init(&binary, NULL);
pan_shader_preprocess(b.shader, inputs.gpu_id);
GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info);
shader_info->push.count = DIV_ROUND_UP(sizeof(struct panvk_meta_copy_buf2buf_info), 4);
@ -1791,6 +1784,7 @@ panvk_meta_fill_buf_shader(struct panfrost_device *pdev,
struct util_dynarray binary;
util_dynarray_init(&binary, NULL);
pan_shader_preprocess(b.shader, inputs.gpu_id);
GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info);
shader_info->push.count = DIV_ROUND_UP(sizeof(struct panvk_meta_fill_buf_info), 4);

View file

@ -153,10 +153,6 @@ panvk_lower_blend(struct panfrost_device *pdev,
rt_state->equation.alpha_dst_factor = BLEND_FACTOR_ZERO;
rt_state->equation.alpha_invert_dst_factor = false;
lower_blend = true;
inputs->bifrost.static_rt_conv = true;
inputs->bifrost.rt_conv[rt] =
GENX(pan_blend_get_internal_desc)(pdev, fmt, rt, 32, false) >> 32;
}
if (lower_blend) {
@ -371,6 +367,17 @@ panvk_per_arch(shader_create)(struct panvk_device *dev,
nir_print_shader(nir, stderr);
}
pan_shader_preprocess(nir, inputs.gpu_id);
if (stage == MESA_SHADER_FRAGMENT) {
enum pipe_format rt_formats[MAX_RTS] = {PIPE_FORMAT_NONE};
for (unsigned rt = 0; rt < MAX_RTS; ++rt)
rt_formats[rt] = blend_state->rts[rt].format;
NIR_PASS_V(nir, GENX(pan_inline_rt_conversion), pdev, rt_formats);
}
GENX(pan_shader_compile)(nir, &inputs, &shader->binary, &shader->info);
/* System values shouldn't have changed */