From 02923e237d3c1b52ce833eb68e2ff990906645e0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?=
Date: Sat, 15 Jun 2024 00:36:12 -0400
Subject: [PATCH] nir: add hole_size parameter into the vectorize callback

It will be used to allow merging loads with a hole between them.

Reviewed-by: Qiang Yu
Reviewed-by: Alyssa Rosenzweig
Reviewed-by: Kenneth Graunke
Reviewed-by: Rhys Perry
Part-of:
---
 src/amd/common/ac_shader_util.c                        | 4 ++--
 src/amd/common/ac_shader_util.h                        | 4 ++--
 src/asahi/compiler/agx_compile.c                       | 7 +++++--
 src/broadcom/compiler/nir_to_vir.c                     | 4 ++++
 src/compiler/nir/nir.h                                 | 3 +++
 src/compiler/nir/nir_lower_shader_calls.c              | 3 ++-
 src/compiler/nir/nir_opt_load_store_vectorize.c        | 2 +-
 src/compiler/nir/tests/load_store_vectorizer_tests.cpp | 7 +++++--
 src/freedreno/ir3/ir3_nir.c                            | 5 ++++-
 src/gallium/auxiliary/nir/nir_to_tgsi.c                | 4 +++-
 src/intel/compiler/brw_nir.c                           | 3 ++-
 src/intel/compiler/brw_nir.h                           | 1 +
 src/intel/compiler/elk/elk_nir.c                       | 5 +++--
 src/intel/compiler/elk/elk_nir.h                       | 7 -------
 src/microsoft/compiler/nir_to_dxil.c                   | 3 ++-
 src/nouveau/codegen/nv50_ir_from_nir.cpp               | 5 +++++
 src/nouveau/compiler/nak_nir.c                         | 7 +++++--
 src/panfrost/compiler/bifrost_compile.c                | 8 ++++++--
 18 files changed, 55 insertions(+), 27 deletions(-)

diff --git a/src/amd/common/ac_shader_util.c b/src/amd/common/ac_shader_util.c
index ade410ee391..171b6738a53 100644
--- a/src/amd/common/ac_shader_util.c
+++ b/src/amd/common/ac_shader_util.c
@@ -113,10 +113,10 @@ void ac_set_nir_options(struct radeon_info *info, bool use_llvm,
 }
 
 bool ac_nir_mem_vectorize_callback(unsigned align_mul, unsigned align_offset, unsigned bit_size,
-                                   unsigned num_components, nir_intrinsic_instr *low,
+                                   unsigned num_components, unsigned hole_size, nir_intrinsic_instr *low,
                                    nir_intrinsic_instr *high, void *data)
 {
-   if (num_components > 4)
+   if (num_components > 4 || hole_size)
       return false;
 
    bool is_scratch = false;
diff --git a/src/amd/common/ac_shader_util.h b/src/amd/common/ac_shader_util.h
index d8b6e9b78c0..d2b024bedef 100644
--- a/src/amd/common/ac_shader_util.h
+++ b/src/amd/common/ac_shader_util.h
@@ -243,8 +243,8 @@ void ac_set_nir_options(struct radeon_info *info, bool use_llvm,
                         nir_shader_compiler_options *options);
 
 bool ac_nir_mem_vectorize_callback(unsigned align_mul, unsigned align_offset, unsigned bit_size,
-                                   unsigned num_components, nir_intrinsic_instr *low,
-                                   nir_intrinsic_instr *high, void *data);
+                                   unsigned num_components, unsigned hole_size,
+                                   nir_intrinsic_instr *low, nir_intrinsic_instr *high, void *data);
 
 unsigned ac_get_spi_shader_z_format(bool writes_z, bool writes_stencil, bool writes_samplemask,
                                     bool writes_mrt0_alpha);
diff --git a/src/asahi/compiler/agx_compile.c b/src/asahi/compiler/agx_compile.c
index d7a9f0ea738..bcb76ab810e 100644
--- a/src/asahi/compiler/agx_compile.c
+++ b/src/asahi/compiler/agx_compile.c
@@ -2795,9 +2795,12 @@ agx_optimize_loop_nir(nir_shader *nir)
 
 static bool
 mem_vectorize_cb(unsigned align_mul, unsigned align_offset, unsigned bit_size,
-                 unsigned num_components, nir_intrinsic_instr *low,
-                 nir_intrinsic_instr *high, void *data)
+                 unsigned num_components, unsigned hole_size,
+                 nir_intrinsic_instr *low, nir_intrinsic_instr *high, void *data)
 {
+   if (hole_size)
+      return false;
+
    /* Must be aligned to the size of the load */
    unsigned align = nir_combined_align(align_mul, align_offset);
    if ((bit_size / 8) > align)
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index 5e38522f5c5..bc225e6dba4 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -2084,10 +2084,14 @@ static bool
 mem_vectorize_callback(unsigned align_mul, unsigned align_offset,
                        unsigned bit_size,
                        unsigned num_components,
+                       unsigned hole_size,
                        nir_intrinsic_instr *low,
                        nir_intrinsic_instr *high,
                        void *data)
 {
+        if (hole_size)
+                return false;
+
         /* TMU general access only supports 32-bit vectors */
         if (bit_size > 32)
                 return false;
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 1b0304d5401..cb4df615e4e 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -5896,6 +5896,9 @@ typedef bool (*nir_should_vectorize_mem_func)(unsigned align_mul,
                                               unsigned align_offset,
                                               unsigned bit_size,
                                               unsigned num_components,
+                                              /* The hole between low and
+                                               * high if they are not adjacent. */
+                                              unsigned hole_size,
                                               nir_intrinsic_instr *low,
                                               nir_intrinsic_instr *high,
                                               void *data);
diff --git a/src/compiler/nir/nir_lower_shader_calls.c b/src/compiler/nir/nir_lower_shader_calls.c
index 1e0ecb10a81..245d6dd4c19 100644
--- a/src/compiler/nir/nir_lower_shader_calls.c
+++ b/src/compiler/nir/nir_lower_shader_calls.c
@@ -1920,6 +1920,7 @@ should_vectorize(unsigned align_mul,
                  unsigned align_offset,
                  unsigned bit_size,
                  unsigned num_components,
+                 unsigned hole_size,
                  nir_intrinsic_instr *low,
                  nir_intrinsic_instr *high,
                  void *data)
 {
@@ -1933,7 +1934,7 @@ should_vectorize(unsigned align_mul,
 
    struct stack_op_vectorizer_state *state = data;
 
    return state->driver_callback(align_mul, align_offset,
-                                 bit_size, num_components,
+                                 bit_size, num_components, hole_size,
                                  low, high, state->driver_data);
 }
diff --git a/src/compiler/nir/nir_opt_load_store_vectorize.c b/src/compiler/nir/nir_opt_load_store_vectorize.c
index ede9fe8a243..0b666aa716d 100644
--- a/src/compiler/nir/nir_opt_load_store_vectorize.c
+++ b/src/compiler/nir/nir_opt_load_store_vectorize.c
@@ -642,7 +642,7 @@ new_bitsize_acceptable(struct vectorize_ctx *ctx, unsigned new_bit_size,
 
    if (!ctx->options->callback(low->align_mul,
                                low->align_offset,
-                               new_bit_size, new_num_components,
+                               new_bit_size, new_num_components, 0,
                                low->intrin, high->intrin,
                                ctx->options->cb_data))
       return false;
diff --git a/src/compiler/nir/tests/load_store_vectorizer_tests.cpp b/src/compiler/nir/tests/load_store_vectorizer_tests.cpp
index ff707f7a6d2..72db7ad0142 100644
--- a/src/compiler/nir/tests/load_store_vectorizer_tests.cpp
+++ b/src/compiler/nir/tests/load_store_vectorizer_tests.cpp
@@ -71,7 +71,7 @@ protected:
    static bool mem_vectorize_callback(unsigned align_mul, unsigned align_offset,
                                       unsigned bit_size,
-                                      unsigned num_components,
+                                      unsigned num_components, unsigned hole_size,
                                       nir_intrinsic_instr *low,
                                       nir_intrinsic_instr *high, void *data);
    static void shared_type_info(const struct glsl_type *type, unsigned *size,
                                 unsigned *align);
@@ -336,10 +336,13 @@ bool nir_load_store_vectorize_test::test_alu_def(
 
 bool nir_load_store_vectorize_test::mem_vectorize_callback(
    unsigned align_mul, unsigned align_offset, unsigned bit_size,
-   unsigned num_components,
+   unsigned num_components, unsigned hole_size,
    nir_intrinsic_instr *low, nir_intrinsic_instr *high, void *data)
 {
+   if (hole_size)
+      return false;
+
    /* Calculate a simple alignment, like how nir_intrinsic_align() does. */
    uint32_t align = align_mul;
    if (align_offset)
diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c
index 899b6e9a7ed..cb5e9fdab79 100644
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -110,9 +110,12 @@ ir3_nir_should_scalarize_mem(const nir_instr *instr, const void *data)
 
 static bool
 ir3_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
                              unsigned bit_size, unsigned num_components,
-                             nir_intrinsic_instr *low,
+                             unsigned hole_size, nir_intrinsic_instr *low,
                              nir_intrinsic_instr *high, void *data)
 {
+   if (hole_size)
+      return false;
+
    struct ir3_compiler *compiler = data;
    unsigned byte_size = bit_size / 8;
diff --git a/src/gallium/auxiliary/nir/nir_to_tgsi.c b/src/gallium/auxiliary/nir/nir_to_tgsi.c
index af3a9920349..0701c0d44c5 100644
--- a/src/gallium/auxiliary/nir/nir_to_tgsi.c
+++ b/src/gallium/auxiliary/nir/nir_to_tgsi.c
@@ -3265,13 +3265,15 @@ ntt_should_vectorize_instr(const nir_instr *instr, const void *data)
    return 4;
 }
 
+/* TODO: These parameters are wrong. */
 static bool
 ntt_should_vectorize_io(unsigned align, unsigned bit_size,
                         unsigned num_components, unsigned high_offset,
+                        unsigned hole_size,
                         nir_intrinsic_instr *low, nir_intrinsic_instr *high,
                         void *data)
 {
-   if (bit_size != 32)
+   if (bit_size != 32 || hole_size)
       return false;
 
    /* Our offset alignment should aways be at least 4 bytes */
diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index 7035c244d98..5f11f53127a 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -1411,6 +1411,7 @@ bool
 brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
                              unsigned bit_size,
                              unsigned num_components,
+                             unsigned hole_size,
                              nir_intrinsic_instr *low,
                              nir_intrinsic_instr *high,
                              void *data)
@@ -1419,7 +1420,7 @@ brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
     * those back into 32-bit ones anyway and UBO loads aren't split in NIR so
     * we don't want to make a mess for the back-end.
     */
-   if (bit_size > 32)
+   if (bit_size > 32 || hole_size)
       return false;
 
    if (low->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel ||
diff --git a/src/intel/compiler/brw_nir.h b/src/intel/compiler/brw_nir.h
index a1f72a28392..169062f1c04 100644
--- a/src/intel/compiler/brw_nir.h
+++ b/src/intel/compiler/brw_nir.h
@@ -233,6 +233,7 @@ enum brw_reg_type brw_type_for_nir_type(const struct intel_device_info *devinfo,
 bool brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
                                   unsigned bit_size,
                                   unsigned num_components,
+                                  unsigned hole_size,
                                   nir_intrinsic_instr *low,
                                   nir_intrinsic_instr *high,
                                   void *data);
diff --git a/src/intel/compiler/elk/elk_nir.c b/src/intel/compiler/elk/elk_nir.c
index d0ab5888900..3be383a85c0 100644
--- a/src/intel/compiler/elk/elk_nir.c
+++ b/src/intel/compiler/elk/elk_nir.c
@@ -1124,10 +1124,11 @@ elk_nir_link_shaders(const struct elk_compiler *compiler,
    }
 }
 
-bool
+static bool
 elk_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
                              unsigned bit_size,
                              unsigned num_components,
+                             unsigned hole_size,
                              nir_intrinsic_instr *low,
                              nir_intrinsic_instr *high,
                              void *data)
@@ -1136,7 +1137,7 @@ elk_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
     * those back into 32-bit ones anyway and UBO loads aren't split in NIR so
     * we don't want to make a mess for the back-end.
     */
-   if (bit_size > 32)
+   if (bit_size > 32 || hole_size)
       return false;
 
    if (low->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel ||
diff --git a/src/intel/compiler/elk/elk_nir.h b/src/intel/compiler/elk/elk_nir.h
index f621fdcda19..8575a07b8f8 100644
--- a/src/intel/compiler/elk/elk_nir.h
+++ b/src/intel/compiler/elk/elk_nir.h
@@ -248,13 +248,6 @@ void elk_nir_apply_key(nir_shader *nir,
 unsigned elk_nir_api_subgroup_size(const nir_shader *nir,
                                    unsigned hw_subgroup_size);
 
-bool elk_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
-                                  unsigned bit_size,
-                                  unsigned num_components,
-                                  nir_intrinsic_instr *low,
-                                  nir_intrinsic_instr *high,
-                                  void *data);
-
 void elk_nir_analyze_ubo_ranges(const struct elk_compiler *compiler,
                                 nir_shader *nir,
                                 struct elk_ubo_range out_ranges[4]);
diff --git a/src/microsoft/compiler/nir_to_dxil.c b/src/microsoft/compiler/nir_to_dxil.c
index edbe0aa81d7..6df5a379805 100644
--- a/src/microsoft/compiler/nir_to_dxil.c
+++ b/src/microsoft/compiler/nir_to_dxil.c
@@ -6229,10 +6229,11 @@ vectorize_filter(
    unsigned align_offset,
    unsigned bit_size,
    unsigned num_components,
+   unsigned hole_size,
    nir_intrinsic_instr *low, nir_intrinsic_instr *high,
    void *data)
 {
-   return util_is_power_of_two_nonzero(num_components);
+   return !hole_size && util_is_power_of_two_nonzero(num_components);
 }
 
 struct lower_mem_bit_sizes_data {
diff --git a/src/nouveau/codegen/nv50_ir_from_nir.cpp b/src/nouveau/codegen/nv50_ir_from_nir.cpp
index 9d8f48fcedf..47d3d650020 100644
--- a/src/nouveau/codegen/nv50_ir_from_nir.cpp
+++ b/src/nouveau/codegen/nv50_ir_from_nir.cpp
@@ -138,6 +138,7 @@ private:
                               unsigned align_offset,
                               unsigned bit_size,
                               unsigned num_components,
+                              unsigned hole_size,
                               nir_intrinsic_instr *low,
                               nir_intrinsic_instr *high,
                               void *cb_data);
@@ -1369,10 +1370,14 @@ Converter::memVectorizeCb(unsigned align_mul,
                           unsigned align_offset,
                           unsigned bit_size,
                           unsigned num_components,
+                          unsigned hole_size,
                           nir_intrinsic_instr *low,
                           nir_intrinsic_instr *high,
                           void *cb_data)
 {
+   if (hole_size)
+      return false;
+
    /*
     * Since we legalize these later with nir_lower_mem_access_bit_sizes,
     * we can optimistically combine anything that might be profitable
diff --git a/src/nouveau/compiler/nak_nir.c b/src/nouveau/compiler/nak_nir.c
index 098a60d4ece..7476c932e69 100644
--- a/src/nouveau/compiler/nak_nir.c
+++ b/src/nouveau/compiler/nak_nir.c
@@ -799,8 +799,8 @@ nak_nir_remove_barriers(nir_shader *nir)
 static bool
 nak_mem_vectorize_cb(unsigned align_mul, unsigned align_offset,
                      unsigned bit_size, unsigned num_components,
-                     nir_intrinsic_instr *low, nir_intrinsic_instr *high,
-                     void *cb_data)
+                     unsigned hole_size, nir_intrinsic_instr *low,
+                     nir_intrinsic_instr *high, void *cb_data)
 {
    /*
    * Since we legalize these later with nir_lower_mem_access_bit_sizes,
@@ -808,6 +808,9 @@ nak_mem_vectorize_cb(unsigned align_mul, unsigned align_offset,
    */
    assert(util_is_power_of_two_nonzero(align_mul));
 
+   if (hole_size)
+      return false;
+
    unsigned max_bytes = 128u / 8u;
    if (low->intrinsic == nir_intrinsic_ldc_nv ||
        low->intrinsic == nir_intrinsic_ldcx_nv)
diff --git a/src/panfrost/compiler/bifrost_compile.c b/src/panfrost/compiler/bifrost_compile.c
index eaea02848a2..248d0c55b48 100644
--- a/src/panfrost/compiler/bifrost_compile.c
+++ b/src/panfrost/compiler/bifrost_compile.c
@@ -4632,9 +4632,13 @@ mem_access_size_align_cb(nir_intrinsic_op intrin, uint8_t bytes,
 
 static bool
 mem_vectorize_cb(unsigned align_mul, unsigned align_offset, unsigned bit_size,
-                 unsigned num_components, nir_intrinsic_instr *low,
-                 nir_intrinsic_instr *high, void *data)
+                 unsigned num_components, unsigned hole_size,
+                 nir_intrinsic_instr *low, nir_intrinsic_instr *high,
+                 void *data)
 {
+   if (hole_size)
+      return false;
+
    /* Must be aligned to the size of the load */
    unsigned align = nir_combined_align(align_mul, align_offset);
    if ((bit_size / 8) > align)
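
Illustration (not part of the patch): within this change the core pass still
passes 0 for hole_size (see new_bitsize_acceptable above) and every driver
callback rejects non-zero holes; the commit message says holes will only start
being reported later, to allow merging loads with a gap between them. A driver
callback could then opt in along the lines below. The function name and the
thresholds are hypothetical; only the nir_should_vectorize_mem_func signature,
nir_combined_align() and nir_intrinsic_infos come from NIR itself.

#include "nir.h"

/* Hypothetical driver callback for nir_opt_load_store_vectorize(). Loads may
 * be merged across a small hole because the extra lanes are fetched and then
 * ignored; stores never may, since that would also write the bytes in the
 * hole. The pass only pairs loads with loads and stores with stores, so
 * checking "low" is enough: intrinsics with a destination are loads. */
static bool
example_mem_vectorize_cb(unsigned align_mul, unsigned align_offset,
                         unsigned bit_size, unsigned num_components,
                         unsigned hole_size, nir_intrinsic_instr *low,
                         nir_intrinsic_instr *high, void *data)
{
   /* Keep the merged access within a 128-bit vector (assumed limit). */
   if (bit_size * num_components > 128)
      return false;

   if (hole_size) {
      /* Reject any hole on stores. */
      if (!nir_intrinsic_infos[low->intrinsic].has_dest)
         return false;

      /* Tolerate at most one dword of over-fetch (assumed threshold). */
      if (hole_size > 4)
         return false;
   }

   /* Require natural alignment of a single component. */
   return nir_combined_align(align_mul, align_offset) >= bit_size / 8u;
}

Such a callback would be plugged in through the callback and cb_data fields of
the vectorizer options, exactly like the per-driver callbacks updated above.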