nir/opt_load_store_vectorize: make hole_size signed to indicate overlapping loads

A negative hole size means the loads overlap. This will be used by drivers
to handle overlapping loads in the callback easily.

Reviewed-by: Mel Henning <drawoc@darkrefraction.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32699>
This commit is contained in:
Marek Olšák 2024-12-18 05:11:33 -05:00 committed by Marge Bot
parent 3ba3e00750
commit c21bc65ba7
18 changed files with 31 additions and 32 deletions

View file

@ -115,10 +115,10 @@ void ac_set_nir_options(struct radeon_info *info, bool use_llvm,
bool
ac_nir_mem_vectorize_callback(unsigned align_mul, unsigned align_offset, unsigned bit_size,
unsigned num_components, unsigned hole_size, nir_intrinsic_instr *low,
unsigned num_components, int64_t hole_size, nir_intrinsic_instr *low,
nir_intrinsic_instr *high, void *data)
{
if (num_components > 4 || hole_size)
if (num_components > 4 || hole_size > 0)
return false;
bool is_scratch = false;

View file

@ -245,7 +245,7 @@ void ac_set_nir_options(struct radeon_info *info, bool use_llvm,
nir_shader_compiler_options *options);
bool ac_nir_mem_vectorize_callback(unsigned align_mul, unsigned align_offset, unsigned bit_size,
unsigned num_components, unsigned hole_size,
unsigned num_components, int64_t hole_size,
nir_intrinsic_instr *low, nir_intrinsic_instr *high, void *data);
unsigned ac_get_spi_shader_z_format(bool writes_z, bool writes_stencil, bool writes_samplemask,

View file

@ -2839,10 +2839,10 @@ agx_optimize_loop_nir(nir_shader *nir)
bool
agx_mem_vectorize_cb(unsigned align_mul, unsigned align_offset,
unsigned bit_size, unsigned num_components,
unsigned hole_size, nir_intrinsic_instr *low,
int64_t hole_size, nir_intrinsic_instr *low,
nir_intrinsic_instr *high, void *data)
{
if (hole_size)
if (hole_size > 0)
return false;
/* Must be aligned to the size of the load */

View file

@ -305,7 +305,7 @@ bool agx_nir_lower_cull_distance_fs(struct nir_shader *s,
unsigned nr_distances);
bool agx_mem_vectorize_cb(unsigned align_mul, unsigned align_offset,
unsigned bit_size, unsigned num_components,
unsigned hole_size, nir_intrinsic_instr *low,
int64_t hole_size, nir_intrinsic_instr *low,
nir_intrinsic_instr *high, void *data);
void agx_compile_shader_nir(nir_shader *nir, struct agx_shader_key *key,

View file

@ -2089,12 +2089,12 @@ static bool
mem_vectorize_callback(unsigned align_mul, unsigned align_offset,
unsigned bit_size,
unsigned num_components,
unsigned hole_size,
int64_t hole_size,
nir_intrinsic_instr *low,
nir_intrinsic_instr *high,
void *data)
{
if (hole_size || !nir_num_components_valid(num_components))
if (hole_size > 0 || !nir_num_components_valid(num_components))
return false;
/* TMU general access only supports 32-bit vectors */

View file

@ -6007,7 +6007,7 @@ typedef bool (*nir_should_vectorize_mem_func)(unsigned align_mul,
unsigned num_components,
/* The hole between low and
* high if they are not adjacent. */
unsigned hole_size,
int64_t hole_size,
nir_intrinsic_instr *low,
nir_intrinsic_instr *high,
void *data);

View file

@ -1920,7 +1920,7 @@ should_vectorize(unsigned align_mul,
unsigned align_offset,
unsigned bit_size,
unsigned num_components,
unsigned hole_size,
int64_t hole_size,
nir_intrinsic_instr *low, nir_intrinsic_instr *high,
void *data)
{

View file

@ -669,8 +669,7 @@ new_bitsize_acceptable(struct vectorize_ctx *ctx, unsigned new_bit_size,
unsigned low_size = low->intrin->num_components * get_bit_size(low) / 8;
/* The hole size can be less than 0 if low and high instructions overlap. */
unsigned hole_size =
MAX2(high->offset_signed - (low->offset_signed + low_size), 0);
int64_t hole_size = high->offset_signed - (low->offset_signed + low_size);
if (!ctx->options->callback(low->align_mul,
low->align_offset,

View file

@ -71,7 +71,7 @@ protected:
static bool mem_vectorize_callback(unsigned align_mul, unsigned align_offset,
unsigned bit_size,
unsigned num_components, unsigned hole_size,
unsigned num_components, int64_t hole_size,
nir_intrinsic_instr *low, nir_intrinsic_instr *high,
void *data);
static void shared_type_info(const struct glsl_type *type, unsigned *size, unsigned *align);
@ -83,7 +83,7 @@ protected:
std::map<unsigned, nir_def*> res_map;
unsigned max_components = 4;
bool overfetch = false;
unsigned max_hole_size = 0;
int64_t max_hole_size = 0;
};
std::string
@ -340,7 +340,7 @@ bool nir_load_store_vectorize_test::test_alu_def(
bool nir_load_store_vectorize_test::mem_vectorize_callback(
unsigned align_mul, unsigned align_offset, unsigned bit_size,
unsigned num_components, unsigned hole_size,
unsigned num_components, int64_t hole_size,
nir_intrinsic_instr *low, nir_intrinsic_instr *high,
void *data)
{

View file

@ -163,10 +163,10 @@ ir3_nir_should_scalarize_mem(const nir_instr *instr, const void *data)
static bool
ir3_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
unsigned bit_size, unsigned num_components,
unsigned hole_size, nir_intrinsic_instr *low,
int64_t hole_size, nir_intrinsic_instr *low,
nir_intrinsic_instr *high, void *data)
{
if (hole_size || !nir_num_components_valid(num_components))
if (hole_size > 0 || !nir_num_components_valid(num_components))
return false;
struct ir3_compiler *compiler = data;

View file

@ -3269,11 +3269,11 @@ ntt_should_vectorize_instr(const nir_instr *instr, const void *data)
static bool
ntt_should_vectorize_io(unsigned align, unsigned bit_size,
unsigned num_components, unsigned high_offset,
unsigned hole_size,
int64_t hole_size,
nir_intrinsic_instr *low, nir_intrinsic_instr *high,
void *data)
{
if (bit_size != 32 || hole_size || !nir_num_components_valid(num_components))
if (bit_size != 32 || hole_size > 0 || !nir_num_components_valid(num_components))
return false;
/* Our offset alignment should always be at least 4 bytes */

View file

@ -1465,7 +1465,7 @@ bool
brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
unsigned bit_size,
unsigned num_components,
unsigned hole_size,
int64_t hole_size,
nir_intrinsic_instr *low,
nir_intrinsic_instr *high,
void *data)

View file

@ -234,7 +234,7 @@ enum brw_reg_type brw_type_for_nir_type(const struct intel_device_info *devinfo,
bool brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
unsigned bit_size,
unsigned num_components,
unsigned hole_size,
int64_t hole_size,
nir_intrinsic_instr *low,
nir_intrinsic_instr *high,
void *data);

View file

@ -1129,7 +1129,7 @@ static bool
elk_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
unsigned bit_size,
unsigned num_components,
unsigned hole_size,
int64_t hole_size,
nir_intrinsic_instr *low,
nir_intrinsic_instr *high,
void *data)
@ -1138,7 +1138,7 @@ elk_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
* those back into 32-bit ones anyway and UBO loads aren't split in NIR so
* we don't want to make a mess for the back-end.
*/
if (bit_size > 32 || hole_size || !nir_num_components_valid(num_components))
if (bit_size > 32 || hole_size > 0 || !nir_num_components_valid(num_components))
return false;
if (low->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel ||

View file

@ -6232,11 +6232,11 @@ vectorize_filter(
unsigned align_offset,
unsigned bit_size,
unsigned num_components,
unsigned hole_size,
int64_t hole_size,
nir_intrinsic_instr *low, nir_intrinsic_instr *high,
void *data)
{
return !hole_size && util_is_power_of_two_nonzero(num_components);
return hole_size <= 0 && util_is_power_of_two_nonzero(num_components);
}
struct lower_mem_bit_sizes_data {

View file

@ -138,7 +138,7 @@ private:
unsigned align_offset,
unsigned bit_size,
unsigned num_components,
unsigned hole_size,
int64_t hole_size,
nir_intrinsic_instr *low,
nir_intrinsic_instr *high,
void *cb_data);
@ -1371,12 +1371,12 @@ Converter::memVectorizeCb(unsigned align_mul,
unsigned align_offset,
unsigned bit_size,
unsigned num_components,
unsigned hole_size,
int64_t hole_size,
nir_intrinsic_instr *low,
nir_intrinsic_instr *high,
void *cb_data)
{
if (hole_size)
if (hole_size > 0)
return false;
/*

View file

@ -799,7 +799,7 @@ nak_nir_remove_barriers(nir_shader *nir)
static bool
nak_mem_vectorize_cb(unsigned align_mul, unsigned align_offset,
unsigned bit_size, unsigned num_components,
unsigned hole_size, nir_intrinsic_instr *low,
int64_t hole_size, nir_intrinsic_instr *low,
nir_intrinsic_instr *high, void *cb_data)
{
/*
@ -808,7 +808,7 @@ nak_mem_vectorize_cb(unsigned align_mul, unsigned align_offset,
*/
assert(util_is_power_of_two_nonzero(align_mul));
if (hole_size)
if (hole_size > 0)
return false;
unsigned max_bytes = 128u / 8u;

View file

@ -4809,11 +4809,11 @@ mem_access_size_align_cb(nir_intrinsic_op intrin, uint8_t bytes,
static bool
mem_vectorize_cb(unsigned align_mul, unsigned align_offset, unsigned bit_size,
unsigned num_components, unsigned hole_size,
unsigned num_components, int64_t hole_size,
nir_intrinsic_instr *low, nir_intrinsic_instr *high,
void *data)
{
if (hole_size)
if (hole_size > 0)
return false;
/* Must be aligned to the size of the load */