diff --git a/src/panfrost/compiler/bifrost_compile.c b/src/panfrost/compiler/bifrost_compile.c
index d9e1f292fd0..5512bbea384 100644
--- a/src/panfrost/compiler/bifrost_compile.c
+++ b/src/panfrost/compiler/bifrost_compile.c
@@ -4393,6 +4393,35 @@ bifrost_nir_lower_blend_components(struct nir_builder *b, nir_instr *instr,
    return true;
 }
 
+/* Callback for nir_lower_mem_access_bit_sizes: pick the widest load/store the
+ * hardware can do for a `bytes`-sized access with the given alignment, at most
+ * 4 components wide.
+ */
+static nir_mem_access_size_align
+mem_access_size_align_cb(nir_intrinsic_op intrin, uint8_t bytes, uint32_t align,
+                         uint32_t align_offset, bool offset_is_const,
+                         const void *cb_data)
+{
+   align = nir_combined_align(align, align_offset);
+   assert(util_is_power_of_two_nonzero(align));
+
+   /* If the number of bytes is a multiple of 4, use 32-bit loads. Else if it's
+    * a multiple of 2, use 16-bit loads. Else use 8-bit loads.
+    */
+   unsigned bit_size = (bytes & 1) ? 8 : (bytes & 2) ? 16 : 32;
+
+   /* But if we're only aligned to 1 byte, use 8-bit loads. If we're only
+    * aligned to 2 bytes, use 16-bit loads, unless we needed 8-bit loads due to
+    * the size.
+    */
+   if (align == 1)
+      bit_size = 8;
+   else if (align == 2)
+      bit_size = MIN2(bit_size, 16);
+
+   return (nir_mem_access_size_align){
+      .num_components = MIN2(bytes / (bit_size / 8), 4),
+      .bit_size = bit_size,
+      .align = bit_size / 8,
+   };
+}
+
 static void
 bi_optimize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend)
 {
@@ -4760,6 +4789,13 @@ bifrost_preprocess_nir(nir_shader *nir, unsigned gpu_id)
       NIR_PASS_V(nir, pan_nir_lower_store_component);
    }
 
+   NIR_PASS_V(nir, nir_lower_mem_access_bit_sizes,
+              nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_constant |
+                 nir_var_mem_task_payload | nir_var_shader_temp |
+                 nir_var_function_temp | nir_var_mem_global |
+                 nir_var_mem_shared,
+              mem_access_size_align_cb, NULL);
+
    NIR_PASS_V(nir, nir_lower_ssbo);
    NIR_PASS_V(nir, pan_lower_sample_pos);
    NIR_PASS_V(nir, nir_lower_bit_size, bi_lower_bit_size, NULL);