diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c
index 44008cb8c95..ee37f71bc98 100644
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -691,11 +691,6 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s)
          32 /* bytes */);
    OPT_V(s, ir3_nir_lower_load_constant, so);
 
-   if (!so->binning_pass)
-      OPT_V(s, ir3_nir_analyze_ubo_ranges, so);
-
-   progress |= OPT(s, ir3_nir_lower_ubo_loads, so);
-
    /* Lower large temporaries to scratch, which in Qualcomm terms is private
    * memory, to avoid excess register pressure. This should happen after
    * nir_opt_large_constants, because loading from a UBO is much, much less
@@ -709,6 +704,14 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s)
    /* Lower scratch writemasks */
    progress |= OPT(s, nir_lower_wrmasks, should_split_wrmask, s);
 
+   progress |= OPT(s, ir3_nir_lower_64b_intrinsics);
+   progress |= OPT(s, nir_lower_int64);
+
+   if (!so->binning_pass)
+      OPT_V(s, ir3_nir_analyze_ubo_ranges, so);
+
+   progress |= OPT(s, ir3_nir_lower_ubo_loads, so);
+
    OPT_V(s, nir_lower_amul, ir3_glsl_type_size);
 
    /* UBO offset lowering has to come after we've decided what will
diff --git a/src/freedreno/ir3/ir3_nir.h b/src/freedreno/ir3/ir3_nir.h
index d1049364f65..1f9e9c7cb87 100644
--- a/src/freedreno/ir3/ir3_nir.h
+++ b/src/freedreno/ir3/ir3_nir.h
@@ -54,6 +54,11 @@ void ir3_nir_lower_tess_eval(nir_shader *shader, struct ir3_shader_variant *v,
                              unsigned topology);
 void ir3_nir_lower_gs(nir_shader *shader);
 
+/*
+ * 64b related lowering:
+ */
+bool ir3_nir_lower_64b_intrinsics(nir_shader *shader);
+
 const nir_shader_compiler_options *
 ir3_get_compiler_options(struct ir3_compiler *compiler);
 void ir3_optimize_loop(struct ir3_compiler *compiler, nir_shader *s);
@@ -89,4 +94,38 @@ ir3_bindless_resource(nir_src src)
    return intrin;
 }
 
+static inline bool
+is_intrinsic_store(nir_intrinsic_op op)
+{
+   switch (op) {
+   case nir_intrinsic_store_output:
+   case nir_intrinsic_store_scratch:
+   case nir_intrinsic_store_ssbo:
+   case nir_intrinsic_store_shared:
+   case nir_intrinsic_store_global:
+   case nir_intrinsic_store_global_ir3:
+      return true;
+   default:
+      return false;
+   }
+}
+
+static inline bool
+is_intrinsic_load(nir_intrinsic_op op)
+{
+   switch (op) {
+   case nir_intrinsic_load_input:
+   case nir_intrinsic_load_scratch:
+   case nir_intrinsic_load_uniform:
+   case nir_intrinsic_load_ssbo:
+   case nir_intrinsic_load_ubo:
+   case nir_intrinsic_load_shared:
+   case nir_intrinsic_load_global:
+   case nir_intrinsic_load_global_ir3:
+      return true;
+   default:
+      return false;
+   }
+}
+
 #endif /* IR3_NIR_H_ */
diff --git a/src/freedreno/ir3/ir3_nir_lower_64b.c b/src/freedreno/ir3/ir3_nir_lower_64b.c
new file mode 100644
index 00000000000..857f5835269
--- /dev/null
+++ b/src/freedreno/ir3/ir3_nir_lower_64b.c
@@ -0,0 +1,175 @@
+/*
+ * Copyright © 2021 Google, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ir3_nir.h"
+
+/*
+ * Lowering for 64b intrinsics generated with OpenCL or with
+ * VK_KHR_buffer_device_address. All our intrinsics from a hw
+ * standpoint are 32b, so we just need to combine in zero for
+ * the upper 32bits and let the other nir passes clean up the mess.
+ */
+
+static bool
+lower_64b_intrinsics_filter(const nir_instr *instr, const void *unused)
+{
+   (void)unused;
+
+   if (instr->type != nir_instr_type_intrinsic)
+      return false;
+
+   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+   if (intr->intrinsic == nir_intrinsic_load_deref ||
+       intr->intrinsic == nir_intrinsic_store_deref)
+      return false;
+
+   if (is_intrinsic_store(intr->intrinsic))
+      return nir_src_bit_size(intr->src[0]) == 64;
+
+   if (nir_intrinsic_dest_components(intr) == 0)
+      return false;
+
+   return nir_dest_bit_size(intr->dest) == 64;
+}
+
+static nir_ssa_def *
+lower_64b_intrinsics(nir_builder *b, nir_instr *instr, void *unused)
+{
+   (void)unused;
+
+   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+   /* We could be *slightly* more clever and, for ex, turn a 64b vec4
+    * load into two 32b vec4 loads, rather than 4 32b vec2 loads.
+    */
+
+   if (is_intrinsic_store(intr->intrinsic)) {
+      unsigned offset_src_idx;
+      switch (intr->intrinsic) {
+      case nir_intrinsic_store_ssbo:
+      case nir_intrinsic_store_global_ir3:
+         offset_src_idx = 2;
+         break;
+      default:
+         offset_src_idx = 1;
+      }
+
+      unsigned num_comp = nir_intrinsic_src_components(intr, 0);
+      unsigned wrmask = nir_intrinsic_has_write_mask(intr) ?
+         nir_intrinsic_write_mask(intr) : BITSET_MASK(num_comp);
+      nir_ssa_def *val = nir_ssa_for_src(b, intr->src[0], num_comp);
+      nir_ssa_def *off = nir_ssa_for_src(b, intr->src[offset_src_idx], 1);
+
+      for (unsigned i = 0; i < num_comp; i++) {
+         if (!(wrmask & BITFIELD_BIT(i)))
+            continue;
+
+         nir_ssa_def *c64 = nir_channel(b, val, i);
+         nir_ssa_def *c32 = nir_unpack_64_2x32(b, c64);
+
+         nir_intrinsic_instr *store =
+            nir_instr_as_intrinsic(nir_instr_clone(b->shader, &intr->instr));
+         store->num_components = 2;
+         store->src[0] = nir_src_for_ssa(c32);
+         store->src[offset_src_idx] = nir_src_for_ssa(off);
+
+         if (nir_intrinsic_has_write_mask(intr))
+            nir_intrinsic_set_write_mask(store, 0x3);
+         nir_builder_instr_insert(b, &store->instr);
+
+         off = nir_iadd(b, off, nir_imm_intN_t(b, 8, off->bit_size));
+      }
+
+      return NIR_LOWER_INSTR_PROGRESS_REPLACE;
+   }
+
+   unsigned num_comp = nir_intrinsic_dest_components(intr);
+
+   nir_ssa_def *def = &intr->dest.ssa;
+   def->bit_size = 32;
+
+   /* load_kernel_input is handled specially, lowering to two 32b inputs:
+    */
+   if (intr->intrinsic == nir_intrinsic_load_kernel_input) {
+      assert(num_comp == 1);
+
+      nir_ssa_def *offset = nir_iadd(b,
+            nir_ssa_for_src(b, intr->src[0], 1),
+            nir_imm_int(b, 4));
+
+      nir_ssa_def *upper = nir_build_load_kernel_input(
+            b, 1, 32, offset);
+
+      return nir_pack_64_2x32_split(b, def, upper);
+   }
+
+   nir_ssa_def *components[num_comp];
+
+   if (is_intrinsic_load(intr->intrinsic)) {
+      unsigned offset_src_idx;
+      switch (intr->intrinsic) {
+      case nir_intrinsic_load_ssbo:
+      case nir_intrinsic_load_ubo:
+      case nir_intrinsic_load_global_ir3:
+         offset_src_idx = 1;
+         break;
+      default:
+         offset_src_idx = 0;
+      }
+
+      nir_ssa_def *off = nir_ssa_for_src(b, intr->src[offset_src_idx], 1);
+
+      for (unsigned i = 0; i < num_comp; i++) {
+         nir_intrinsic_instr *load =
+            nir_instr_as_intrinsic(nir_instr_clone(b->shader, &intr->instr));
+         load->num_components = 2;
+         load->src[offset_src_idx] = nir_src_for_ssa(off);
+
+         nir_ssa_dest_init(&load->instr, &load->dest, 2, 32, NULL);
+         nir_builder_instr_insert(b, &load->instr);
+
+         components[i] = nir_pack_64_2x32(b, &load->dest.ssa);
+
+         off = nir_iadd(b, off, nir_imm_intN_t(b, 8, off->bit_size));
+      }
+   } else {
+      /* The remaining (non load/store) intrinsics just get zero-
+       * extended from 32b to 64b:
+       */
+      for (unsigned i = 0; i < num_comp; i++) {
+         nir_ssa_def *c = nir_channel(b, def, i);
+         components[i] = nir_pack_64_2x32_split(b, c, nir_imm_zero(b, 1, 32));
+      }
+   }
+
+   return nir_build_alu_src_arr(b, nir_op_vec(num_comp), components);
+}
+
+bool
+ir3_nir_lower_64b_intrinsics(nir_shader *shader)
+{
+   return nir_shader_lower_instructions(
+         shader, lower_64b_intrinsics_filter,
+         lower_64b_intrinsics, NULL);
+}
diff --git a/src/freedreno/ir3/meson.build b/src/freedreno/ir3/meson.build
index 40501bbde2c..127a6a5abfa 100644
--- a/src/freedreno/ir3/meson.build
+++ b/src/freedreno/ir3/meson.build
@@ -94,6 +94,7 @@ libfreedreno_ir3_files = files(
   'ir3_nir.c',
   'ir3_nir.h',
   'ir3_nir_analyze_ubo_ranges.c',
+  'ir3_nir_lower_64b.c',
   'ir3_nir_lower_load_barycentric_at_sample.c',
   'ir3_nir_lower_load_barycentric_at_offset.c',
   'ir3_nir_lower_io_offsets.c',
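---

Not part of the patch, just an illustrative sketch of what the pass does
(NIR-style notation, SSA names made up). A 64b load becomes one 32b vec2
load per 64b component, with the two halves packed back together and the
byte offset advanced by 8 between components:

   /* before: */
   vec1 64 ssa_3 = intrinsic load_ssbo (ssa_0, ssa_1) (...)

   /* after: */
   vec2 32 ssa_4 = intrinsic load_ssbo (ssa_0, ssa_1) (...)
   vec1 64 ssa_5 = pack_64_2x32 ssa_4

Stores go the other way: each written 64b component is split with
unpack_64_2x32 and emitted as a 32b vec2 store (write mask 0x3).
load_kernel_input is special-cased into two 32b inputs at offset and
offset+4, and the remaining 64b destinations are zero-extended with
pack_64_2x32_split, leaving the other nir passes to clean up the mess.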