diff --git a/src/gallium/drivers/iris/iris_program_cache.c b/src/gallium/drivers/iris/iris_program_cache.c index 5839c5eb908..2ed56936665 100644 --- a/src/gallium/drivers/iris/iris_program_cache.c +++ b/src/gallium/drivers/iris/iris_program_cache.c @@ -325,11 +325,9 @@ iris_destroy_program_cache(struct iris_context *ice) static void link_libintel_shaders(nir_shader *nir, - const struct intel_device_info *devinfo, const uint32_t *spv_code, uint32_t spv_size) { - nir_shader *libintel = brw_nir_from_spirv(nir, devinfo->ver, - spv_code, spv_size, true); + nir_shader *libintel = brw_nir_from_spirv(nir, spv_code, spv_size); nir_link_shader_functions(nir, libintel); NIR_PASS_V(nir, nir_inline_functions); @@ -342,6 +340,7 @@ link_libintel_shaders(nir_shader *nir, nir_var_shader_temp | nir_var_function_temp | nir_var_mem_shared | nir_var_mem_global, nir_address_format_62bit_generic); + NIR_PASS_V(nir, nir_lower_scratch_to_var); } void @@ -378,7 +377,7 @@ iris_ensure_indirect_generation_shader(struct iris_batch *batch) nir_shader *nir = b.shader; - link_libintel_shaders(nir, screen->devinfo, spv_code, spv_size); + link_libintel_shaders(nir, spv_code, spv_size); NIR_PASS_V(nir, nir_lower_vars_to_ssa); NIR_PASS_V(nir, nir_opt_cse); diff --git a/src/intel/compiler/brw_kernel.h b/src/intel/compiler/brw_kernel.h index 2c2efd81a04..95e5972c716 100644 --- a/src/intel/compiler/brw_kernel.h +++ b/src/intel/compiler/brw_kernel.h @@ -66,10 +66,6 @@ brw_kernel_from_spirv(struct brw_compiler *compiler, const char *entrypoint_name, char **error_str); -nir_shader * -brw_nir_from_spirv(void *mem_ctx, unsigned gfx_version, - const uint32_t *spirv, size_t spirv_size, bool llvm17_wa); - #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/src/intel/compiler/brw_nir.h b/src/intel/compiler/brw_nir.h index 5f193803ee8..b9caca2f31b 100644 --- a/src/intel/compiler/brw_nir.h +++ b/src/intel/compiler/brw_nir.h @@ -294,8 +294,7 @@ brw_nir_no_indirect_mask(const struct brw_compiler *compiler, bool 
brw_nir_uses_inline_data(nir_shader *shader); nir_shader * -brw_nir_from_spirv(void *mem_ctx, unsigned gfx_version, const uint32_t *spirv, - size_t spirv_size, bool llvm17_wa); +brw_nir_from_spirv(void *mem_ctx, const uint32_t *spirv, size_t spirv_size); #ifdef __cplusplus } diff --git a/src/intel/compiler/brw_spirv.c b/src/intel/compiler/brw_spirv.c index 566bb8b9af3..b44796acca2 100644 --- a/src/intel/compiler/brw_spirv.c +++ b/src/intel/compiler/brw_spirv.c @@ -13,167 +13,70 @@ #include "dev/intel_debug.h" #include "util/u_dynarray.h" -static nir_def * -rebuild_value_from_store(struct util_dynarray *stores, - nir_def *value, unsigned read_offset) -{ - unsigned read_size = value->num_components * value->bit_size / 8; - - util_dynarray_foreach(stores, nir_intrinsic_instr *, _store) { - nir_intrinsic_instr *store = *_store; - - unsigned write_offset = nir_src_as_uint(store->src[1]); - unsigned write_size = nir_src_num_components(store->src[0]) * - nir_src_bit_size(store->src[0]) / 8; - if (write_offset <= read_offset && - (write_offset + write_size) >= (read_offset + read_size)) { - assert(nir_block_dominates(store->instr.block, value->parent_instr->block)); - assert(write_size == read_size); - return store->src[0].ssa; - } - } - unreachable("Matching scratch store not found"); -} - -/** - * Remove temporary variables stored to scratch to be then reloaded - * immediately. Remap the load to the store SSA value. - * - * This workaround is only meant to be applied to shaders in src/intel/shaders - * were we know there should be no issue. More complex cases might not work - * with this approach. 
- */ -static bool -nir_remove_llvm17_scratch(nir_shader *nir) -{ - struct util_dynarray scratch_stores; - void *mem_ctx = ralloc_context(NULL); - - util_dynarray_init(&scratch_stores, mem_ctx); - - nir_foreach_function_impl(func, nir) { - nir_foreach_block(block, func) { - nir_foreach_instr(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); - - if (intrin->intrinsic != nir_intrinsic_store_scratch) - continue; - - nir_const_value *offset = nir_src_as_const_value(intrin->src[1]); - if (offset != NULL) { - util_dynarray_append(&scratch_stores, nir_intrinsic_instr *, intrin); - } - } - } - } - - bool progress = false; - if (util_dynarray_num_elements(&scratch_stores, nir_intrinsic_instr *) > 0) { - nir_foreach_function_impl(func, nir) { - nir_foreach_block(block, func) { - nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); - - if (intrin->intrinsic != nir_intrinsic_load_scratch) - continue; - - nir_const_value *offset = nir_src_as_const_value(intrin->src[0]); - if (offset == NULL) - continue; - - nir_def_replace(&intrin->def, - rebuild_value_from_store(&scratch_stores, &intrin->def, nir_src_as_uint(intrin->src[0]))); - - progress = true; - } - } - } - } - - util_dynarray_foreach(&scratch_stores, nir_intrinsic_instr *, _store) { - nir_intrinsic_instr *store = *_store; - nir_instr_remove(&store->instr); - } - - /* Quick sanity check */ - assert(util_dynarray_num_elements(&scratch_stores, nir_intrinsic_instr *) == 0 || - progress); - - ralloc_free(mem_ctx); - - return progress; -} - static void -cleanup_llvm17_scratch(nir_shader *nir) +optimize(nir_shader *nir) { - { - bool progress; - do { - progress = false; - NIR_PASS(progress, nir, nir_copy_prop); - NIR_PASS(progress, nir, nir_opt_dce); - NIR_PASS(progress, nir, nir_opt_constant_folding); - NIR_PASS(progress, nir, 
nir_opt_cse); - NIR_PASS(progress, nir, nir_opt_algebraic); - } while (progress); - } + bool progress; + do { + progress = false; - nir_remove_llvm17_scratch(nir); + NIR_PASS(progress, nir, nir_split_var_copies); + NIR_PASS(progress, nir, nir_split_struct_vars, nir_var_function_temp); + NIR_PASS(progress, nir, nir_lower_var_copies); + NIR_PASS(progress, nir, nir_lower_vars_to_ssa); - { - bool progress; - do { - progress = false; - NIR_PASS(progress, nir, nir_copy_prop); - NIR_PASS(progress, nir, nir_opt_dce); - NIR_PASS(progress, nir, nir_opt_constant_folding); - NIR_PASS(progress, nir, nir_opt_cse); - NIR_PASS(progress, nir, nir_opt_algebraic); - } while (progress); - } + NIR_PASS(progress, nir, nir_copy_prop); + NIR_PASS(progress, nir, nir_opt_remove_phis); + NIR_PASS(progress, nir, nir_lower_phis_to_scalar, true); + NIR_PASS(progress, nir, nir_opt_dce); + NIR_PASS(progress, nir, nir_opt_dead_cf); + NIR_PASS(progress, nir, nir_opt_cse); + NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true); + NIR_PASS(progress, nir, nir_opt_phi_precision); + NIR_PASS(progress, nir, nir_opt_algebraic); + NIR_PASS(progress, nir, nir_opt_constant_folding); + + NIR_PASS(progress, nir, nir_opt_deref); + NIR_PASS(progress, nir, nir_opt_copy_prop_vars); + NIR_PASS(progress, nir, nir_opt_undef); + NIR_PASS(progress, nir, nir_lower_undef_to_zero); + + NIR_PASS(progress, nir, nir_opt_shrink_vectors, true); + NIR_PASS(progress, nir, nir_opt_loop_unroll); + + } while (progress); } -static const struct spirv_capabilities spirv_caps = { - .Addresses = true, - .Float16 = true, - .Float64 = true, - .Groups = true, - .StorageImageWriteWithoutFormat = true, - .Int8 = true, - .Int16 = true, - .Int64 = true, - .Int64Atomics = true, - .Kernel = true, - .Linkage = true, /* We receive linked kernel from clc */ - .DenormFlushToZero = true, - .DenormPreserve = true, - .SignedZeroInfNanPreserve = true, - .RoundingModeRTE = true, - .RoundingModeRTZ = true, - .GenericPointer = true, - 
.GroupNonUniform = true, - .GroupNonUniformArithmetic = true, - .GroupNonUniformClustered = true, - .GroupNonUniformBallot = true, - .GroupNonUniformQuad = true, - .GroupNonUniformShuffle = true, - .GroupNonUniformVote = true, - .SubgroupDispatch = true, -}; - nir_shader * -brw_nir_from_spirv(void *mem_ctx, unsigned gfx_version, const uint32_t *spirv, - size_t spirv_size, bool llvm17_wa) +brw_nir_from_spirv(void *mem_ctx, const uint32_t *spirv, size_t spirv_size) { - assert(gfx_version >= 9); - + static const struct spirv_capabilities spirv_caps = { + .Addresses = true, + .Float16 = true, + .Float64 = true, + .Groups = true, + .StorageImageWriteWithoutFormat = true, + .Int8 = true, + .Int16 = true, + .Int64 = true, + .Int64Atomics = true, + .Kernel = true, + .Linkage = true, /* We receive linked kernel from clc */ + .DenormFlushToZero = true, + .DenormPreserve = true, + .SignedZeroInfNanPreserve = true, + .RoundingModeRTE = true, + .RoundingModeRTZ = true, + .GenericPointer = true, + .GroupNonUniform = true, + .GroupNonUniformArithmetic = true, + .GroupNonUniformClustered = true, + .GroupNonUniformBallot = true, + .GroupNonUniformQuad = true, + .GroupNonUniformShuffle = true, + .GroupNonUniformVote = true, + .SubgroupDispatch = true, + }; struct spirv_to_nir_options spirv_options = { .environment = NIR_SPIRV_OPENCL, .capabilities = &spirv_caps, @@ -197,163 +100,79 @@ brw_nir_from_spirv(void *mem_ctx, unsigned gfx_version, const uint32_t *spirv, ralloc_steal(mem_ctx, nir); nir->info.name = ralloc_strdup(nir, "library"); - if (INTEL_DEBUG(DEBUG_CS)) { - /* Re-index SSA defs so we print more sensible numbers. 
*/ - nir_foreach_function_impl(impl, nir) { - nir_index_ssa_defs(impl); - } + nir_fixup_is_exported(nir); - fprintf(stderr, "NIR (from SPIR-V) for kernel\n"); - nir_print_shader(nir, stderr); - } + NIR_PASS(_, nir, nir_lower_system_values); + NIR_PASS(_, nir, nir_lower_calls_to_builtins); - nir_lower_printf_options printf_opts = { - .ptr_bit_size = 64, - .use_printf_base_identifier = true, - }; - NIR_PASS_V(nir, nir_lower_printf, &printf_opts); + NIR_PASS_V(nir, nir_lower_printf, &(const struct nir_lower_printf_options) { + .ptr_bit_size = 64, + .use_printf_base_identifier = true, + }); - NIR_PASS_V(nir, nir_link_shader_functions, spirv_options.clc_shader); + NIR_PASS(_, nir, nir_lower_variable_initializers, nir_var_function_temp); + NIR_PASS(_, nir, nir_lower_returns); + NIR_PASS(_, nir, nir_inline_functions); + //nir_remove_non_exported(nir); + NIR_PASS(_, nir, nir_copy_prop); + NIR_PASS(_, nir, nir_opt_deref); - /* We have to lower away local constant initializers right before we - * inline functions. That way they get properly initialized at the top - * of the function and not at the top of its caller. + /* We can't deal with constant data, get rid of it */ + nir_lower_constant_to_temp(nir); + + /* We can go ahead and lower the rest of the constant initializers. We do + * this here so that nir_remove_dead_variables and split_per_member_structs + * below see the corresponding stores. 
*/ - NIR_PASS_V(nir, nir_lower_variable_initializers, ~(nir_var_shader_temp | - nir_var_function_temp)); - NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_uniform | nir_var_mem_ubo | - nir_var_mem_constant | nir_var_function_temp | nir_var_image, NULL); - { - bool progress; - do - { - progress = false; - NIR_PASS(progress, nir, nir_copy_prop); - NIR_PASS(progress, nir, nir_opt_copy_prop_vars); - NIR_PASS(progress, nir, nir_opt_deref); - NIR_PASS(progress, nir, nir_opt_dce); - NIR_PASS(progress, nir, nir_opt_undef); - NIR_PASS(progress, nir, nir_opt_constant_folding); - NIR_PASS(progress, nir, nir_opt_cse); - NIR_PASS(progress, nir, nir_lower_vars_to_ssa); - NIR_PASS(progress, nir, nir_opt_algebraic); - } while (progress); - } + NIR_PASS(_, nir, nir_lower_variable_initializers, ~0); - NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp); - NIR_PASS_V(nir, nir_lower_returns); - NIR_PASS_V(nir, nir_inline_functions); - - assert(nir->scratch_size == 0); - NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, nir_var_function_temp, glsl_get_cl_type_size_align); - - { - bool progress; - do - { - progress = false; - NIR_PASS(progress, nir, nir_copy_prop); - NIR_PASS(progress, nir, nir_opt_copy_prop_vars); - NIR_PASS(progress, nir, nir_opt_deref); - NIR_PASS(progress, nir, nir_opt_dce); - NIR_PASS(progress, nir, nir_opt_undef); - NIR_PASS(progress, nir, nir_opt_constant_folding); - NIR_PASS(progress, nir, nir_opt_cse); - NIR_PASS(progress, nir, nir_split_var_copies); - NIR_PASS(progress, nir, nir_lower_var_copies); - NIR_PASS(progress, nir, nir_lower_vars_to_ssa); - NIR_PASS(progress, nir, nir_opt_algebraic); - NIR_PASS(progress, nir, nir_opt_if, nir_opt_if_optimize_phi_true_false); - NIR_PASS(progress, nir, nir_opt_dead_cf); - NIR_PASS(progress, nir, nir_opt_remove_phis); - NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true); - NIR_PASS(progress, nir, nir_lower_vec3_to_vec4, nir_var_mem_generic | nir_var_uniform); - NIR_PASS(progress, nir, 
nir_opt_memcpy); - } while (progress); - } - - NIR_PASS_V(nir, nir_scale_fdiv); - - NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_uniform | nir_var_mem_ubo | - nir_var_mem_constant | nir_var_function_temp | nir_var_image, NULL); - - - NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_mem_shared | nir_var_function_temp, NULL); - - nir->scratch_size = 0; - NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, - nir_var_mem_shared | nir_var_function_temp | nir_var_shader_temp | - nir_var_mem_global | nir_var_mem_constant, - glsl_get_cl_type_size_align); - - // Lower memcpy - needs to wait until types are sized - { - bool progress; - do { - progress = false; - NIR_PASS(progress, nir, nir_opt_memcpy); - NIR_PASS(progress, nir, nir_copy_prop); - NIR_PASS(progress, nir, nir_opt_copy_prop_vars); - NIR_PASS(progress, nir, nir_opt_deref); - NIR_PASS(progress, nir, nir_opt_dce); - NIR_PASS(progress, nir, nir_split_var_copies); - NIR_PASS(progress, nir, nir_lower_var_copies); - NIR_PASS(progress, nir, nir_lower_vars_to_ssa); - NIR_PASS(progress, nir, nir_opt_constant_folding); - NIR_PASS(progress, nir, nir_opt_cse); - } while (progress); - } - NIR_PASS_V(nir, nir_lower_memcpy); - - NIR_PASS_V(nir, nir_lower_explicit_io, - nir_var_mem_shared | nir_var_function_temp | nir_var_shader_temp | nir_var_uniform, - nir_address_format_32bit_offset_as_64bit); - - NIR_PASS_V(nir, nir_lower_system_values); - - /* Hopefully we can drop this once lower_vars_to_ssa has improved to not - * lower everything to scratch. + /* LLVM loves to take advantage of the fact that vec3s in OpenCL are 16B + * aligned and so it can just read/write them as vec4s. This results in a + * LOT of vec4->vec3 casts on loads and stores. One solution to this + * problem is to get rid of all vec3 variables.
*/ - if (llvm17_wa) - cleanup_llvm17_scratch(nir); + NIR_PASS(_, nir, nir_lower_vec3_to_vec4, + nir_var_shader_temp | nir_var_function_temp | nir_var_mem_shared | + nir_var_mem_global | nir_var_mem_constant); + + /* We assign explicit types early so that the optimizer can take advantage + * of that information and hopefully get rid of some of our memcpys. + */ + NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, + nir_var_uniform | nir_var_shader_temp | nir_var_function_temp | + nir_var_mem_shared | nir_var_mem_global, + glsl_get_cl_type_size_align); + + optimize(nir); + + NIR_PASS(_, nir, nir_remove_dead_variables, nir_var_all, NULL); /* Lower again, this time after dead-variables to get more compact variable * layouts. */ - nir->global_mem_size = 0; - nir->scratch_size = 0; - nir->info.shared_size = 0; - NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, - nir_var_mem_shared | nir_var_mem_global | nir_var_mem_constant, - glsl_get_cl_type_size_align); - if (nir->constant_data_size > 0) { - assert(nir->constant_data == NULL); - nir->constant_data = rzalloc_size(nir, nir->constant_data_size); - nir_gather_explicit_io_initializers(nir, nir->constant_data, - nir->constant_data_size, - nir_var_mem_constant); - } + NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, + nir_var_shader_temp | nir_var_function_temp | nir_var_mem_shared | + nir_var_mem_global | nir_var_mem_constant, + glsl_get_cl_type_size_align); + assert(nir->constant_data_size == 0); - NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_constant, - nir_address_format_64bit_global); + NIR_PASS(_, nir, nir_lower_memcpy); - NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_uniform, - nir_address_format_32bit_offset_as_64bit); + NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_constant, + nir_address_format_64bit_global); - NIR_PASS_V(nir, nir_lower_explicit_io, - nir_var_shader_temp | nir_var_function_temp | - nir_var_mem_shared | nir_var_mem_global, - nir_address_format_62bit_generic); + NIR_PASS(_, nir, 
nir_lower_explicit_io, nir_var_uniform, + nir_address_format_64bit_global); - if (INTEL_DEBUG(DEBUG_CS)) { - /* Re-index SSA defs so we print more sensible numbers. */ - nir_foreach_function_impl(impl, nir) { - nir_index_ssa_defs(impl); - } + /* Note: we cannot lower explicit I/O here, because we need derefs intact + * for function calls into the library to work. + */ - fprintf(stderr, "NIR (before I/O lowering) for kernel\n"); - nir_print_shader(nir, stderr); - } + NIR_PASS(_, nir, nir_lower_convert_alu_types, NULL); + NIR_PASS(_, nir, nir_opt_if, 0); + NIR_PASS(_, nir, nir_opt_idiv_const, 16); + + optimize(nir); return nir; } diff --git a/src/intel/compiler/elk/elk_nir.h b/src/intel/compiler/elk/elk_nir.h index 48c93eecacc..934006f02f8 100644 --- a/src/intel/compiler/elk/elk_nir.h +++ b/src/intel/compiler/elk/elk_nir.h @@ -275,8 +275,7 @@ const struct glsl_type *elk_nir_get_var_type(const struct nir_shader *nir, void elk_nir_adjust_payload(nir_shader *shader); nir_shader * -elk_nir_from_spirv(void *mem_ctx, unsigned gfx_version, const uint32_t *spirv, - size_t spirv_size, bool llvm17_wa); +elk_nir_from_spirv(void *mem_ctx, const uint32_t *spirv, size_t spirv_size); #ifdef __cplusplus } diff --git a/src/intel/compiler/elk/elk_spirv.c b/src/intel/compiler/elk/elk_spirv.c index 745b824a992..a4f73f60ddc 100644 --- a/src/intel/compiler/elk/elk_spirv.c +++ b/src/intel/compiler/elk/elk_spirv.c @@ -13,167 +13,70 @@ #include "dev/intel_debug.h" #include "util/u_dynarray.h" -static nir_def * -rebuild_value_from_store(struct util_dynarray *stores, - nir_def *value, unsigned read_offset) -{ - unsigned read_size = value->num_components * value->bit_size / 8; - - util_dynarray_foreach(stores, nir_intrinsic_instr *, _store) { - nir_intrinsic_instr *store = *_store; - - unsigned write_offset = nir_src_as_uint(store->src[1]); - unsigned write_size = nir_src_num_components(store->src[0]) * - nir_src_bit_size(store->src[0]) / 8; - if (write_offset <= read_offset && - (write_offset
+ write_size) >= (read_offset + read_size)) { - assert(nir_block_dominates(store->instr.block, value->parent_instr->block)); - assert(write_size == read_size); - return store->src[0].ssa; - } - } - unreachable("Matching scratch store not found"); -} - -/** - * Remove temporary variables stored to scratch to be then reloaded - * immediately. Remap the load to the store SSA value. - * - * This workaround is only meant to be applied to shaders in src/intel/shaders - * were we know there should be no issue. More complex cases might not work - * with this approach. - */ -static bool -nir_remove_llvm17_scratch(nir_shader *nir) -{ - struct util_dynarray scratch_stores; - void *mem_ctx = ralloc_context(NULL); - - util_dynarray_init(&scratch_stores, mem_ctx); - - nir_foreach_function_impl(func, nir) { - nir_foreach_block(block, func) { - nir_foreach_instr(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); - - if (intrin->intrinsic != nir_intrinsic_store_scratch) - continue; - - nir_const_value *offset = nir_src_as_const_value(intrin->src[1]); - if (offset != NULL) { - util_dynarray_append(&scratch_stores, nir_intrinsic_instr *, intrin); - } - } - } - } - - bool progress = false; - if (util_dynarray_num_elements(&scratch_stores, nir_intrinsic_instr *) > 0) { - nir_foreach_function_impl(func, nir) { - nir_foreach_block(block, func) { - nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); - - if (intrin->intrinsic != nir_intrinsic_load_scratch) - continue; - - nir_const_value *offset = nir_src_as_const_value(intrin->src[0]); - if (offset == NULL) - continue; - - nir_def_replace(&intrin->def, - rebuild_value_from_store(&scratch_stores, &intrin->def, nir_src_as_uint(intrin->src[0]))); - - progress = true; - } - } - } - } - - util_dynarray_foreach(&scratch_stores, nir_intrinsic_instr 
*, _store) { - nir_intrinsic_instr *store = *_store; - nir_instr_remove(&store->instr); - } - - /* Quick sanity check */ - assert(util_dynarray_num_elements(&scratch_stores, nir_intrinsic_instr *) == 0 || - progress); - - ralloc_free(mem_ctx); - - return progress; -} - static void -cleanup_llvm17_scratch(nir_shader *nir) +optimize(nir_shader *nir) { - { - bool progress; - do { - progress = false; - NIR_PASS(progress, nir, nir_copy_prop); - NIR_PASS(progress, nir, nir_opt_dce); - NIR_PASS(progress, nir, nir_opt_constant_folding); - NIR_PASS(progress, nir, nir_opt_cse); - NIR_PASS(progress, nir, nir_opt_algebraic); - } while (progress); - } + bool progress; + do { + progress = false; - nir_remove_llvm17_scratch(nir); + NIR_PASS(progress, nir, nir_split_var_copies); + NIR_PASS(progress, nir, nir_split_struct_vars, nir_var_function_temp); + NIR_PASS(progress, nir, nir_lower_var_copies); + NIR_PASS(progress, nir, nir_lower_vars_to_ssa); - { - bool progress; - do { - progress = false; - NIR_PASS(progress, nir, nir_copy_prop); - NIR_PASS(progress, nir, nir_opt_dce); - NIR_PASS(progress, nir, nir_opt_constant_folding); - NIR_PASS(progress, nir, nir_opt_cse); - NIR_PASS(progress, nir, nir_opt_algebraic); - } while (progress); - } + NIR_PASS(progress, nir, nir_copy_prop); + NIR_PASS(progress, nir, nir_opt_remove_phis); + NIR_PASS(progress, nir, nir_lower_phis_to_scalar, true); + NIR_PASS(progress, nir, nir_opt_dce); + NIR_PASS(progress, nir, nir_opt_dead_cf); + NIR_PASS(progress, nir, nir_opt_cse); + NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true); + NIR_PASS(progress, nir, nir_opt_phi_precision); + NIR_PASS(progress, nir, nir_opt_algebraic); + NIR_PASS(progress, nir, nir_opt_constant_folding); + + NIR_PASS(progress, nir, nir_opt_deref); + NIR_PASS(progress, nir, nir_opt_copy_prop_vars); + NIR_PASS(progress, nir, nir_opt_undef); + NIR_PASS(progress, nir, nir_lower_undef_to_zero); + + NIR_PASS(progress, nir, nir_opt_shrink_vectors, true); + 
NIR_PASS(progress, nir, nir_opt_loop_unroll); + + } while (progress); } -static const struct spirv_capabilities spirv_caps = { - .Addresses = true, - .Float16 = true, - .Float64 = true, - .Groups = true, - .StorageImageWriteWithoutFormat = true, - .Int8 = true, - .Int16 = true, - .Int64 = true, - .Int64Atomics = true, - .Kernel = true, - .Linkage = true, /* We receive linked kernel from clc */ - .DenormFlushToZero = true, - .DenormPreserve = true, - .SignedZeroInfNanPreserve = true, - .RoundingModeRTE = true, - .RoundingModeRTZ = true, - .GenericPointer = true, - .GroupNonUniform = true, - .GroupNonUniformArithmetic = true, - .GroupNonUniformClustered = true, - .GroupNonUniformBallot = true, - .GroupNonUniformQuad = true, - .GroupNonUniformShuffle = true, - .GroupNonUniformVote = true, - .SubgroupDispatch = true, -}; - nir_shader * -elk_nir_from_spirv(void *mem_ctx, unsigned gfx_version, const uint32_t *spirv, - size_t spirv_size, bool llvm17_wa) +elk_nir_from_spirv(void *mem_ctx, const uint32_t *spirv, size_t spirv_size) { - assert(gfx_version < 9); - + static const struct spirv_capabilities spirv_caps = { + .Addresses = true, + .Float16 = true, + .Float64 = true, + .Groups = true, + .StorageImageWriteWithoutFormat = true, + .Int8 = true, + .Int16 = true, + .Int64 = true, + .Int64Atomics = true, + .Kernel = true, + .Linkage = true, /* We receive linked kernel from clc */ + .DenormFlushToZero = true, + .DenormPreserve = true, + .SignedZeroInfNanPreserve = true, + .RoundingModeRTE = true, + .RoundingModeRTZ = true, + .GenericPointer = true, + .GroupNonUniform = true, + .GroupNonUniformArithmetic = true, + .GroupNonUniformClustered = true, + .GroupNonUniformBallot = true, + .GroupNonUniformQuad = true, + .GroupNonUniformShuffle = true, + .GroupNonUniformVote = true, + .SubgroupDispatch = true, + }; struct spirv_to_nir_options spirv_options = { .environment = NIR_SPIRV_OPENCL, .capabilities = &spirv_caps, @@ -197,163 +100,79 @@ elk_nir_from_spirv(void *mem_ctx, 
unsigned gfx_version, const uint32_t *spirv, ralloc_steal(mem_ctx, nir); nir->info.name = ralloc_strdup(nir, "library"); - if (INTEL_DEBUG(DEBUG_CS)) { - /* Re-index SSA defs so we print more sensible numbers. */ - nir_foreach_function_impl(impl, nir) { - nir_index_ssa_defs(impl); - } + nir_fixup_is_exported(nir); - fprintf(stderr, "NIR (from SPIR-V) for kernel\n"); - nir_print_shader(nir, stderr); - } + NIR_PASS(_, nir, nir_lower_system_values); + NIR_PASS(_, nir, nir_lower_calls_to_builtins); - nir_lower_printf_options printf_opts = { - .ptr_bit_size = 64, - .use_printf_base_identifier = true, - }; - NIR_PASS_V(nir, nir_lower_printf, &printf_opts); + NIR_PASS_V(nir, nir_lower_printf, &(const struct nir_lower_printf_options) { + .ptr_bit_size = 64, + .use_printf_base_identifier = true, + }); - NIR_PASS_V(nir, nir_link_shader_functions, spirv_options.clc_shader); + NIR_PASS(_, nir, nir_lower_variable_initializers, nir_var_function_temp); + NIR_PASS(_, nir, nir_lower_returns); + NIR_PASS(_, nir, nir_inline_functions); + //nir_remove_non_exported(nir); + NIR_PASS(_, nir, nir_copy_prop); + NIR_PASS(_, nir, nir_opt_deref); - /* We have to lower away local constant initializers right before we - * inline functions. That way they get properly initialized at the top - * of the function and not at the top of its caller. + /* We can't deal with constant data, get rid of it */ + nir_lower_constant_to_temp(nir); + + /* We can go ahead and lower the rest of the constant initializers. We do + * this here so that nir_remove_dead_variables and split_per_member_structs + * below see the corresponding stores. 
*/ - NIR_PASS_V(nir, nir_lower_variable_initializers, ~(nir_var_shader_temp | - nir_var_function_temp)); - NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_uniform | nir_var_mem_ubo | - nir_var_mem_constant | nir_var_function_temp | nir_var_image, NULL); - { - bool progress; - do - { - progress = false; - NIR_PASS(progress, nir, nir_copy_prop); - NIR_PASS(progress, nir, nir_opt_copy_prop_vars); - NIR_PASS(progress, nir, nir_opt_deref); - NIR_PASS(progress, nir, nir_opt_dce); - NIR_PASS(progress, nir, nir_opt_undef); - NIR_PASS(progress, nir, nir_opt_constant_folding); - NIR_PASS(progress, nir, nir_opt_cse); - NIR_PASS(progress, nir, nir_lower_vars_to_ssa); - NIR_PASS(progress, nir, nir_opt_algebraic); - } while (progress); - } + NIR_PASS(_, nir, nir_lower_variable_initializers, ~0); - NIR_PASS_V(nir, nir_lower_variable_initializers, nir_var_function_temp); - NIR_PASS_V(nir, nir_lower_returns); - NIR_PASS_V(nir, nir_inline_functions); - - assert(nir->scratch_size == 0); - NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, nir_var_function_temp, glsl_get_cl_type_size_align); - - { - bool progress; - do - { - progress = false; - NIR_PASS(progress, nir, nir_copy_prop); - NIR_PASS(progress, nir, nir_opt_copy_prop_vars); - NIR_PASS(progress, nir, nir_opt_deref); - NIR_PASS(progress, nir, nir_opt_dce); - NIR_PASS(progress, nir, nir_opt_undef); - NIR_PASS(progress, nir, nir_opt_constant_folding); - NIR_PASS(progress, nir, nir_opt_cse); - NIR_PASS(progress, nir, nir_split_var_copies); - NIR_PASS(progress, nir, nir_lower_var_copies); - NIR_PASS(progress, nir, nir_lower_vars_to_ssa); - NIR_PASS(progress, nir, nir_opt_algebraic); - NIR_PASS(progress, nir, nir_opt_if, nir_opt_if_optimize_phi_true_false); - NIR_PASS(progress, nir, nir_opt_dead_cf); - NIR_PASS(progress, nir, nir_opt_remove_phis); - NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true); - NIR_PASS(progress, nir, nir_lower_vec3_to_vec4, nir_var_mem_generic | nir_var_uniform); - NIR_PASS(progress, nir, 
nir_opt_memcpy); - } while (progress); - } - - NIR_PASS_V(nir, nir_scale_fdiv); - - NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_uniform | nir_var_mem_ubo | - nir_var_mem_constant | nir_var_function_temp | nir_var_image, NULL); - - - NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_mem_shared | nir_var_function_temp, NULL); - - nir->scratch_size = 0; - NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, - nir_var_mem_shared | nir_var_function_temp | nir_var_shader_temp | - nir_var_mem_global | nir_var_mem_constant, - glsl_get_cl_type_size_align); - - // Lower memcpy - needs to wait until types are sized - { - bool progress; - do { - progress = false; - NIR_PASS(progress, nir, nir_opt_memcpy); - NIR_PASS(progress, nir, nir_copy_prop); - NIR_PASS(progress, nir, nir_opt_copy_prop_vars); - NIR_PASS(progress, nir, nir_opt_deref); - NIR_PASS(progress, nir, nir_opt_dce); - NIR_PASS(progress, nir, nir_split_var_copies); - NIR_PASS(progress, nir, nir_lower_var_copies); - NIR_PASS(progress, nir, nir_lower_vars_to_ssa); - NIR_PASS(progress, nir, nir_opt_constant_folding); - NIR_PASS(progress, nir, nir_opt_cse); - } while (progress); - } - NIR_PASS_V(nir, nir_lower_memcpy); - - NIR_PASS_V(nir, nir_lower_explicit_io, - nir_var_mem_shared | nir_var_function_temp | nir_var_shader_temp | nir_var_uniform, - nir_address_format_32bit_offset_as_64bit); - - NIR_PASS_V(nir, nir_lower_system_values); - - /* Hopefully we can drop this once lower_vars_to_ssa has improved to not - * lower everything to scratch. + /* LLVM loves to take advantage of the fact that vec3s in OpenCL are 16B + * aligned and so it can just read/write them as vec4s. This results in a + * LOT of vec4->vec3 casts on loads and stores. One solution to this + * problem is to get rid of all vec3 variables.
*/ - if (llvm17_wa) - cleanup_llvm17_scratch(nir); + NIR_PASS(_, nir, nir_lower_vec3_to_vec4, + nir_var_shader_temp | nir_var_function_temp | nir_var_mem_shared | + nir_var_mem_global | nir_var_mem_constant); + + /* We assign explicit types early so that the optimizer can take advantage + * of that information and hopefully get rid of some of our memcpys. + */ + NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, + nir_var_uniform | nir_var_shader_temp | nir_var_function_temp | + nir_var_mem_shared | nir_var_mem_global, + glsl_get_cl_type_size_align); + + optimize(nir); + + NIR_PASS(_, nir, nir_remove_dead_variables, nir_var_all, NULL); /* Lower again, this time after dead-variables to get more compact variable * layouts. */ - nir->global_mem_size = 0; - nir->scratch_size = 0; - nir->info.shared_size = 0; - NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, - nir_var_mem_shared | nir_var_mem_global | nir_var_mem_constant, - glsl_get_cl_type_size_align); - if (nir->constant_data_size > 0) { - assert(nir->constant_data == NULL); - nir->constant_data = rzalloc_size(nir, nir->constant_data_size); - nir_gather_explicit_io_initializers(nir, nir->constant_data, - nir->constant_data_size, - nir_var_mem_constant); - } + NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, + nir_var_shader_temp | nir_var_function_temp | nir_var_mem_shared | + nir_var_mem_global | nir_var_mem_constant, + glsl_get_cl_type_size_align); + assert(nir->constant_data_size == 0); - NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_constant, - nir_address_format_64bit_global); + NIR_PASS(_, nir, nir_lower_memcpy); - NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_uniform, - nir_address_format_32bit_offset_as_64bit); + NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_constant, + nir_address_format_64bit_global); - NIR_PASS_V(nir, nir_lower_explicit_io, - nir_var_shader_temp | nir_var_function_temp | - nir_var_mem_shared | nir_var_mem_global, - nir_address_format_62bit_generic); + NIR_PASS(_, nir, 
nir_lower_explicit_io, nir_var_uniform, + nir_address_format_64bit_global); - if (INTEL_DEBUG(DEBUG_CS)) { - /* Re-index SSA defs so we print more sensible numbers. */ - nir_foreach_function_impl(impl, nir) { - nir_index_ssa_defs(impl); - } + /* Note: we cannot lower explicit I/O here, because we need derefs intact + * for function calls into the library to work. + */ - fprintf(stderr, "NIR (before I/O lowering) for kernel\n"); - nir_print_shader(nir, stderr); - } + NIR_PASS(_, nir, nir_lower_convert_alu_types, NULL); + NIR_PASS(_, nir, nir_opt_if, 0); + NIR_PASS(_, nir, nir_opt_idiv_const, 16); + + optimize(nir); return nir; } diff --git a/src/intel/vulkan/anv_internal_kernels.c b/src/intel/vulkan/anv_internal_kernels.c index 6a6db4c9795..3b5ca1a6dd2 100644 --- a/src/intel/vulkan/anv_internal_kernels.c +++ b/src/intel/vulkan/anv_internal_kernels.c @@ -56,7 +56,7 @@ load_libanv(struct anv_device *device) void *mem_ctx = ralloc_context(NULL); - return brw_nir_from_spirv(mem_ctx, device->info->ver, spv_code, spv_size, true); + return brw_nir_from_spirv(mem_ctx, spv_code, spv_size); } static void @@ -73,6 +73,7 @@ link_libanv(nir_shader *nir, const nir_shader *libanv) nir_var_shader_temp | nir_var_function_temp | nir_var_mem_shared | nir_var_mem_global, nir_address_format_62bit_generic); + NIR_PASS_V(nir, nir_lower_scratch_to_var); } static struct anv_shader_bin *