mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-29 22:48:12 +02:00
Mali hardware handles a bunch of OOB conditions by using addresses with the top bit set. When the top bit is set, any load/store from a shader will treat the address as OOB and read zero and discard writes. We can use this to implement ro_sink_address_poly. Signed-off-by: Faith Ekstrand <faith.ekstrand@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38547>
490 lines
16 KiB
C
490 lines
16 KiB
C
/*
|
|
* Copyright 2024 Collabora Ltd
|
|
* Copyright 2023 Alyssa Rosenzweig
|
|
* Copyright 2020 Intel Corporation
|
|
* SPDX-License-Identifier: MIT
|
|
*/
|
|
|
|
#include "compiler/glsl_types.h"
|
|
#include "compiler/spirv/nir_spirv.h"
|
|
#include "panfrost/compiler/bifrost/bifrost_compile.h"
|
|
#include "panfrost/compiler/pan_compiler.h"
|
|
#include "panfrost/compiler/pan_nir.h"
|
|
#include "nir.h"
|
|
#include "nir_builder.h"
|
|
#include "nir_builder_opcodes.h"
|
|
#include "nir_intrinsics.h"
|
|
#include "nir_precompiled.h"
|
|
#include "pan_shader.h"
|
|
#include "shader_enums.h"
|
|
|
|
#include <fcntl.h>
|
|
#include <inttypes.h>
|
|
#include <libgen.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <unistd.h>
|
|
#include "util/macros.h"
|
|
#include "util/u_dynarray.h"
|
|
#include "util/u_printf.h"
|
|
#include <sys/mman.h>
|
|
|
|
static const struct spirv_to_nir_options spirv_options = {
|
|
.environment = NIR_SPIRV_OPENCL,
|
|
.shared_addr_format = nir_address_format_62bit_generic,
|
|
.global_addr_format = nir_address_format_62bit_generic,
|
|
.temp_addr_format = nir_address_format_62bit_generic,
|
|
.constant_addr_format = nir_address_format_64bit_global,
|
|
.create_library = true,
|
|
.printf = true,
|
|
};
|
|
|
|
static const nir_shader_compiler_options *
|
|
get_compiler_options(unsigned arch)
|
|
{
|
|
if (arch >= 11)
|
|
return &bifrost_nir_options_v11;
|
|
|
|
if (arch >= 9)
|
|
return &bifrost_nir_options_v9;
|
|
|
|
return &bifrost_nir_options_v6;
|
|
}
|
|
|
|
/* Standard optimization loop */
|
|
static void
|
|
optimize(nir_shader *nir)
|
|
{
|
|
bool progress;
|
|
do {
|
|
progress = false;
|
|
|
|
NIR_PASS(progress, nir, nir_split_var_copies);
|
|
NIR_PASS(progress, nir, nir_split_struct_vars, nir_var_function_temp);
|
|
NIR_PASS(progress, nir, nir_lower_var_copies);
|
|
NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
|
|
|
|
NIR_PASS(progress, nir, nir_opt_copy_prop);
|
|
NIR_PASS(progress, nir, nir_opt_remove_phis);
|
|
NIR_PASS(progress, nir, nir_lower_all_phis_to_scalar);
|
|
NIR_PASS(progress, nir, nir_opt_dce);
|
|
NIR_PASS(progress, nir, nir_opt_dead_cf);
|
|
NIR_PASS(progress, nir, nir_opt_cse);
|
|
|
|
nir_opt_peephole_select_options peephole_select_options = {
|
|
.limit = 64,
|
|
.expensive_alu_ok = true,
|
|
};
|
|
NIR_PASS(progress, nir, nir_opt_peephole_select,
|
|
&peephole_select_options);
|
|
NIR_PASS(progress, nir, nir_opt_phi_precision);
|
|
NIR_PASS(progress, nir, nir_opt_algebraic);
|
|
NIR_PASS(progress, nir, nir_opt_constant_folding);
|
|
|
|
NIR_PASS(progress, nir, nir_opt_deref);
|
|
NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
|
|
NIR_PASS(progress, nir, nir_opt_undef);
|
|
NIR_PASS(progress, nir, nir_lower_undef_to_zero);
|
|
|
|
NIR_PASS(progress, nir, nir_opt_shrink_vectors, true);
|
|
NIR_PASS(progress, nir, nir_opt_loop_unroll);
|
|
|
|
} while (progress);
|
|
}
|
|
|
|
static nir_shader *
|
|
compile(void *memctx, const uint32_t *spirv, size_t spirv_size, unsigned arch)
|
|
{
|
|
const nir_shader_compiler_options *nir_options = get_compiler_options(arch);
|
|
|
|
nir_shader *nir =
|
|
spirv_to_nir(spirv, spirv_size / 4, NULL, 0, MESA_SHADER_KERNEL,
|
|
"library", &spirv_options, nir_options);
|
|
nir_validate_shader(nir, "after spirv_to_nir");
|
|
nir_validate_ssa_dominance(nir, "after spirv_to_nir");
|
|
ralloc_steal(memctx, nir);
|
|
|
|
nir_fixup_is_exported(nir);
|
|
|
|
NIR_PASS(_, nir, nir_lower_system_values);
|
|
NIR_PASS(_, nir, nir_lower_calls_to_builtins);
|
|
|
|
nir_lower_compute_system_values_options cs = {.global_id_is_32bit = true};
|
|
NIR_PASS(_, nir, nir_lower_compute_system_values, &cs);
|
|
|
|
NIR_PASS(_, nir, nir_lower_printf,
|
|
&(const struct nir_lower_printf_options){
|
|
.hash_format_strings = true,
|
|
});
|
|
|
|
/* We have to lower away local constant initializers right before we
|
|
* inline functions. That way they get properly initialized at the top
|
|
* of the function and not at the top of its caller.
|
|
*/
|
|
NIR_PASS(_, nir, nir_lower_variable_initializers, nir_var_function_temp);
|
|
NIR_PASS(_, nir, nir_lower_returns);
|
|
NIR_PASS(_, nir, nir_inline_functions);
|
|
nir_remove_non_exported(nir);
|
|
NIR_PASS(_, nir, nir_opt_copy_prop);
|
|
NIR_PASS(_, nir, nir_opt_deref);
|
|
|
|
/* We can't deal with constant data, get rid of it */
|
|
nir_lower_constant_to_temp(nir);
|
|
|
|
/* We can go ahead and lower the rest of the constant initializers. We do
|
|
* this here so that nir_remove_dead_variables and split_per_member_structs
|
|
* below see the corresponding stores.
|
|
*/
|
|
NIR_PASS(_, nir, nir_lower_variable_initializers, ~0);
|
|
|
|
/* LLVM loves take advantage of the fact that vec3s in OpenCL are 16B
|
|
* aligned and so it can just read/write them as vec4s. This results in a
|
|
* LOT of vec4->vec3 casts on loads and stores. One solution to this
|
|
* problem is to get rid of all vec3 variables.
|
|
*/
|
|
NIR_PASS(_, nir, nir_lower_vec3_to_vec4,
|
|
nir_var_shader_temp | nir_var_function_temp | nir_var_mem_shared |
|
|
nir_var_mem_global | nir_var_mem_constant);
|
|
|
|
NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, nir_var_uniform,
|
|
glsl_get_cl_type_size_align);
|
|
|
|
NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_uniform,
|
|
nir_address_format_32bit_offset_as_64bit);
|
|
|
|
return nir;
|
|
}
|
|
|
|
static nir_def *
|
|
load_sysval_from_push_const(nir_builder *b, unsigned offset, unsigned bit_size,
|
|
unsigned num_comps)
|
|
{
|
|
return nir_load_push_constant(
|
|
b, num_comps, bit_size,
|
|
nir_imm_int(b, BIFROST_PRECOMPILED_KERNEL_SYSVALS_OFFSET + offset));
|
|
}
|
|
|
|
static bool
|
|
lower_sysvals(nir_builder *b, nir_intrinsic_instr *intr, UNUSED void *_data)
|
|
{
|
|
const nir_shader *shader = b->shader;
|
|
|
|
unsigned num_comps = intr->def.num_components;
|
|
unsigned bit_size = intr->def.bit_size;
|
|
nir_def *val = NULL;
|
|
b->cursor = nir_before_instr(&intr->instr);
|
|
|
|
switch (intr->intrinsic) {
|
|
case nir_intrinsic_load_base_workgroup_id:
|
|
/* The base is always 0 */
|
|
val = nir_imm_zero(b, num_comps, bit_size);
|
|
break;
|
|
case nir_intrinsic_load_workgroup_size:
|
|
/* We are never expecting the local size to be variable */
|
|
assert(!shader->info.workgroup_size_variable);
|
|
val = nir_vec3(b, nir_imm_int(b, shader->info.workgroup_size[0]),
|
|
nir_imm_int(b, shader->info.workgroup_size[1]),
|
|
nir_imm_int(b, shader->info.workgroup_size[2]));
|
|
break;
|
|
|
|
case nir_intrinsic_load_num_workgroups:
|
|
val = load_sysval_from_push_const(
|
|
b, offsetof(struct bifrost_precompiled_kernel_sysvals, num_workgroups),
|
|
bit_size, num_comps);
|
|
break;
|
|
|
|
case nir_intrinsic_load_ro_sink_address_poly:
|
|
/* Any address with the top bit set is treated as OOB by the hardware
|
|
* and any reads return zero.
|
|
*/
|
|
val = nir_imm_int64(b, PAN_SHADER_OOB_ADDRESS);
|
|
break;
|
|
|
|
case nir_intrinsic_load_printf_buffer_address:
|
|
val = load_sysval_from_push_const(
|
|
b,
|
|
offsetof(struct bifrost_precompiled_kernel_sysvals,
|
|
printf_buffer_address),
|
|
bit_size, num_comps);
|
|
break;
|
|
|
|
default:
|
|
return false;
|
|
}
|
|
|
|
b->cursor = nir_after_instr(&intr->instr);
|
|
nir_def_replace(&intr->def, val);
|
|
return true;
|
|
}
|
|
|
|
static void
|
|
print_shader(FILE *fp, const char *name, const char *suffix, uint32_t variant,
|
|
nir_shader *nir, struct pan_shader_info *shader_info,
|
|
struct util_dynarray *binary)
|
|
{
|
|
struct bifrost_precompiled_kernel_info info =
|
|
bifrost_precompiled_pack_kernel_info(nir, shader_info, binary);
|
|
size_t sz_B = sizeof(info) + binary->size;
|
|
size_t sz_el = DIV_ROUND_UP(sz_B, 4);
|
|
uint32_t *mem = calloc(sz_el, 4);
|
|
|
|
memcpy(mem, &info, sizeof(info));
|
|
memcpy((uint8_t *)mem + sizeof(info), binary->data, binary->size);
|
|
|
|
nir_precomp_print_blob(fp, name, suffix, variant, mem, sz_B, true);
|
|
free(mem);
|
|
}
|
|
|
|
static nir_def *
|
|
load_kernel_input(nir_builder *b, unsigned num_components, unsigned bit_size,
|
|
unsigned offset_B)
|
|
{
|
|
return nir_load_push_constant(
|
|
b, num_components, bit_size,
|
|
nir_imm_int(b, BIFROST_PRECOMPILED_KERNEL_ARGS_OFFSET + offset_B));
|
|
}
|
|
|
|
/* Always assume default as we generate per gen already */
|
|
static const char *
|
|
remap_variant(nir_function *func, unsigned variant, const char *target)
|
|
{
|
|
return "default";
|
|
}
|
|
|
|
void pan_shader_compile(nir_shader *nir, struct pan_compile_inputs *inputs,
|
|
struct util_dynarray *binary,
|
|
struct pan_shader_info *info);
|
|
|
|
int
|
|
main(int argc, const char **argv)
|
|
{
|
|
if (argc != 6) {
|
|
fprintf(
|
|
stderr,
|
|
"Usage: %s [library name] [arch] [input spir-v] [output header] [output C]\n",
|
|
argv[0]);
|
|
return 1;
|
|
}
|
|
|
|
const char *library_name = argv[1];
|
|
const char *target_arch_str = argv[2];
|
|
const char *input_spirv_path = argv[3];
|
|
const char *output_h_path = argv[4];
|
|
const char *output_c_path = argv[5];
|
|
|
|
unsigned target_arch = atoi(target_arch_str);
|
|
|
|
if (target_arch < 4 || target_arch > 13) {
|
|
fprintf(stderr, "Unsupported target arch %d\n", target_arch);
|
|
return 1;
|
|
}
|
|
|
|
void *mem_ctx = ralloc_context(NULL);
|
|
if (mem_ctx == NULL) {
|
|
fprintf(stderr, "mem_ctx allocation failed\n");
|
|
goto err_out;
|
|
}
|
|
|
|
int fd = open(input_spirv_path, O_RDONLY);
|
|
if (fd < 0) {
|
|
fprintf(stderr, "Failed to open %s\n", input_spirv_path);
|
|
goto input_spirv_open_failed;
|
|
}
|
|
|
|
off_t spirv_len = lseek(fd, 0, SEEK_END);
|
|
if (spirv_len < 0) {
|
|
fprintf(stderr, "Failed to lseek to the end of the file: errno=%d, %s\n",
|
|
errno, strerror(errno));
|
|
close(fd);
|
|
goto input_spirv_open_failed;
|
|
}
|
|
|
|
const void *spirv_map = mmap(NULL, spirv_len, PROT_READ, MAP_PRIVATE, fd, 0);
|
|
close(fd);
|
|
|
|
if (spirv_map == MAP_FAILED) {
|
|
fprintf(stderr, "Failed to mmap the file: errno=%d, %s\n", errno,
|
|
strerror(errno));
|
|
goto input_spirv_open_failed;
|
|
}
|
|
|
|
FILE *fp_h = fopen(output_h_path, "w");
|
|
if (fp_h == NULL) {
|
|
fprintf(stderr, "Failed to open %s for writting\n", output_h_path);
|
|
goto input_spirv_open_failed;
|
|
}
|
|
|
|
FILE *fp_c = fopen(output_c_path, "w");
|
|
if (fp_c == NULL) {
|
|
fprintf(stderr, "Failed to open %s for writting\n", output_c_path);
|
|
goto fp_c_open_failed;
|
|
}
|
|
|
|
glsl_type_singleton_init_or_ref();
|
|
u_printf_singleton_init_or_ref();
|
|
|
|
/* POSIX basename can modify the content of the path */
|
|
char *tmp_out_h_path = strdup(output_h_path);
|
|
const char *output_h_file_name = basename(tmp_out_h_path);
|
|
nir_precomp_print_header(fp_c, fp_h, "Collabora Ltd", output_h_file_name);
|
|
free(tmp_out_h_path);
|
|
|
|
nir_shader *nir = compile(mem_ctx, spirv_map, spirv_len, target_arch);
|
|
|
|
/* load_preamble works at 32-bit granularity */
|
|
struct nir_precomp_opts opt = {.arg_align_B = 4};
|
|
|
|
nir_foreach_entrypoint(libfunc, nir) {
|
|
if (target_arch < 6) {
|
|
fprintf(
|
|
stderr,
|
|
"ERROR: Attempting to compile entrypoint %s on Midgard, this is unsupported!\n",
|
|
libfunc->name);
|
|
goto invalid_precomp;
|
|
}
|
|
|
|
unsigned nr_vars = nir_precomp_nr_variants(libfunc);
|
|
|
|
nir_precomp_print_layout_struct(fp_h, &opt, libfunc);
|
|
|
|
for (unsigned v = 0; v < nr_vars; ++v) {
|
|
nir_shader *s = nir_precompiled_build_variant(
|
|
libfunc, MESA_SHADER_COMPUTE, v, get_compiler_options(target_arch),
|
|
&opt, load_kernel_input);
|
|
|
|
unsigned gpu_prod_id = (target_arch & 0xf) << 12;
|
|
|
|
struct pan_compile_inputs inputs = {
|
|
.gpu_id = gpu_prod_id << 16,
|
|
.gpu_variant = 0,
|
|
};
|
|
|
|
nir_link_shader_functions(s, nir);
|
|
NIR_PASS(_, s, nir_inline_functions);
|
|
nir_remove_non_entrypoints(s);
|
|
NIR_PASS(_, s, nir_opt_deref);
|
|
NIR_PASS(_, s, nir_lower_vars_to_ssa);
|
|
NIR_PASS(_, s, nir_remove_dead_derefs);
|
|
NIR_PASS(_, s, nir_remove_dead_variables,
|
|
nir_var_function_temp | nir_var_shader_temp, NULL);
|
|
|
|
/* We assign explicit types early so that the optimizer can take
|
|
* advantage of that information and hopefully get rid of some of our
|
|
* memcpys.
|
|
*/
|
|
NIR_PASS(_, s, nir_lower_vars_to_explicit_types,
|
|
nir_var_shader_temp | nir_var_function_temp |
|
|
nir_var_mem_shared | nir_var_mem_global,
|
|
glsl_get_cl_type_size_align);
|
|
|
|
optimize(s);
|
|
|
|
NIR_PASS(_, s, nir_remove_dead_variables, nir_var_all, NULL);
|
|
|
|
/* Lower again, this time after dead-variables to get more compact
|
|
* variable layouts.
|
|
*/
|
|
NIR_PASS(_, s, nir_lower_vars_to_explicit_types,
|
|
nir_var_shader_temp | nir_var_function_temp |
|
|
nir_var_mem_shared | nir_var_mem_global |
|
|
nir_var_mem_constant,
|
|
glsl_get_cl_type_size_align);
|
|
assert(nir->constant_data_size == 0);
|
|
|
|
NIR_PASS(_, s, nir_lower_memcpy);
|
|
|
|
NIR_PASS(_, s, nir_lower_explicit_io, nir_var_mem_constant,
|
|
nir_address_format_64bit_global);
|
|
|
|
NIR_PASS(_, s, nir_lower_convert_alu_types, NULL);
|
|
NIR_PASS(_, s, nir_opt_if, 0);
|
|
NIR_PASS(_, s, nir_opt_idiv_const, 16);
|
|
|
|
/* Lower explicit IO here to ensure that we will not clash with
|
|
* different address formats inside shaders */
|
|
NIR_PASS(_, s, nir_opt_deref);
|
|
NIR_PASS(_, s, nir_lower_vars_to_ssa);
|
|
NIR_PASS(_, s, nir_lower_explicit_io,
|
|
nir_var_shader_temp | nir_var_function_temp |
|
|
nir_var_mem_shared | nir_var_mem_global,
|
|
nir_address_format_62bit_generic);
|
|
|
|
optimize(s);
|
|
|
|
NIR_PASS(_, s, nir_lower_vars_to_explicit_types,
|
|
nir_var_shader_temp | nir_var_function_temp,
|
|
glsl_get_cl_type_size_align);
|
|
|
|
NIR_PASS(_, s, nir_lower_vars_to_explicit_types, nir_var_mem_shared,
|
|
glsl_get_cl_type_size_align);
|
|
|
|
/* Unroll loops before lowering indirects */
|
|
bool progress = false;
|
|
do {
|
|
progress = false;
|
|
NIR_PASS(progress, s, nir_opt_loop);
|
|
} while (progress);
|
|
|
|
pan_preprocess_nir(s, inputs.gpu_id);
|
|
pan_nir_lower_texture_early(s, inputs.gpu_id);
|
|
pan_postprocess_nir(s, inputs.gpu_id);
|
|
pan_nir_lower_texture_late(s, inputs.gpu_id);
|
|
|
|
NIR_PASS(_, s, nir_opt_deref);
|
|
NIR_PASS(_, s, nir_lower_vars_to_ssa);
|
|
NIR_PASS(_, s, nir_lower_explicit_io,
|
|
nir_var_shader_temp | nir_var_function_temp |
|
|
nir_var_mem_shared | nir_var_mem_global,
|
|
nir_address_format_62bit_generic);
|
|
|
|
NIR_PASS(_, s, nir_shader_intrinsics_pass, lower_sysvals,
|
|
nir_metadata_control_flow, NULL);
|
|
|
|
nir_shader *clone = nir_shader_clone(NULL, s);
|
|
|
|
struct util_dynarray shader_binary;
|
|
struct pan_shader_info shader_info = {0};
|
|
shader_binary = UTIL_DYNARRAY_INIT;
|
|
pan_shader_compile(clone, &inputs, &shader_binary, &shader_info);
|
|
|
|
assert(shader_info.push.count * 4 <=
|
|
BIFROST_PRECOMPILED_KERNEL_ARGS_SIZE &&
|
|
"Too many kernel arguments!");
|
|
|
|
print_shader(fp_c, libfunc->name, "default", v, s, &shader_info,
|
|
&shader_binary);
|
|
util_dynarray_fini(&shader_binary);
|
|
ralloc_free(clone);
|
|
|
|
ralloc_free(s);
|
|
}
|
|
}
|
|
|
|
nir_precomp_print_program_enum(fp_h, nir, library_name);
|
|
nir_precomp_print_dispatch_macros(fp_h, &opt, nir);
|
|
|
|
char target_name[12];
|
|
snprintf(target_name, sizeof(target_name), "default_v%d", target_arch);
|
|
nir_precomp_print_extern_binary_map(fp_h, library_name, target_name);
|
|
nir_precomp_print_binary_map(fp_c, nir, library_name, target_name,
|
|
remap_variant);
|
|
|
|
u_printf_singleton_decref();
|
|
glsl_type_singleton_decref();
|
|
fclose(fp_c);
|
|
fclose(fp_h);
|
|
ralloc_free(mem_ctx);
|
|
|
|
return 0;
|
|
|
|
invalid_precomp:
|
|
u_printf_singleton_decref();
|
|
glsl_type_singleton_decref();
|
|
fp_c_open_failed:
|
|
fclose(fp_h);
|
|
input_spirv_open_failed:
|
|
ralloc_free(mem_ctx);
|
|
err_out:
|
|
return 1;
|
|
}
|