diff --git a/src/panfrost/compiler/bifrost/bifrost_nir.c b/src/panfrost/compiler/bifrost/bifrost_nir.c
index 0f2b3dd18c8..87166cce104 100644
--- a/src/panfrost/compiler/bifrost/bifrost_nir.c
+++ b/src/panfrost/compiler/bifrost/bifrost_nir.c
@@ -826,9 +826,6 @@ mem_access_size_align_cb(nir_intrinsic_op intrin, uint8_t bytes,
    };
 }
 
-static void bi_lower_texture_nir(nir_shader *nir, uint64_t gpu_id);
-static void bi_lower_texture_late_nir(nir_shader *nir, uint64_t gpu_id);
-
 static bool
 nir_shader_has_local_variables(const nir_shader *nir)
 {
@@ -840,6 +837,11 @@ nir_shader_has_local_variables(const nir_shader *nir)
    return false;
 }
 
+static bool pan_nir_lower_texel_buffer_fetch(nir_shader *nir, unsigned arch);
+static bool pan_nir_lower_buf_image_access(nir_shader *nir, unsigned arch);
+static bool bi_should_idvs(nir_shader *nir, const struct pan_compile_inputs *inputs);
+static bool bifrost_nir_lower_vs_atomics(nir_shader *nir);
+
 void
 bifrost_postprocess_nir(nir_shader *nir,
                         const struct pan_compile_inputs *inputs,
@@ -850,6 +852,15 @@ bifrost_postprocess_nir(nir_shader *nir,
    const uint64_t gpu_id = inputs->gpu_id;
    const unsigned gpu_arch = pan_arch(gpu_id);
 
+   NIR_PASS(_, nir, nir_lower_image_atomics_to_global, NULL, NULL);
+
+   /* On Bifrost, lower MSAA load/stores to 3D load/stores */
+   if (gpu_arch < 9)
+      NIR_PASS(_, nir, pan_nir_lower_image_ms);
+
+   NIR_PASS(_, nir, pan_nir_lower_texel_buffer_fetch, gpu_arch);
+   NIR_PASS(_, nir, pan_nir_lower_buf_image_access, gpu_arch);
+
    /* We assume that UBO and SSBO were lowered, let's move things around. */
    nir_move_options move_all = nir_move_const_undef | nir_move_load_ubo |
                                nir_move_comparisons | nir_move_copies |
@@ -858,20 +869,78 @@ bifrost_postprocess_nir(nir_shader *nir,
    NIR_PASS(_, nir, nir_opt_sink, move_all);
    NIR_PASS(_, nir, nir_opt_move, move_all);
 
-   bi_lower_texture_nir(nir, gpu_id);
+   /* The varying layout (if any) may have different bit sizes for some
+    * varyings than we have in the shader. For descriptors, this isn't a
+    * problem, as it's handled by the descriptor layout. However, for direct
+    * loads and stores on Valhall+, we need the right bit sizes in the shader.
+    * We could do this in the back-end as we emit, but for now it's easier to
+    * lower it in NIR. This also handles the case where the fragment shader
+    * loads a varying that the vertex shader never writes; in that case, we
+    * just return zero.
+    */
+   if (pan_arch(inputs->gpu_id) >= 9 && inputs->varying_layout)
+      NIR_PASS(_, nir, pan_nir_resize_varying_io, inputs->varying_layout);
 
    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
+      NIR_PASS(_, nir, nir_lower_is_helper_invocation);
+      NIR_PASS(_, nir, pan_nir_lower_helper_invocation);
+      NIR_PASS(_, nir, pan_nir_lower_sample_pos);
       NIR_PASS(_, nir, pan_nir_lower_noperspective_fs);
+      NIR_PASS(_, nir, nir_lower_frag_coord_to_pixel_coord);
+      NIR_PASS(_, nir, pan_nir_lower_var_special_pan);
+      /* TODO: should we do this in VS too? Should we do this earlier? */
       NIR_PASS(_, nir, nir_lower_mediump_io,
                nir_var_shader_in | nir_var_shader_out,
                ~bi_fp32_varying_mask(nir), false);
       NIR_PASS(_, nir, bifrost_nir_lower_load_output);
+
+      /* Collect varying formats */
+      pan_varying_collect_formats(&info->varyings.formats,
+                                  nir, inputs->gpu_id,
+                                  inputs->trust_varying_flat_highp_types,
+                                  false /* lower mediump */);
+
+      /* TODO: This can go in lower_noperspective_fs */
+      info->varyings.noperspective =
+         pan_nir_collect_noperspective_varyings_fs(nir);
+
+      if (!inputs->is_blend)
+         NIR_PASS(_, nir, pan_nir_lower_fs_inputs, inputs->gpu_id,
+                  inputs->varying_layout, info);
+
+      /* Blit shaders may not need to run ATEST, since ATEST is not needed if
+       * early-z is forced, alpha-to-coverage is disabled, and there are no
+       * writes to the coverage mask. The latter two are satisfied for all
+       * blit shaders, so we just care about early-z, which blit shaders force
+       * iff they do not write depth or stencil.
+       */
+      const bool emit_zs =
+         nir->info.outputs_written & (BITFIELD_BIT(FRAG_RESULT_DEPTH) |
+                                      BITFIELD_BIT(FRAG_RESULT_STENCIL));
+      const bool skip_atest = inputs->is_blit && !emit_zs;
+      NIR_PASS(_, nir, pan_nir_lower_fs_outputs, skip_atest);
    } else if (nir->info.stage == MESA_SHADER_VERTEX) {
       NIR_PASS(_, nir, nir_lower_viewport_transform);
       NIR_PASS(_, nir, nir_lower_point_size, 1.0, 0.0);
+
+      /* Copy varying formats & layout */
+      assert(inputs->varying_layout);
+      memcpy(&info->varyings.formats, inputs->varying_layout,
+             sizeof(*inputs->varying_layout));
+
+      info->vs.idvs = bi_should_idvs(nir, inputs);
+
+      if (info->vs.idvs && nir->info.writes_memory)
+         NIR_PASS(_, nir, bifrost_nir_lower_vs_atomics);
+
+      /* Needs to run after lower_vs_atomics, as noperspective lowering
+       * inserts operations between ssbo_atomic and store_output */
       NIR_PASS(_, nir, pan_nir_lower_noperspective_vs);
+      NIR_PASS(_, nir, pan_nir_lower_vs_outputs, inputs->gpu_id,
+               inputs->varying_layout, info->vs.idvs,
+               &info->vs.needs_extended_fifo);
    }
 
    /* Our OpenCL compiler (src/panfrost/clc/pan_compile.c) has a very weird and
@@ -937,73 +1006,62 @@ bifrost_postprocess_nir(nir_shader *nir,
  * we can implement reductions and scans on f16vec2 values without splitting
  * to scalar first.
  */
+   const nir_lower_subgroups_options lower_subgroup_opts = {
+      .subgroup_size = pan_subgroup_size(gpu_arch),
+      .ballot_bit_size = 32,
+      .ballot_components = 1,
+      .lower_to_scalar = true,
+      .lower_vote_feq = true,
+      .lower_vote_ieq = true,
+      .lower_vote_bool_eq = true,
+      .lower_first_invocation_to_ballot = true,
+      .lower_read_first_invocation = true,
+      .lower_subgroup_masks = true,
+      .lower_relative_shuffle = true,
+      .lower_shuffle = true,
+      .lower_quad = true,
+      .lower_quad_broadcast_dynamic = true,
+      .lower_quad_vote = true,
+      .lower_elect = true,
+      .lower_rotate_to_shuffle = true,
+      .lower_rotate_clustered_to_shuffle = true,
+      .lower_inverse_ballot = true,
+      .lower_reduce = true,
+      .lower_boolean_reduce = true,
+      .lower_boolean_shuffle = true,
+   };
    bool lower_subgroups_progress = false;
    NIR_PASS(lower_subgroups_progress, nir, nir_lower_subgroups,
-            &(nir_lower_subgroups_options) {
-               .subgroup_size = pan_subgroup_size(pan_arch(gpu_id)),
-               .ballot_bit_size = 32,
-               .ballot_components = 1,
-               .lower_to_scalar = true,
-               .lower_vote_feq = true,
-               .lower_vote_ieq = true,
-               .lower_vote_bool_eq = true,
-               .lower_first_invocation_to_ballot = true,
-               .lower_read_first_invocation = true,
-               .lower_subgroup_masks = true,
-               .lower_relative_shuffle = true,
-               .lower_shuffle = true,
-               .lower_quad = true,
-               .lower_quad_broadcast_dynamic = true,
-               .lower_quad_vote = true,
-               .lower_elect = true,
-               .lower_rotate_to_shuffle = true,
-               .lower_rotate_clustered_to_shuffle = true,
-               .lower_inverse_ballot = true,
-               .lower_reduce = true,
-               .lower_boolean_reduce = true,
-               .lower_boolean_shuffle = true,
-            });
-   /* nir_lower_subgroups creates new vars, clean them up. */
+            &lower_subgroup_opts);
+   /* lower_subgroups creates vars; clean them up before lower_64bit_phis */
    if (lower_subgroups_progress)
       NIR_PASS(_, nir, nir_lower_vars_to_ssa);
 
    NIR_PASS(_, nir, nir_shader_intrinsics_pass, bi_lower_subgroups,
-            nir_metadata_control_flow, (void *) &gpu_id);
+            nir_metadata_control_flow, (void *) &gpu_id);
 
+   /* Lower constant idiv before we lower 64-bit integers */
+   NIR_PASS(_, nir, nir_opt_idiv_const, 8);
+
+   /* Lower 64-bit integers */
    NIR_PASS(_, nir, nir_lower_64bit_phis);
    NIR_PASS(_, nir, nir_lower_int64);
+
+   const nir_lower_idiv_options lower_idiv_opts = {
+      .allow_fp16 = true,
+   };
+   NIR_PASS(_, nir, nir_lower_idiv, &lower_idiv_opts);
+
+   NIR_PASS(_, nir, nir_lower_flrp, 16 | 32 | 64, false /* always_precise */);
+   NIR_PASS(_, nir, nir_lower_alu); /* Lower [iu]mul_high */
+
+   /* Lower bit sizes and vector widths */
    NIR_PASS(_, nir, nir_lower_bit_size, bi_lower_bit_size, (void *) &gpu_id);
-
-   NIR_PASS(_, nir, nir_opt_idiv_const, 8);
-   NIR_PASS(_, nir, nir_lower_idiv,
-            &(nir_lower_idiv_options){.allow_fp16 = true});
-
    NIR_PASS(_, nir, nir_lower_alu_width, bi_vectorize_filter, &gpu_id);
    NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
    NIR_PASS(_, nir, nir_lower_phis_to_scalar, bi_vectorize_filter, &gpu_id);
-   NIR_PASS(_, nir, nir_lower_flrp, 16 | 32 | 64, false /* always_precise */);
-   NIR_PASS(_, nir, nir_lower_var_copies);
-   NIR_PASS(_, nir, nir_lower_alu);
-   NIR_PASS(_, nir, nir_lower_frag_coord_to_pixel_coord);
-   NIR_PASS(_, nir, pan_nir_lower_var_special_pan);
-   bi_lower_texture_late_nir(nir, gpu_id);
-}
-
-static void
-bi_lower_texture_nir(nir_shader *nir, uint64_t gpu_id)
-{
-   NIR_PASS(_, nir, nir_lower_image_atomics_to_global, NULL, NULL);
-
-   /* on Bifrost, lower MSAA load/stores to 3D load/stores */
-   if (pan_arch(gpu_id) < 9)
-      NIR_PASS(_, nir, pan_nir_lower_image_ms);
-
-   if (nir->info.stage == MESA_SHADER_FRAGMENT) {
-      NIR_PASS(_, nir, nir_lower_is_helper_invocation);
-      NIR_PASS(_, nir, pan_nir_lower_helper_invocation);
-      NIR_PASS(_, nir, pan_nir_lower_sample_pos);
-   }
+   bi_optimize_loop(nir, gpu_id, false /* allow_copies */);
 }
 
 static bool
@@ -1136,18 +1194,6 @@ pan_nir_lower_buf_image_access(nir_shader *shader, unsigned arch)
                                        nir_metadata_control_flow, &arch);
 }
 
-/* This must be called after any lowering of resource indices
- * (panfrost_nir_lower_res_indices / panvk_per_arch(nir_lower_descriptors))
- * and lowering of attribute indices (pan_nir_lower_image_index /
- * pan_nir_lower_texel_buffer_fetch_index)
- */
-static void
-bi_lower_texture_late_nir(nir_shader *nir, uint64_t gpu_id)
-{
-   NIR_PASS(_, nir, pan_nir_lower_texel_buffer_fetch, pan_arch(gpu_id));
-   NIR_PASS(_, nir, pan_nir_lower_buf_image_access, pan_arch(gpu_id));
-}
-
 /* Decide if Index-Driven Vertex Shading should be used for a given shader */
 static bool
 bi_should_idvs(nir_shader *nir, const struct pan_compile_inputs *inputs)
@@ -1179,6 +1225,10 @@ bi_should_idvs(nir_shader *nir, const struct pan_compile_inputs *inputs)
  * This allows
  * "dEQP-GLES31.functional.shaders.opaque_type_indexing.atomic_counter.*" to
  * pass under ANGLE.
+ * TODO: We should fix the tests and fix ANGLE too, since VS atomics are not
+ * yet enabled for panvk (even though they should be). This pass is only here
+ * to pass a couple of tests, and it breaks if any operation is inserted
+ * between the atomic operation and the varying store.
  */
 
 static bool
@@ -1252,43 +1302,6 @@ bifrost_compile_shader_nir(nir_shader *nir,
 
    bifrost_init_debug_options();
 
-   /* The varying layout (if any) may have different bit sizes for some
-    * varyings than we have in the shader. For descriptors, this isn't a
-    * problem as it's handled by the descriptor layout. However, for direct
-    * loads and stores on Valhall+, we need the right bit sizes in the shader.
-    * We could do this in the back-end as we emit but it's easier for now to
-    * lower in NIR. This also handles the case where we do a load from the
-    * fragment shader of something that isn't written by the vertex shader.
-    * In that case, we just return zero.
-    */
-   if (pan_arch(inputs->gpu_id) >= 9 && inputs->varying_layout)
-      NIR_PASS(_, nir, pan_nir_resize_varying_io, inputs->varying_layout);
-
-   if (nir->info.stage == MESA_SHADER_VERTEX) {
-      info->vs.idvs = bi_should_idvs(nir, inputs);
-
-      if (info->vs.idvs && nir->info.writes_memory)
-         NIR_PASS(_, nir, bifrost_nir_lower_vs_atomics);
-
-      NIR_PASS(_, nir, pan_nir_lower_vs_outputs, inputs->gpu_id,
-               inputs->varying_layout, info->vs.idvs,
-               &info->vs.needs_extended_fifo);
-   }
-
-   if (nir->info.stage == MESA_SHADER_FRAGMENT) {
-      /* Blit shaders may not need to run ATEST, since ATEST is not needed if
-       * early-z is forced, alpha-to-coverage is disabled, and there are no
-       * writes to the coverage mask. The latter two are satisfied for all
-       * blit shaders, so we just care about early-z, which blit shaders force
-       * iff they do not write depth or stencil
-       */
-      const bool emit_zs =
-         nir->info.outputs_written & (BITFIELD_BIT(FRAG_RESULT_DEPTH) |
-                                      BITFIELD_BIT(FRAG_RESULT_STENCIL));
-      const bool skip_atest = inputs->is_blit && !emit_zs;
-      NIR_PASS(_, nir, pan_nir_lower_fs_outputs, skip_atest);
-   }
-
    bi_optimize_late(nir, inputs->gpu_id, inputs->robust_modes);
 
    /* Lower constants to scalar but then immediately fold so we get minimum-
@@ -1304,22 +1317,6 @@ bifrost_compile_shader_nir(nir_shader *nir,
    info->tls_size = nir->scratch_size;
    info->stage = nir->info.stage;
 
-   if (nir->info.stage == MESA_SHADER_VERTEX) {
-      assert(inputs->varying_layout);
-      memcpy(&info->varyings.formats, inputs->varying_layout,
-             sizeof(*inputs->varying_layout));
-   } else if (nir->info.stage == MESA_SHADER_FRAGMENT) {
-      pan_varying_collect_formats(&info->varyings.formats,
-                                  nir, inputs->gpu_id,
-                                  inputs->trust_varying_flat_highp_types, false);
-      info->varyings.noperspective =
-         pan_nir_collect_noperspective_varyings_fs(nir);
-
-      if (!inputs->is_blend)
-         NIR_PASS(_, nir, pan_nir_lower_fs_inputs, inputs->gpu_id,
-                  inputs->varying_layout, info);
-   }
-
    if (nir->info.stage == MESA_SHADER_VERTEX && info->vs.idvs) {
       /* On 5th Gen, IDVS is only in one binary */
      if (pan_arch(inputs->gpu_id) >= 12)
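
As background for the "return zero" case mentioned in the pan_nir_resize_varying_io comment above, here is a minimal sketch of how a fragment-shader load of a varying that has no slot in the layout can be replaced with a zero constant, following the nir_shader_intrinsics_pass pattern already used in this file. varying_in_layout() is a hypothetical lookup helper standing in for the real varying-layout query; this is an illustrative sketch, not the actual Mesa pass.

#include "nir.h"
#include "nir_builder.h"

/* Sketch only: substitute zero for FS loads of varyings the VS never
 * writes. varying_in_layout() is a hypothetical helper, not a real Mesa
 * function. */
static bool
lower_unwritten_varying_load(nir_builder *b, nir_intrinsic_instr *intr,
                             void *data)
{
   if (intr->intrinsic != nir_intrinsic_load_input &&
       intr->intrinsic != nir_intrinsic_load_interpolated_input)
      return false;

   gl_varying_slot loc = nir_intrinsic_io_semantics(intr).location;
   if (varying_in_layout(data, loc))
      return false;

   /* The varying has no slot in the layout: replace the load with zero of
    * the same size and remove the original instruction. */
   b->cursor = nir_before_instr(&intr->instr);
   nir_def *zero =
      nir_imm_zero(b, intr->def.num_components, intr->def.bit_size);
   nir_def_rewrite_uses(&intr->def, zero);
   nir_instr_remove(&intr->instr);
   return true;
}

static bool
lower_unwritten_varying_loads(nir_shader *shader, const void *layout)
{
   return nir_shader_intrinsics_pass(shader, lower_unwritten_varying_load,
                                     nir_metadata_control_flow,
                                     (void *)layout);
}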