diff --git a/src/amd/common/ac_nir_lower_ngg.c b/src/amd/common/ac_nir_lower_ngg.c index d48733ae224..6400aa32eb3 100644 --- a/src/amd/common/ac_nir_lower_ngg.c +++ b/src/amd/common/ac_nir_lower_ngg.c @@ -47,13 +47,6 @@ typedef struct nir_def *chan[4]; } vs_output; -typedef struct -{ - nir_alu_type types[VARYING_SLOT_MAX][4]; - nir_alu_type types_16bit_lo[16][4]; - nir_alu_type types_16bit_hi[16][4]; -} shader_output_types; - typedef struct { const ac_nir_lower_ngg_options *options; @@ -91,20 +84,9 @@ typedef struct bool has_clipdist; /* outputs */ - nir_def *outputs[VARYING_SLOT_MAX][4]; - nir_def *outputs_16bit_lo[16][4]; - nir_def *outputs_16bit_hi[16][4]; - shader_output_types output_types; + ac_nir_prerast_out out; } lower_ngg_nogs_state; -typedef struct -{ - /* output stream index, 2 bit per component */ - uint8_t stream; - /* Bitmask of components used: 4 bits per slot, 1 bit per component. */ - uint8_t components_mask : 4; -} gs_output_info; - typedef struct { const ac_nir_lower_ngg_options *options; @@ -120,16 +102,8 @@ typedef struct unsigned lds_offs_primflags; bool output_compile_time_known; bool streamout_enabled; - /* 32 bit outputs */ - nir_def *outputs[VARYING_SLOT_MAX][4]; - gs_output_info output_info[VARYING_SLOT_MAX]; - /* 16 bit outputs */ - nir_def *outputs_16bit_hi[16][4]; - nir_def *outputs_16bit_lo[16][4]; - gs_output_info output_info_16bit_hi[16]; - gs_output_info output_info_16bit_lo[16]; - /* output types for both 32bit and 16bit */ - shader_output_types output_types; + /* Outputs */ + ac_nir_prerast_out out; /* Count per stream. */ nir_def *vertex_count[4]; nir_def *primitive_count[4]; @@ -661,7 +635,7 @@ emit_store_ngg_nogs_es_primitive_id(nir_builder *b, lower_ngg_nogs_state *s) prim_id = nir_load_primitive_id(b); } - s->outputs[VARYING_SLOT_PRIMITIVE_ID][0] = prim_id; + s->out.outputs[VARYING_SLOT_PRIMITIVE_ID][0] = prim_id; /* Update outputs_written to reflect that the pass added a new output. */ b->shader->info.outputs_written |= VARYING_BIT_PRIMITIVE_ID; @@ -1710,11 +1684,11 @@ add_deferred_attribute_culling(nir_builder *b, nir_cf_list *original_extracted_c static void ngg_nogs_store_edgeflag_to_lds(nir_builder *b, lower_ngg_nogs_state *s) { - if (!s->outputs[VARYING_SLOT_EDGE][0]) + if (!s->out.outputs[VARYING_SLOT_EDGE][0]) return; /* clamp user edge flag to 1 for latter bit operations */ - nir_def *edgeflag = s->outputs[VARYING_SLOT_EDGE][0]; + nir_def *edgeflag = s->out.outputs[VARYING_SLOT_EDGE][0]; edgeflag = nir_umin(b, edgeflag, nir_imm_int(b, 1)); /* user edge flag is stored at the beginning of a vertex if streamout is not enabled */ @@ -1774,7 +1748,7 @@ ngg_nogs_store_xfb_outputs_to_lds(nir_builder *b, lower_ngg_nogs_state *s) /* Clear unused components. */ for (unsigned i = 0; i < 4; i++) { - if (!s->outputs[slot][i]) + if (!s->out.outputs[slot][i]) mask &= ~BITFIELD_BIT(i); } @@ -1787,7 +1761,7 @@ ngg_nogs_store_xfb_outputs_to_lds(nir_builder *b, lower_ngg_nogs_state *s) * Vulkan does not allow streamout outputs less than 32bit. * OpenGL puts 16bit outputs in VARYING_SLOT_VAR0_16BIT. */ - nir_def *store_val = nir_vec(b, &s->outputs[slot][start], (unsigned)count); + nir_def *store_val = nir_vec(b, &s->out.outputs[slot][start], (unsigned)count); nir_store_shared(b, store_val, addr, .base = packed_location * 16 + start * 4); } } @@ -1802,14 +1776,14 @@ ngg_nogs_store_xfb_outputs_to_lds(nir_builder *b, lower_ngg_nogs_state *s) /* Clear unused components. */ for (unsigned i = 0; i < 4; i++) { - if (!s->outputs_16bit_lo[slot][i]) + if (!s->out.outputs_16bit_lo[slot][i]) mask_lo &= ~BITFIELD_BIT(i); - if (!s->outputs_16bit_hi[slot][i]) + if (!s->out.outputs_16bit_hi[slot][i]) mask_hi &= ~BITFIELD_BIT(i); } - nir_def **outputs_lo = s->outputs_16bit_lo[slot]; - nir_def **outputs_hi = s->outputs_16bit_hi[slot]; + nir_def **outputs_lo = s->out.outputs_16bit_lo[slot]; + nir_def **outputs_hi = s->out.outputs_16bit_hi[slot]; nir_def *undef = nir_undef(b, 1, 16); unsigned mask = mask_lo | mask_hi; @@ -1994,7 +1968,7 @@ ngg_build_streamout_vertex(nir_builder *b, nir_xfb_info *info, unsigned stream, nir_def *so_buffer[4], nir_def *buffer_offsets[4], nir_def *vtx_buffer_idx, nir_def *vtx_lds_addr, - shader_output_types *output_types, + ac_nir_prerast_out *pr_out, bool skip_primitive_id) { nir_def *vtx_buffer_offsets[4]; @@ -2053,10 +2027,10 @@ ngg_build_streamout_vertex(nir_builder *b, nir_xfb_info *info, if (out->high_16bits) { v = nir_unpack_32_2x16_split_y(b, v); - t = output_types->types_16bit_hi[index][c]; + t = pr_out->types_16bit_hi[index][c]; } else { v = nir_unpack_32_2x16_split_x(b, v); - t = output_types->types_16bit_lo[index][c]; + t = pr_out->types_16bit_lo[index][c]; } t = nir_alu_type_get_base_type(t); @@ -2112,7 +2086,7 @@ ngg_nogs_build_streamout(nir_builder *b, lower_ngg_nogs_state *s) nir_def *vtx_lds_addr = pervertex_lds_addr(b, vtx_lds_idx, vtx_lds_stride); ngg_build_streamout_vertex(b, info, 0, so_buffer, buffer_offsets, nir_iadd_imm(b, vtx_buffer_idx, i), - vtx_lds_addr, &s->output_types, s->skip_primitive_id); + vtx_lds_addr, &s->out, s->skip_primitive_id); } nir_pop_if(b, if_valid_vertex); } @@ -2188,56 +2162,7 @@ ngg_nogs_gather_outputs(nir_builder *b, struct exec_list *cf_list, lower_ngg_nog if (intrin->intrinsic != nir_intrinsic_store_output) continue; - assert(nir_src_is_const(intrin->src[1]) && !nir_src_as_uint(intrin->src[1])); - - nir_io_semantics sem = nir_intrinsic_io_semantics(intrin); - unsigned slot = sem.location; - - nir_def **output; - nir_alu_type *type; - if (slot >= VARYING_SLOT_VAR0_16BIT) { - unsigned index = slot - VARYING_SLOT_VAR0_16BIT; - if (sem.high_16bits) { - output = s->outputs_16bit_hi[index]; - type = s->output_types.types_16bit_hi[index]; - } else { - output = s->outputs_16bit_lo[index]; - type = s->output_types.types_16bit_lo[index]; - } - } else { - output = s->outputs[slot]; - type = s->output_types.types[slot]; - } - - unsigned component = nir_intrinsic_component(intrin); - unsigned write_mask = nir_intrinsic_write_mask(intrin); - nir_alu_type src_type = nir_intrinsic_src_type(intrin); - b->cursor = nir_after_instr(instr); - - nir_def *store_val = intrin->src[0].ssa; - - /* 16-bit output stored in a normal varying slot that isn't a dedicated 16-bit slot. */ - const bool non_dedicated_16bit = slot < VARYING_SLOT_VAR0_16BIT && store_val->bit_size == 16; - - u_foreach_bit (i, write_mask) { - unsigned c = component + i; - nir_def *store_component = nir_channel(b, intrin->src[0].ssa, i); - if (non_dedicated_16bit) { - if (sem.high_16bits) { - nir_def *lo = output[c] ? nir_unpack_32_2x16_split_x(b, output[c]) : nir_imm_intN_t(b, 0, 16); - output[c] = nir_pack_32_2x16_split(b, lo, store_component); - } else { - nir_def *hi = output[c] ? nir_unpack_32_2x16_split_y(b, output[c]) : nir_imm_intN_t(b, 0, 16); - output[c] = nir_pack_32_2x16_split(b, store_component, hi); - } - type[c] = nir_type_uint32; - } else { - output[c] = store_component; - type[c] = src_type; - } - } - - /* remove all store output instructions */ + ac_nir_gather_prerast_store_output_info(b, intrin, &s->out); nir_instr_remove(instr); } } @@ -2418,9 +2343,9 @@ nogs_export_vertex_params(nir_builder *b, nir_function_impl *impl, const unsigned num_outputs = gather_vs_outputs(b, outputs, s->options->vs_output_param_offset, - s->outputs, - s->outputs_16bit_lo, - s->outputs_16bit_hi); + s->out.outputs, + s->out.outputs_16bit_lo, + s->out.outputs_16bit_hi); if (!num_outputs) return; @@ -2438,8 +2363,8 @@ nogs_export_vertex_params(nir_builder *b, nir_function_impl *impl, ac_nir_export_parameters(b, s->options->vs_output_param_offset, b->shader->info.outputs_written, b->shader->info.outputs_written_16bit, - s->outputs, s->outputs_16bit_lo, - s->outputs_16bit_hi); + s->out.outputs, s->out.outputs_16bit_lo, + s->out.outputs_16bit_hi); } } @@ -2608,7 +2533,7 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option nir_def *pos_val = nir_load_var(b, state.position_value_var); for (int i = 0; i < 4; i++) - state.outputs[VARYING_SLOT_POS][i] = nir_channel(b, pos_val, i); + state.out.outputs[VARYING_SLOT_POS][i] = nir_channel(b, pos_val, i); } /* Gather outputs data and types */ @@ -2650,12 +2575,12 @@ ac_nir_lower_ngg_nogs(nir_shader *shader, const ac_nir_lower_ngg_options *option options->clip_cull_dist_mask, !options->has_param_exports, options->force_vrs, !wait_attr_ring, - export_outputs, state.outputs, NULL); + export_outputs, state.out.outputs, NULL); nogs_export_vertex_params(b, impl, if_es_thread, num_es_threads, &state); if (wait_attr_ring) - export_pos0_wait_attr_ring(b, if_es_thread, state.outputs, options); + export_pos0_wait_attr_ring(b, if_es_thread, state.out.outputs, options); nir_metadata_preserve(impl, nir_metadata_none); nir_validate_shader(shader, "after emitting NGG VS/TES"); @@ -2773,101 +2698,13 @@ ngg_gs_clear_primflags(nir_builder *b, nir_def *num_vertices, unsigned stream, l static bool lower_ngg_gs_store_output(nir_builder *b, nir_intrinsic_instr *intrin, lower_ngg_gs_state *s) { - assert(nir_src_is_const(intrin->src[1]) && !nir_src_as_uint(intrin->src[1])); - b->cursor = nir_before_instr(&intrin->instr); - - unsigned writemask = nir_intrinsic_write_mask(intrin); - unsigned component_offset = nir_intrinsic_component(intrin); - nir_io_semantics io_sem = nir_intrinsic_io_semantics(intrin); - - unsigned location = io_sem.location; - - nir_def *store_val = intrin->src[0].ssa; - nir_alu_type src_type = nir_intrinsic_src_type(intrin); - - /* Small bitsize components consume the same amount of space as 32-bit components, - * but 64-bit ones consume twice as many. (Vulkan spec 15.1.5) - * - * 64-bit IO has been lowered to multi 32-bit IO. - */ - assert(store_val->bit_size <= 32); - assert(nir_alu_type_get_type_size(src_type) == store_val->bit_size); - - /* Get corresponding output variable and usage info. */ - nir_def **output; - nir_alu_type *type; - gs_output_info *info; - if (location >= VARYING_SLOT_VAR0_16BIT) { - unsigned index = location - VARYING_SLOT_VAR0_16BIT; - assert(index < 16); - - if (io_sem.high_16bits) { - output = s->outputs_16bit_hi[index]; - type = s->output_types.types_16bit_hi[index]; - info = s->output_info_16bit_hi + index; - } else { - output = s->outputs_16bit_lo[index]; - type = s->output_types.types_16bit_lo[index]; - info = s->output_info_16bit_lo + index; - } - } else { - assert(location < VARYING_SLOT_MAX); - output = s->outputs[location]; - type = s->output_types.types[location]; - info = s->output_info + location; - } - - for (unsigned comp = 0; comp < store_val->num_components; ++comp) { - if (!(writemask & (1 << comp))) - continue; - unsigned stream = (io_sem.gs_streams >> (comp * 2)) & 0x3; - if (!(b->shader->info.gs.active_stream_mask & (1 << stream))) - continue; - - unsigned component = component_offset + comp; - - /* The same output component should always belong to the same stream. */ - assert(!(info->components_mask & (1 << component)) || - ((info->stream >> (component * 2)) & 3) == stream); - - /* Components of the same output slot may belong to different streams. */ - info->stream |= stream << (component * 2); - info->components_mask |= BITFIELD_BIT(component); - - /* Assume we have called nir_lower_io_to_temporaries which store output in the - * same block as EmitVertex, so we don't need to use nir_variable for outputs. - */ - nir_def *store_component = nir_channel(b, store_val, comp); - - /* 16-bit output stored in a normal varying slot that isn't a dedicated 16-bit slot. */ - const bool non_dedicated_16bit = location < VARYING_SLOT_VAR0_16BIT && store_val->bit_size == 16; - - if (non_dedicated_16bit) { - if (io_sem.high_16bits) { - nir_def *lo = output[component] ? nir_unpack_32_2x16_split_x(b, output[component]) : nir_imm_intN_t(b, 0, 16); - output[component] = nir_pack_32_2x16_split(b, lo, store_component); - } else { - nir_def *hi = output[component] ? nir_unpack_32_2x16_split_y(b, output[component]) : nir_imm_intN_t(b, 0, 16); - output[component] = nir_pack_32_2x16_split(b, store_component, hi); - } - - /* Don't care about what type was set first, we mark this as a 32-bit unsigned. */ - type[component] = nir_type_uint32; - } else { - output[component] = store_component; - - /* If type is set multiple times, the value must be same. */ - assert(type[component] == nir_type_invalid || type[component] == src_type); - type[component] = src_type; - } - } - + ac_nir_gather_prerast_store_output_info(b, intrin, &s->out); nir_instr_remove(&intrin->instr); return true; } static unsigned -gs_output_component_mask_with_stream(gs_output_info *info, unsigned stream) +gs_output_component_mask_with_stream(ac_nir_prerast_per_output_info *info, unsigned stream) { unsigned mask = info->components_mask; if (!mask) @@ -2897,25 +2734,28 @@ lower_ngg_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *intri nir_def *current_vtx_per_prim = intrin->src[1].ssa; nir_def *gs_emit_vtx_addr = ngg_gs_emit_vertex_addr(b, gs_emit_vtx_idx, s); + /* Store generic 32-bit outputs to LDS. + * In case of packed 16-bit, we assume that has been already packed into 32 bit slots by now. + */ u_foreach_bit64(slot, b->shader->info.outputs_written) { - unsigned packed_location = util_bitcount64((b->shader->info.outputs_written & BITFIELD64_MASK(slot))); - gs_output_info *info = &s->output_info[slot]; - nir_def **output = s->outputs[slot]; + const unsigned packed_location = util_bitcount64((b->shader->info.outputs_written & BITFIELD64_MASK(slot))); + unsigned mask = gs_output_component_mask_with_stream(&s->out.infos[slot], stream); + + nir_def **output = s->out.outputs[slot]; + nir_def *undef = nir_undef(b, 1, 32); - unsigned mask = gs_output_component_mask_with_stream(info, stream); while (mask) { int start, count; u_bit_scan_consecutive_range(&mask, &start, &count); nir_def *values[4] = {0}; for (int c = start; c < start + count; ++c) { if (!output[c]) { - /* no one write to this output before */ - values[c - start] = nir_undef(b, 1, 32); - continue; + /* The shader hasn't written this output. */ + values[c - start] = undef; + } else { + assert(output[c]->bit_size == 32); + values[c - start] = output[c]; } - - /* extend 8/16 bit to 32 bit, 64 bit has been lowered */ - values[c - start] = nir_u2uN(b, output[c], 32); } nir_def *store_val = nir_vec(b, values, (unsigned)count); @@ -2925,21 +2765,22 @@ lower_ngg_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *intri } /* Clear all outputs (they are undefined after emit_vertex) */ - memset(s->outputs[slot], 0, sizeof(s->outputs[slot])); + memset(s->out.outputs[slot], 0, sizeof(s->out.outputs[slot])); } - /* Store 16bit outputs to LDS. */ - unsigned num_32bit_outputs = util_bitcount64(b->shader->info.outputs_written); + const unsigned num_32bit_outputs = util_bitcount64(b->shader->info.outputs_written); + + /* Store dedicated 16-bit outputs to LDS. */ u_foreach_bit(slot, b->shader->info.outputs_written_16bit) { - unsigned packed_location = num_32bit_outputs + + const unsigned packed_location = num_32bit_outputs + util_bitcount(b->shader->info.outputs_written_16bit & BITFIELD_MASK(slot)); - unsigned mask_lo = gs_output_component_mask_with_stream(s->output_info_16bit_lo + slot, stream); - unsigned mask_hi = gs_output_component_mask_with_stream(s->output_info_16bit_hi + slot, stream); + const unsigned mask_lo = gs_output_component_mask_with_stream(s->out.infos_16bit_lo + slot, stream); + const unsigned mask_hi = gs_output_component_mask_with_stream(s->out.infos_16bit_hi + slot, stream); unsigned mask = mask_lo | mask_hi; - nir_def **output_lo = s->outputs_16bit_lo[slot]; - nir_def **output_hi = s->outputs_16bit_hi[slot]; + nir_def **output_lo = s->out.outputs_16bit_lo[slot]; + nir_def **output_hi = s->out.outputs_16bit_hi[slot]; nir_def *undef = nir_undef(b, 1, 16); while (mask) { @@ -2960,8 +2801,8 @@ lower_ngg_gs_emit_vertex_with_counter(nir_builder *b, nir_intrinsic_instr *intri } /* Clear all outputs (they are undefined after emit_vertex) */ - memset(s->outputs_16bit_lo[slot], 0, sizeof(s->outputs_16bit_lo[slot])); - memset(s->outputs_16bit_hi[slot], 0, sizeof(s->outputs_16bit_hi[slot])); + memset(s->out.outputs_16bit_lo[slot], 0, sizeof(s->out.outputs_16bit_lo[slot])); + memset(s->out.outputs_16bit_hi[slot], 0, sizeof(s->out.outputs_16bit_hi[slot])); } /* Calculate and store per-vertex primitive flags based on vertex counts: @@ -3113,11 +2954,10 @@ ngg_gs_export_vertices(nir_builder *b, nir_def *max_num_out_vtx, nir_def *tid_in } u_foreach_bit64(slot, b->shader->info.outputs_written) { - unsigned packed_location = + const unsigned packed_location = util_bitcount64((b->shader->info.outputs_written & BITFIELD64_MASK(slot))); - gs_output_info *info = &s->output_info[slot]; - unsigned mask = gs_output_component_mask_with_stream(info, 0); + unsigned mask = gs_output_component_mask_with_stream(&s->out.infos[slot], 0); while (mask) { int start, count; @@ -3128,20 +2968,19 @@ ngg_gs_export_vertices(nir_builder *b, nir_def *max_num_out_vtx, nir_def *tid_in .align_mul = 4); for (int i = 0; i < count; i++) - s->outputs[slot][start + i] = nir_channel(b, load, i); + s->out.outputs[slot][start + i] = nir_channel(b, load, i); } } - /* 16bit outputs */ - unsigned num_32bit_outputs = util_bitcount64(b->shader->info.outputs_written); + const unsigned num_32bit_outputs = util_bitcount64(b->shader->info.outputs_written); + + /* Dedicated 16-bit outputs. */ u_foreach_bit(i, b->shader->info.outputs_written_16bit) { - unsigned packed_location = num_32bit_outputs + + const unsigned packed_location = num_32bit_outputs + util_bitcount(b->shader->info.outputs_written_16bit & BITFIELD_MASK(i)); - gs_output_info *info_lo = s->output_info_16bit_lo + i; - gs_output_info *info_hi = s->output_info_16bit_hi + i; - unsigned mask_lo = gs_output_component_mask_with_stream(info_lo, 0); - unsigned mask_hi = gs_output_component_mask_with_stream(info_hi, 0); + const unsigned mask_lo = gs_output_component_mask_with_stream(&s->out.infos_16bit_lo[i], 0); + const unsigned mask_hi = gs_output_component_mask_with_stream(&s->out.infos_16bit_hi[i], 0); unsigned mask = mask_lo | mask_hi; while (mask) { @@ -3157,10 +2996,10 @@ ngg_gs_export_vertices(nir_builder *b, nir_def *max_num_out_vtx, nir_def *tid_in unsigned comp = start + j; if (mask_lo & BITFIELD_BIT(comp)) - s->outputs_16bit_lo[i][comp] = nir_unpack_32_2x16_split_x(b, val); + s->out.outputs_16bit_lo[i][comp] = nir_unpack_32_2x16_split_x(b, val); if (mask_hi & BITFIELD_BIT(comp)) - s->outputs_16bit_hi[i][comp] = nir_unpack_32_2x16_split_y(b, val); + s->out.outputs_16bit_hi[i][comp] = nir_unpack_32_2x16_split_y(b, val); } } } @@ -3179,7 +3018,7 @@ ngg_gs_export_vertices(nir_builder *b, nir_def *max_num_out_vtx, nir_def *tid_in s->options->clip_cull_dist_mask, !s->options->has_param_exports, s->options->force_vrs, !wait_attr_ring, - export_outputs, s->outputs, NULL); + export_outputs, s->out.outputs, NULL); nir_pop_if(b, if_vtx_export_thread); @@ -3190,8 +3029,8 @@ ngg_gs_export_vertices(nir_builder *b, nir_def *max_num_out_vtx, nir_def *tid_in vs_output outputs[64]; unsigned num_outputs = gather_vs_outputs(b, outputs, s->options->vs_output_param_offset, - s->outputs, s->outputs_16bit_lo, - s->outputs_16bit_hi); + s->out.outputs, s->out.outputs_16bit_lo, + s->out.outputs_16bit_hi); if (num_outputs) { b->cursor = nir_after_impl(s->impl); @@ -3204,13 +3043,13 @@ ngg_gs_export_vertices(nir_builder *b, nir_def *max_num_out_vtx, nir_def *tid_in ac_nir_export_parameters(b, s->options->vs_output_param_offset, b->shader->info.outputs_written, b->shader->info.outputs_written_16bit, - s->outputs, s->outputs_16bit_lo, - s->outputs_16bit_hi); + s->out.outputs, s->out.outputs_16bit_lo, + s->out.outputs_16bit_hi); } } if (wait_attr_ring) - export_pos0_wait_attr_ring(b, if_vtx_export_thread, s->outputs, s->options); + export_pos0_wait_attr_ring(b, if_vtx_export_thread, s->out.outputs, s->options); } static void @@ -3459,7 +3298,7 @@ ngg_gs_build_streamout(nir_builder *b, lower_ngg_gs_state *s) buffer_offsets, nir_iadd_imm(b, vtx_buffer_idx, i), exported_vtx_lds_addr[i], - &s->output_types, false); + &s->out, false); } } nir_pop_if(b, if_emit);