diff --git a/src/amd/common/ac_nir_lower_ngg.c b/src/amd/common/ac_nir_lower_ngg.c
index b4462ccff11..34a949ed912 100644
--- a/src/amd/common/ac_nir_lower_ngg.c
+++ b/src/amd/common/ac_nir_lower_ngg.c
@@ -2499,122 +2499,6 @@ ngg_nogs_gather_outputs(nir_builder *b, struct exec_list *cf_list, lower_ngg_nog
    }
 }
 
-static unsigned
-gather_vs_outputs(nir_builder *b, vs_output *outputs,
-                  const uint8_t *param_offsets,
-                  nir_def *(*data)[4],
-                  nir_def *(*data_16bit_lo)[4],
-                  nir_def *(*data_16bit_hi)[4])
-{
-   unsigned num_outputs = 0;
-   u_foreach_bit64 (slot, b->shader->info.outputs_written) {
-      if (param_offsets[slot] > AC_EXP_PARAM_OFFSET_31)
-         continue;
-
-      nir_def **output = data[slot];
-
-      /* skip output if no one written before */
-      if (!output[0] && !output[1] && !output[2] && !output[3])
-         continue;
-
-      outputs[num_outputs].slot = slot;
-      for (int i = 0; i < 4; i++) {
-         outputs[num_outputs].chan[i] = output[i];
-      }
-      num_outputs++;
-   }
-
-   u_foreach_bit (i, b->shader->info.outputs_written_16bit) {
-      unsigned slot = VARYING_SLOT_VAR0_16BIT + i;
-      if (param_offsets[slot] > AC_EXP_PARAM_OFFSET_31)
-         continue;
-
-      nir_def **output_lo = data_16bit_lo[i];
-      nir_def **output_hi = data_16bit_hi[i];
-
-      /* skip output if no one written before */
-      if (!output_lo[0] && !output_lo[1] && !output_lo[2] && !output_lo[3] &&
-          !output_hi[0] && !output_hi[1] && !output_hi[2] && !output_hi[3])
-         continue;
-
-      vs_output *output = &outputs[num_outputs++];
-      output->slot = slot;
-
-      nir_def *undef = nir_undef(b, 1, 16);
-      for (int j = 0; j < 4; j++) {
-         nir_def *lo = output_lo[j] ? output_lo[j] : undef;
-         nir_def *hi = output_hi[j] ? output_hi[j] : undef;
-         if (output_lo[j] || output_hi[j])
-            output->chan[j] = nir_pack_32_2x16_split(b, lo, hi);
-         else
-            output->chan[j] = NULL;
-      }
-   }
-
-   return num_outputs;
-}
-
-static void
-create_vertex_param_phis(nir_builder *b, unsigned num_outputs, vs_output *outputs)
-{
-   nir_def *undef = nir_undef(b, 1, 32); /* inserted at the start of the shader */
-
-   for (unsigned i = 0; i < num_outputs; i++) {
-      for (unsigned j = 0; j < 4; j++) {
-         if (outputs[i].chan[j])
-            outputs[i].chan[j] = nir_if_phi(b, outputs[i].chan[j], undef);
-      }
-   }
-}
-
-static void
-export_vertex_params_gfx11(nir_builder *b, nir_def *export_tid, nir_def *num_export_threads,
-                           unsigned num_outputs, vs_output *outputs,
-                           const uint8_t *vs_output_param_offset)
-{
-   nir_def *attr_rsrc = nir_load_ring_attr_amd(b);
-
-   /* We should always store full vec4s in groups of 8 lanes for the best performance even if
-    * some of them are garbage or have unused components, so align the number of export threads
-    * to 8.
-    */
-   num_export_threads = nir_iand_imm(b, nir_iadd_imm(b, num_export_threads, 7), ~7);
-   if (!export_tid)
-      nir_push_if(b, nir_is_subgroup_invocation_lt_amd(b, num_export_threads));
-   else
-      nir_push_if(b, nir_ult(b, export_tid, num_export_threads));
-
-   nir_def *attr_offset = nir_load_ring_attr_offset_amd(b);
-   nir_def *vindex = nir_load_local_invocation_index(b);
-   nir_def *voffset = nir_imm_int(b, 0);
-   nir_def *undef = nir_undef(b, 1, 32);
-
-   uint32_t exported_params = 0;
-
-   for (unsigned i = 0; i < num_outputs; i++) {
-      gl_varying_slot slot = outputs[i].slot;
-      unsigned offset = vs_output_param_offset[slot];
-
-      /* Since vs_output_param_offset[] can map multiple varying slots to
-       * the same param export index (that's radeonsi-specific behavior),
-       * we need to do this so as not to emit duplicated exports.
-       */
-      if (exported_params & BITFIELD_BIT(offset))
-         continue;
-
-      nir_def *comp[4];
-      for (unsigned j = 0; j < 4; j++)
-         comp[j] = outputs[i].chan[j] ? outputs[i].chan[j] : undef;
-      nir_store_buffer_amd(b, nir_vec(b, comp, 4), attr_rsrc, voffset, attr_offset, vindex,
-                           .base = offset * 16,
-                           .memory_modes = nir_var_shader_out,
-                           .access = ACCESS_COHERENT | ACCESS_IS_SWIZZLED_AMD);
-      exported_params |= BITFIELD_BIT(offset);
-   }
-
-   nir_pop_if(b, NULL);
-}
-
 static void
 create_output_phis(nir_builder *b, const uint64_t outputs_written, const uint64_t outputs_written_16bit, ac_nir_prerast_out *out)
 {