diff --git a/docs/envvars.rst b/docs/envvars.rst index a61d694aa47..16a83d21cfd 100644 --- a/docs/envvars.rst +++ b/docs/envvars.rst @@ -773,12 +773,6 @@ radeonsi driver environment variables Always use NGG culling even when it can hurt. ``nonggc`` Disable NGG culling. - ``alwayspd`` - Always enable the primitive discard compute shader. - ``pd`` - Enable the primitive discard compute shader for large draw calls. - ``nopd`` - Disable the primitive discard compute shader. ``switch_on_eop`` Program WD/IA to switch on end-of-packet. ``nooutoforder`` diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h index 6b72bfb8003..7b2ef80f5a2 100644 --- a/src/gallium/drivers/radeon/radeon_winsys.h +++ b/src/gallium/drivers/radeon/radeon_winsys.h @@ -80,9 +80,6 @@ enum radeon_bo_flag enum radeon_dependency_flag { - /* Add the dependency to the parallel compute IB only. */ - RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY = 1 << 0, - /* Instead of waiting for a job to finish execution, the dependency will * be signaled when the job starts execution. */ @@ -512,26 +509,6 @@ struct radeon_winsys { struct pipe_fence_handle **fence), void *flush_ctx, bool stop_exec_on_failure); - /** - * Add a parallel compute IB to a gfx IB. It will share the buffer list - * and fence dependencies with the gfx IB. The gfx flush call will submit - * both IBs at the same time. - * - * The compute IB doesn't have an output fence, so the primary IB has - * to use a wait packet for synchronization. - * - * The returned IB is only a stream for writing packets to the new - * IB. The only function that can be used on the compute cs is cs_check_space. - * - * \param compute_cs The returned structure of the command stream. - * \param gfx_cs Gfx IB - * - * \return true on success - */ - bool (*cs_add_parallel_compute_ib)(struct radeon_cmdbuf *compute_cs, - struct radeon_cmdbuf *gfx_cs, - bool uses_gds_ordered_append); - /** * Set up and enable mid command buffer preemption for the command stream. * diff --git a/src/gallium/drivers/radeonsi/meson.build b/src/gallium/drivers/radeonsi/meson.build index 4b734d2b1ef..79af306e29c 100644 --- a/src/gallium/drivers/radeonsi/meson.build +++ b/src/gallium/drivers/radeonsi/meson.build @@ -27,7 +27,6 @@ files_libradeonsi = files( 'si_build_pm4.h', 'si_clear.c', 'si_compute.c', - 'si_compute_prim_discard.c', 'si_compute.h', 'si_compute_blit.c', 'si_cp_dma.c', diff --git a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c deleted file mode 100644 index 67e42801a3f..00000000000 --- a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c +++ /dev/null @@ -1,1072 +0,0 @@ -/* - * Copyright 2019 Advanced Micro Devices, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * on the rights to use, copy, modify, merge, publish, distribute, sub - * license, and/or sell copies of the Software, and to permit persons to whom - * the Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL - * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - */ - -#include "ac_llvm_cull.h" -#include "si_build_pm4.h" -#include "si_pipe.h" -#include "si_shader_internal.h" -#include "sid.h" -#include "util/u_upload_mgr.h" - -/* Based on: - * https://frostbite-wp-prd.s3.amazonaws.com/wp-content/uploads/2016/03/29204330/GDC_2016_Compute.pdf - */ - -/* This file implements primitive culling using asynchronous compute. - * - * It takes a monolithic VS in LLVM IR returning gl_Position and invokes it - * in a compute shader. The shader processes 1 primitive/thread by invoking - * the VS for each vertex to get the positions, decomposes strips - * into triangles (if needed), eliminates primitive restart (if needed), - * does (W<0) culling, face culling, view XY culling, zero-area and - * small-primitive culling, and generates a new index buffer that doesn't - * contain culled primitives. - * - * There is no primitive ordering. The generated index buffer will contain - * primitives in a random order. - * - * IB = a GPU command buffer - * - * Both the compute and gfx IBs run in parallel sort of like CE and DE. - * The gfx IB has a CP barrier (REWIND packet) before a draw packet. REWIND - * doesn't continue if its word isn't 0x80000000. The vertex count is being - * atomically incremented within the draw packet. A CS_DONE event will signal - * the REWIND packet to continue. It's really a direct draw with command - * buffer patching from the compute queue. - * - * The compute IB doesn't have to start when its corresponding gfx IB starts, - * but can start sooner. The compute IB is signaled to start after the last - * execution barrier in the *previous* gfx IB. This is handled as follows. - * The kernel GPU scheduler starts the compute IB after the previous gfx IB has - * started. The compute IB then waits (WAIT_REG_MEM) for a mid-IB fence that - * represents the barrier in the previous gfx IB. - * - * Features: - * - Triangle strips are decomposed into an indexed triangle list. - * The decomposition differs based on the provoking vertex state. - * - W<0 culling (W<0 is behind the viewer, sort of like near Z culling). - * - Back face culling, incl. culling zero-area / degenerate primitives. - * - View XY culling. - * - Small primitive culling for all MSAA modes and all quant modes. - * - * The following are not implemented: - * - ClipVertex/ClipDistance/CullDistance-based culling. - * - Scissor culling. - * - HiZ culling. - * - * Limitations (and unimplemented features that may be possible to implement): - * - Only triangles and triangle strips are supported. - * - Primitive restart is not supported. - * - Instancing is unsupported. - * - Multidraws where the vertex shader reads gl_DrawID are unsupported. - * - No support for tessellation and geometry shaders. - * (patch elimination where tess factors are 0 would be possible to implement) - * - The vertex shader must not contain memory stores. - * - All VS resources must not have a write usage in the command buffer. - * - Bindless textures and images must not occur in the vertex shader. 
- * - * User data SGPR layout: - * VERTEX_COUNTER: address of "count" in the draw packet incremented atomically by the shader. - * START_OUT_INDEX: output index buffer offset / 12 - * START_IN_INDEX: input index buffer offset / index_size - * VS.BASE_VERTEX: same value as VS - * INDEX_BUFFERS: pointer to constants - * 0..3: input index buffer - typed buffer view - * 4..7: output index buffer - typed buffer view - * 8..11: viewport state - scale.xy, translate.xy - * VS.VERTEX_BUFFERS: same value as VS - * VS.CONST_AND_SHADER_BUFFERS: same value as VS - * VS.SAMPLERS_AND_IMAGES: same value as VS - * VS.START_INSTANCE: same value as VS - * SMALL_PRIM_CULLING_PRECISION: Scale the primitive bounding box by this number. - * - * How to test primitive restart (the most complicated part because it needs - * to get the primitive orientation right): - * Set THREADGROUP_SIZE to 2 to exercise both intra-wave and inter-wave - * primitive orientation flips with small draw calls, which is what most tests use. - * You can also enable draw call splitting into draw calls with just 2 primitives. - */ - -/* At least 256 is needed for the fastest wave launch rate from compute queues - * due to hw constraints. Nothing in the code needs more than 1 wave/threadgroup. */ -#define THREADGROUP_SIZE 256 /* high numbers limit available VGPRs */ -#define THREADGROUPS_PER_CU 1 /* TGs to launch on 1 CU before going onto the next, max 8 */ -#define MAX_WAVES_PER_SH 0 /* no limit */ -#define INDEX_STORES_USE_SLC 1 /* don't cache indices if L2 is full */ - -/* Grouping compute dispatches for small draw calls: How many primitives from multiple - * draw calls to process by compute before signaling the gfx IB. This reduces the number - * of EOP events + REWIND packets, because they decrease performance. - * This also determines the granularity of draw-level and packet-level splitting. - */ -#define PRIMS_PER_IB (1024 * 1024) /* size per gfx IB */ -#define PRIMS_PER_BATCH (128 * 1024) /* size between REWIND packets */ - -/* Derived values. */ -#define WAVES_PER_TG DIV_ROUND_UP(THREADGROUP_SIZE, 64) - -#define REWIND_SIGNAL_BIT 0x80000000 - -static LLVMValueRef si_expand_32bit_pointer(struct si_shader_context *ctx, LLVMValueRef ptr); - -void si_initialize_prim_discard_tunables(struct si_screen *sscreen, bool is_aux_context, - unsigned *prim_discard_vertex_count_threshold, - unsigned *index_ring_size_per_ib) -{ - *prim_discard_vertex_count_threshold = UINT_MAX; /* disable */ - - if (sscreen->info.chip_class <= GFX7 || /* SI-CI support is not implemented */ - sscreen->debug_flags & DBG(NO_PD) || is_aux_context) - return; - - /* TODO: enable this */ - bool enable_by_default = false; - - if (sscreen->debug_flags & DBG(ALWAYS_PD) || sscreen->debug_flags & DBG(PD) || - (enable_by_default && sscreen->allow_draw_out_of_order && - sscreen->info.num_se >= 2)) { - *prim_discard_vertex_count_threshold = 6000 * 3; /* 6K triangles */ - - if (sscreen->debug_flags & DBG(ALWAYS_PD)) - *prim_discard_vertex_count_threshold = 0; /* always enable */ - - /* The total size is double this per context. Greater numbers allow bigger gfx IBs. */ - *index_ring_size_per_ib = PRIMS_PER_IB * 12; /* 3 32-bit indices per primitive. 
*/ - } -} - -static LLVMValueRef si_expand_32bit_pointer(struct si_shader_context *ctx, LLVMValueRef ptr) -{ - uint64_t hi = (uint64_t)ctx->screen->info.address32_hi << 32; - ptr = LLVMBuildZExt(ctx->ac.builder, ptr, ctx->ac.i64, ""); - ptr = LLVMBuildOr(ctx->ac.builder, ptr, LLVMConstInt(ctx->ac.i64, hi, 0), ""); - return LLVMBuildIntToPtr(ctx->ac.builder, ptr, - LLVMPointerType(ctx->ac.i32, AC_ADDR_SPACE_GLOBAL), ""); -} - -struct si_thread0_section { - struct si_shader_context *ctx; - LLVMValueRef vgpr_result; /* a VGPR for the value on thread 0. */ - LLVMValueRef saved_exec; -}; - -/* Enter a section that only executes on thread 0. */ -static void si_enter_thread0_section(struct si_shader_context *ctx, - struct si_thread0_section *section, LLVMValueRef thread_id, - LLVMValueRef check_nonzero) -{ - section->ctx = ctx; - section->vgpr_result = ac_build_alloca_undef(&ctx->ac, ctx->ac.i32, "result0"); - - /* This IF has 4 instructions: - * v_and_b32_e32 v, 63, v ; get the thread ID - * v_cmp_eq_u32_e32 vcc, 0, v ; thread ID == 0 - * s_and_saveexec_b64 s, vcc - * s_cbranch_execz BB0_4 - * - * It could just be s_and_saveexec_b64 s, 1. - */ - LLVMValueRef cond = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, thread_id, ctx->ac.i32_0, ""); - if (check_nonzero) { - cond = LLVMBuildAnd(ctx->ac.builder, cond, - LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, check_nonzero, - ctx->ac.i32_0, ""), ""); - } - ac_build_ifcc(&ctx->ac, cond, 12601); -} - -/* Exit a section that only executes on thread 0 and broadcast the result - * to all threads. */ -static void si_exit_thread0_section(struct si_thread0_section *section, LLVMValueRef *result) -{ - struct si_shader_context *ctx = section->ctx; - - LLVMBuildStore(ctx->ac.builder, *result, section->vgpr_result); - - ac_build_endif(&ctx->ac, 12601); - - /* Broadcast the result from thread 0 to all threads. */ - *result = - ac_build_readlane(&ctx->ac, LLVMBuildLoad(ctx->ac.builder, section->vgpr_result, ""), NULL); -} - -static void si_build_primitive_accepted(struct ac_llvm_context *ac, LLVMValueRef accepted, - void *data); - -void si_build_prim_discard_compute_shader(struct si_shader_context *ctx) -{ - struct si_shader_key *key = &ctx->shader->key; - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef vs = ctx->main_fn; - - /* Always inline the VS function. 
*/ - ac_add_function_attr(ctx->ac.context, vs, -1, AC_FUNC_ATTR_ALWAYSINLINE); - LLVMSetLinkage(vs, LLVMPrivateLinkage); - - enum ac_arg_type const_desc_type; - if (ctx->shader->selector->info.base.num_ubos == 1 && - ctx->shader->selector->info.base.num_ssbos == 0) - const_desc_type = AC_ARG_CONST_FLOAT_PTR; - else - const_desc_type = AC_ARG_CONST_DESC_PTR; - - memset(&ctx->args, 0, sizeof(ctx->args)); - - struct ac_arg param_index_buffers_and_constants, param_vertex_counter; - struct ac_arg param_vb_desc, param_const_desc, param_start_out_index; - struct ac_arg param_base_vertex, param_start_instance, param_start_in_index; - struct ac_arg param_block_id, param_local_id, param_smallprim_precision; - struct ac_arg param_sampler_desc; - - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_vertex_counter); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_start_out_index); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_start_in_index); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_base_vertex); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, ¶m_index_buffers_and_constants); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_DESC_PTR, ¶m_vb_desc); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, const_desc_type, ¶m_const_desc); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_CONST_IMAGE_PTR, ¶m_sampler_desc); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_start_instance); - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_FLOAT, ¶m_smallprim_precision); - - /* Block ID and thread ID inputs. */ - ac_add_arg(&ctx->args, AC_ARG_SGPR, 1, AC_ARG_INT, ¶m_block_id); - ac_add_arg(&ctx->args, AC_ARG_VGPR, 1, AC_ARG_INT, ¶m_local_id); - - /* Create the compute shader function. */ - gl_shader_stage old_stage = ctx->stage; - ctx->stage = MESA_SHADER_COMPUTE; - si_llvm_create_func(ctx, "prim_discard_cs", NULL, 0, THREADGROUP_SIZE); - ctx->stage = old_stage; - - /* Assemble parameters for VS. */ - LLVMValueRef vs_params[16]; - unsigned num_vs_params = 0; - unsigned param_vertex_id; - - vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 0))); /* INTERNAL RESOURCES */ - vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 1))); /* BINDLESS */ - vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_const_desc); - vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_sampler_desc); - vs_params[num_vs_params++] = - LLVMConstInt(ctx->ac.i32, S_VS_STATE_INDEXED(key->opt.cs_indexed), 0); - vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_base_vertex); - vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_start_instance); - vs_params[num_vs_params++] = ctx->ac.i32_0; /* DrawID */ - vs_params[num_vs_params++] = ac_get_arg(&ctx->ac, param_vb_desc); - - vs_params[(param_vertex_id = num_vs_params++)] = NULL; /* VertexID */ - vs_params[num_vs_params++] = ctx->ac.i32_0; /* InstanceID */ - vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused (PrimID) */ - vs_params[num_vs_params++] = ctx->ac.i32_0; /* unused */ - - assert(num_vs_params <= ARRAY_SIZE(vs_params)); - assert(num_vs_params == LLVMCountParamTypes(LLVMGetElementType(LLVMTypeOf(vs)))); - - /* Load descriptors. 
(load 8 dwords at once) */ - LLVMValueRef input_indexbuf, output_indexbuf, tmp, desc[8]; - - LLVMValueRef index_buffers_and_constants = - ac_get_arg(&ctx->ac, param_index_buffers_and_constants); - tmp = LLVMBuildPointerCast(builder, index_buffers_and_constants, - ac_array_in_const32_addr_space(ctx->ac.v8i32), ""); - tmp = ac_build_load_to_sgpr(&ctx->ac, tmp, ctx->ac.i32_0); - - for (unsigned i = 0; i < 8; i++) - desc[i] = ac_llvm_extract_elem(&ctx->ac, tmp, i); - - input_indexbuf = ac_build_gather_values(&ctx->ac, desc, 4); - output_indexbuf = ac_build_gather_values(&ctx->ac, desc + 4, 4); - - /* Compute PrimID. */ - LLVMValueRef global_thread_id = ac_build_imad(&ctx->ac, ac_get_arg(&ctx->ac, param_block_id), - LLVMConstInt(ctx->ac.i32, THREADGROUP_SIZE, 0), - ac_get_arg(&ctx->ac, param_local_id)); - LLVMValueRef prim_id = global_thread_id; - - /* Generate indices (like a non-indexed draw call). */ - LLVMValueRef index[4] = {NULL, NULL, NULL, LLVMGetUndef(ctx->ac.i32)}; - unsigned vertices_per_prim = 3; - - switch (key->opt.cs_prim_type) { - case PIPE_PRIM_TRIANGLES: - for (unsigned i = 0; i < 3; i++) { - index[i] = ac_build_imad(&ctx->ac, prim_id, LLVMConstInt(ctx->ac.i32, 3, 0), - LLVMConstInt(ctx->ac.i32, i, 0)); - } - break; - case PIPE_PRIM_TRIANGLE_STRIP: - for (unsigned i = 0; i < 3; i++) { - index[i] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->ac.i32, i, 0), ""); - } - break; - default: - unreachable("unexpected primitive type"); - } - - /* Fetch indices. */ - if (key->opt.cs_indexed) { - for (unsigned i = 0; i < 3; i++) { - index[i] = LLVMBuildAdd(builder, index[i], ac_get_arg(&ctx->ac, param_start_in_index), ""); - index[i] = ac_build_buffer_load_format(&ctx->ac, input_indexbuf, index[i], ctx->ac.i32_0, - 1, 0, true, false, false); - index[i] = ac_to_integer(&ctx->ac, index[i]); - } - } - - LLVMValueRef thread_id = LLVMBuildAnd(builder, ac_get_arg(&ctx->ac, param_local_id), - LLVMConstInt(ctx->ac.i32, 63, 0), ""); - - /* Every other triangle in a strip has a reversed vertex order, so we - * need to swap vertices of odd primitives to get the correct primitive - * orientation when converting triangle strips to triangles. Primitive - * restart complicates it, because a strip can start anywhere. - */ - LLVMValueRef prim_restart_accepted = ctx->ac.i1true; - LLVMValueRef vertex_counter = ac_get_arg(&ctx->ac, param_vertex_counter); - - if (key->opt.cs_prim_type == PIPE_PRIM_TRIANGLE_STRIP) { - /* Without primitive restart, odd primitives have reversed orientation. - * Only primitive restart can flip it with respect to the first vertex - * of the draw call. - */ - /* prim_is_odd = current_is_odd % 2. */ - LLVMValueRef prim_is_odd = LLVMBuildXor( - builder, ctx->ac.i1false, LLVMBuildTrunc(builder, thread_id, ctx->ac.i1, ""), ""); - - /* Convert triangle strip indices to triangle indices. */ - ac_build_triangle_strip_indices_to_triangle( - &ctx->ac, prim_is_odd, LLVMConstInt(ctx->ac.i1, key->opt.cs_provoking_vertex_first, 0), - index); - } - - /* Execute the vertex shader for each vertex to get vertex positions. */ - LLVMValueRef pos[3][4]; - for (unsigned i = 0; i < vertices_per_prim; i++) { - vs_params[param_vertex_id] = index[i]; - - LLVMValueRef ret = ac_build_call(&ctx->ac, vs, vs_params, num_vs_params); - for (unsigned chan = 0; chan < 4; chan++) - pos[i][chan] = LLVMBuildExtractValue(builder, ret, chan, ""); - } - - /* Divide XYZ by W. 
*/ - for (unsigned i = 0; i < vertices_per_prim; i++) { - for (unsigned chan = 0; chan < 3; chan++) - pos[i][chan] = ac_build_fdiv(&ctx->ac, pos[i][chan], pos[i][3]); - } - - /* Load the viewport state. */ - LLVMValueRef vp = ac_build_load_invariant(&ctx->ac, index_buffers_and_constants, - LLVMConstInt(ctx->ac.i32, 2, 0)); - vp = LLVMBuildBitCast(builder, vp, ctx->ac.v4f32, ""); - LLVMValueRef vp_scale[2], vp_translate[2]; - vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0); - vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1); - vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2); - vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3); - - /* Do culling. */ - struct ac_cull_options options = {}; - options.cull_front = key->opt.cs_cull_front; - options.cull_back = key->opt.cs_cull_back; - options.cull_view_xy = true; - options.cull_small_prims = true; - options.cull_zero_area = true; - options.cull_w = true; - - LLVMValueRef params[] = { - vertex_counter, - output_indexbuf, - (void*)index, - ac_get_arg(&ctx->ac, param_start_out_index), - }; - - ac_cull_triangle(&ctx->ac, pos, prim_restart_accepted, vp_scale, vp_translate, - ac_get_arg(&ctx->ac, param_smallprim_precision), &options, - si_build_primitive_accepted, params); - LLVMBuildRetVoid(builder); -} - -static void si_build_primitive_accepted(struct ac_llvm_context *ac, LLVMValueRef accepted, - void *userdata) -{ - struct si_shader_context *ctx = container_of(ac, struct si_shader_context, ac); - LLVMBuilderRef builder = ctx->ac.builder; - unsigned vertices_per_prim = 3; - LLVMValueRef *params = (LLVMValueRef *)userdata; - LLVMValueRef vertex_counter = params[0]; - LLVMValueRef output_indexbuf = params[1]; - LLVMValueRef *index = (LLVMValueRef *)params[2]; - LLVMValueRef start_out_index = params[3]; - - LLVMValueRef accepted_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, accepted); - - ac_build_ifcc(&ctx->ac, accepted, 16607); - - /* Count the number of active threads by doing bitcount(accepted). */ - LLVMValueRef num_prims_accepted = ac_build_bit_count(&ctx->ac, accepted_threadmask); - num_prims_accepted = LLVMBuildTrunc(builder, num_prims_accepted, ctx->ac.i32, ""); - - /* Get the number of bits set before the index of this thread. */ - LLVMValueRef prim_index = ac_build_mbcnt(&ctx->ac, accepted_threadmask); - LLVMValueRef start; - - /* Execute atomic_add on the vertex count. */ - struct si_thread0_section section; - si_enter_thread0_section(ctx, §ion, prim_index, num_prims_accepted); - { - LLVMValueRef num_indices = LLVMBuildMul( - builder, num_prims_accepted, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), ""); - vertex_counter = si_expand_32bit_pointer(ctx, vertex_counter); - start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, vertex_counter, num_indices, - LLVMAtomicOrderingMonotonic, false); - } - si_exit_thread0_section(§ion, &start); - - /* Convert it into the primitive index. */ - start = LLVMBuildUDiv(builder, start, LLVMConstInt(ctx->ac.i32, vertices_per_prim, 0), ""); - - /* Now we need to store the indices of accepted primitives into - * the output index buffer. - */ - - /* Write indices for accepted primitives. 
*/ - LLVMValueRef vindex = LLVMBuildAdd(builder, start, prim_index, ""); - vindex = LLVMBuildAdd(builder, vindex, start_out_index, ""); - LLVMValueRef vdata = ac_build_gather_values(&ctx->ac, index, 3); - - if (!ac_has_vec3_support(ctx->ac.chip_class, true)) - vdata = ac_build_expand_to_vec4(&ctx->ac, vdata, 3); - - ac_build_buffer_store_format(&ctx->ac, output_indexbuf, vdata, vindex, ctx->ac.i32_0, - ac_glc | (INDEX_STORES_USE_SLC ? ac_slc : 0)); - ac_build_endif(&ctx->ac, 16607); -} - -/* Return false if the shader isn't ready. */ -static bool si_shader_select_prim_discard_cs(struct si_context *sctx, - const struct pipe_draw_info *info) -{ - struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; - struct si_shader_key key; - - memset(&key, 0, sizeof(key)); - si_shader_selector_key_vs(sctx, sctx->shader.vs.cso, &key, &key.part.vs.prolog); - assert(!key.part.vs.prolog.instance_divisor_is_fetched); - - key.opt.vs_as_prim_discard_cs = 1; - key.opt.cs_prim_type = info->mode; - key.opt.cs_indexed = info->index_size != 0; - key.opt.cs_provoking_vertex_first = rs->provoking_vertex_first; - - if (rs->rasterizer_discard) { - /* Just for performance testing and analysis of trivial bottlenecks. - * This should result in a very short compute shader. */ - key.opt.cs_cull_front = 1; - key.opt.cs_cull_back = 1; - } else { - key.opt.cs_cull_front = sctx->viewport0_y_inverted ? rs->cull_back : rs->cull_front; - key.opt.cs_cull_back = sctx->viewport0_y_inverted ? rs->cull_front : rs->cull_back; - } - - sctx->cs_prim_discard_state.cso = sctx->shader.vs.cso; - sctx->cs_prim_discard_state.current = NULL; - - if (!sctx->compiler.passes) - si_init_compiler(sctx->screen, &sctx->compiler); - - struct si_compiler_ctx_state compiler_state; - compiler_state.compiler = &sctx->compiler; - compiler_state.debug = sctx->debug; - compiler_state.is_debug_context = sctx->is_debug; - - return si_shader_select_with_key(sctx->screen, &sctx->cs_prim_discard_state, &compiler_state, - &key, -1, true) == 0 && - /* Disallow compute shaders using the scratch buffer. */ - sctx->cs_prim_discard_state.current->config.scratch_bytes_per_wave == 0; -} - -static bool si_initialize_prim_discard_cmdbuf(struct si_context *sctx) -{ - if (sctx->index_ring) - return true; - - if (!sctx->prim_discard_compute_cs.priv) { - struct radeon_winsys *ws = sctx->ws; - - if (!ws->cs_add_parallel_compute_ib(&sctx->prim_discard_compute_cs, - &sctx->gfx_cs, false)) - return false; - } - - if (!sctx->index_ring) { - sctx->index_ring = si_aligned_buffer_create( - sctx->b.screen, SI_RESOURCE_FLAG_UNMAPPABLE | SI_RESOURCE_FLAG_DRIVER_INTERNAL, - PIPE_USAGE_DEFAULT, - sctx->index_ring_size_per_ib * 2, sctx->screen->info.pte_fragment_size); - if (!sctx->index_ring) - return false; - } - return true; -} - -static bool si_check_ring_space(struct si_context *sctx, unsigned out_indexbuf_size) -{ - return sctx->index_ring_offset + - align(out_indexbuf_size, sctx->screen->info.tcc_cache_line_size) <= - sctx->index_ring_size_per_ib; -} - -#define COMPUTE_PREAMBLE_SIZE (8 + 39 + 11 + 7) - -enum si_prim_discard_outcome -si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe_draw_info *info, - unsigned drawid_offset, - const struct pipe_draw_start_count_bias *draws, - unsigned num_draws, unsigned total_count) -{ - /* If the compute shader compilation isn't finished, this returns false. 
*/ - if (!si_shader_select_prim_discard_cs(sctx, info)) - return SI_PRIM_DISCARD_DISABLED; - - if (!si_initialize_prim_discard_cmdbuf(sctx)) - return SI_PRIM_DISCARD_DISABLED; - - struct radeon_cmdbuf *gfx_cs = &sctx->gfx_cs; - unsigned prim = info->mode; - - unsigned num_prims; - if (prim == PIPE_PRIM_TRIANGLES) - num_prims = total_count / 3; - else if (prim == PIPE_PRIM_TRIANGLE_STRIP) - num_prims = total_count - 2; /* approximation ignoring multi draws */ - else - unreachable("shouldn't get here"); - - unsigned out_indexbuf_size = num_prims * 12; - bool ring_full = !si_check_ring_space(sctx, out_indexbuf_size); - - /* Split draws at the draw call level if the ring is full. This makes - * better use of the ring space. - */ - if (ring_full && num_prims > PRIMS_PER_BATCH) { - unsigned vert_count_per_subdraw = 0; - - if (prim == PIPE_PRIM_TRIANGLES) - vert_count_per_subdraw = PRIMS_PER_BATCH * 3; - else if (prim == PIPE_PRIM_TRIANGLE_STRIP) - vert_count_per_subdraw = PRIMS_PER_BATCH; - - /* Split multi draws first. */ - if (num_draws > 1) { - unsigned count = 0; - unsigned first_draw = 0; - unsigned num_draws_split = 0; - - for (unsigned i = 0; i < num_draws; i++) { - if (count && count + draws[i].count > vert_count_per_subdraw) { - /* Submit previous draws. */ - sctx->b.draw_vbo(&sctx->b, info, drawid_offset, NULL, draws + first_draw, num_draws_split); - count = 0; - first_draw = i; - num_draws_split = 0; - } - - if (draws[i].count > vert_count_per_subdraw) { - /* Submit just 1 draw. It will be split. */ - sctx->b.draw_vbo(&sctx->b, info, drawid_offset, NULL, draws + i, 1); - assert(count == 0); - assert(first_draw == i); - assert(num_draws_split == 0); - first_draw = i + 1; - continue; - } - - count += draws[i].count; - num_draws_split++; - } - - if (count) { - /* Submit the remaining draws. */ - assert(num_draws_split > 0); - sctx->b.draw_vbo(&sctx->b, info, drawid_offset, NULL, draws + first_draw, num_draws_split); - } - return SI_PRIM_DISCARD_MULTI_DRAW_SPLIT; - } - - /* Split single draws if splitting multi draws isn't enough. */ - struct pipe_draw_info split_draw = *info; - struct pipe_draw_start_count_bias split_draw_range = draws[0]; - unsigned base_start = split_draw_range.start; - unsigned count = draws[0].count; - - if (prim == PIPE_PRIM_TRIANGLES) { - assert(vert_count_per_subdraw < count); - - for (unsigned start = 0; start < count; start += vert_count_per_subdraw) { - split_draw_range.start = base_start + start; - split_draw_range.count = MIN2(count - start, vert_count_per_subdraw); - - sctx->b.draw_vbo(&sctx->b, &split_draw, drawid_offset, NULL, &split_draw_range, 1); - } - } else if (prim == PIPE_PRIM_TRIANGLE_STRIP) { - /* No primitive pair can be split, because strips reverse orientation - * for odd primitives. */ - STATIC_ASSERT(PRIMS_PER_BATCH % 2 == 0); - - for (unsigned start = 0; start < count - 2; start += vert_count_per_subdraw) { - split_draw_range.start = base_start + start; - split_draw_range.count = MIN2(count - start, vert_count_per_subdraw + 2); - - sctx->b.draw_vbo(&sctx->b, &split_draw, drawid_offset, NULL, &split_draw_range, 1); - } - } - - return SI_PRIM_DISCARD_DRAW_SPLIT; - } - - /* Just quit if the draw call doesn't fit into the ring and can't be split. */ - if (out_indexbuf_size > sctx->index_ring_size_per_ib) { - if (SI_PRIM_DISCARD_DEBUG) - puts("PD failed: draw call too big, can't be split"); - return SI_PRIM_DISCARD_DISABLED; - } - - /* Compute how many CS dwords we need to reserve. 
*/ - unsigned need_compute_dw = COMPUTE_PREAMBLE_SIZE + - 11 /* shader */ + - 30; /* leave some space at the end */ - unsigned need_gfx_dw = si_get_minimum_num_gfx_cs_dwords(sctx, 0); - - for (unsigned i = 0; i < num_draws; i++) { - unsigned num_subdraws = DIV_ROUND_UP(draws[i].count, PRIMS_PER_BATCH); - - need_compute_dw += 8 * num_subdraws + /* signal REWIND */ - 14 /* user SGPRs */ + - 4 * (num_subdraws - 1) + /* user SGPRs after the first subdraw */ - 11 * num_subdraws; - need_gfx_dw += num_subdraws * 8; /* use REWIND(2) + DRAW(6) */ - } - - if (ring_full || - !sctx->ws->cs_check_space(gfx_cs, need_gfx_dw, false)) { - /* If the current IB is empty but the size is too small, add a NOP - * packet to force a flush and get a bigger IB. - */ - if (!radeon_emitted(gfx_cs, sctx->initial_gfx_cs_size) && - gfx_cs->current.cdw + need_gfx_dw > gfx_cs->current.max_dw) { - radeon_begin(gfx_cs); - radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(gfx_cs, 0); - radeon_end(); - } - - si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); - } - - /* The compute IB is always chained, but we need to call cs_check_space to add more space. */ - struct radeon_cmdbuf *cs = &sctx->prim_discard_compute_cs; - ASSERTED bool compute_has_space = sctx->ws->cs_check_space(cs, need_compute_dw, false); - assert(compute_has_space); - assert(si_check_ring_space(sctx, out_indexbuf_size)); - assert(cs->current.cdw + need_compute_dw <= cs->current.max_dw); - return SI_PRIM_DISCARD_ENABLED; -} - -void si_compute_signal_gfx(struct si_context *sctx) -{ - struct radeon_cmdbuf *cs = &sctx->prim_discard_compute_cs; - unsigned writeback_L2_flags = 0; - - /* GFX8 needs to flush L2 for CP to see the updated vertex count. */ - if (sctx->chip_class == GFX8) - writeback_L2_flags = EVENT_TC_WB_ACTION_ENA | EVENT_TC_NC_ACTION_ENA; - - if (!sctx->compute_num_prims_in_batch) - return; - - assert(sctx->compute_rewind_va); - - /* After the queued dispatches are done and vertex counts are written to - * the gfx IB, signal the gfx IB to continue. CP doesn't wait for - * the dispatches to finish, it only adds the CS_DONE event into the event - * queue. - */ - si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, writeback_L2_flags, - sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2, - writeback_L2_flags ? EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM : EOP_INT_SEL_NONE, - EOP_DATA_SEL_VALUE_32BIT, NULL, - sctx->compute_rewind_va | ((uint64_t)sctx->screen->info.address32_hi << 32), - REWIND_SIGNAL_BIT, /* signaling value for the REWIND packet */ - SI_NOT_QUERY); - - sctx->compute_rewind_va = 0; - sctx->compute_num_prims_in_batch = 0; -} - -/* Dispatch a primitive discard compute shader. 
*/ -void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx, - const struct pipe_draw_info *info, - const struct pipe_draw_start_count_bias *draws, - unsigned num_draws, unsigned index_size, - unsigned total_count, uint64_t input_indexbuf_va, - unsigned index_max_size) -{ - struct radeon_cmdbuf *gfx_cs = &sctx->gfx_cs; - struct radeon_cmdbuf *cs = &sctx->prim_discard_compute_cs; - unsigned num_total_prims; - unsigned vertices_per_prim, output_indexbuf_format, gfx10_output_indexbuf_format; - - if (!info->instance_count) - return; - - switch (info->mode) { - case PIPE_PRIM_TRIANGLES: - case PIPE_PRIM_TRIANGLE_STRIP: - if (info->mode == PIPE_PRIM_TRIANGLES) - num_total_prims = total_count / 3; - else if (total_count >= 2) - num_total_prims = total_count - 2; /* tri strip approximation ignoring multi draws */ - else - num_total_prims = 0; - - vertices_per_prim = 3; - output_indexbuf_format = V_008F0C_BUF_DATA_FORMAT_32_32_32; - gfx10_output_indexbuf_format = V_008F0C_GFX10_FORMAT_32_32_32_UINT; - break; - default: - unreachable("unsupported primitive type"); - return; - } - - if (!num_total_prims) - return; - - unsigned out_indexbuf_offset; - uint64_t output_indexbuf_size = num_total_prims * vertices_per_prim * 4; - - /* Initialize the compute IB if it's empty. */ - if (!sctx->prim_discard_compute_ib_initialized) { - /* 1) State initialization. */ - sctx->compute_ib_last_shader = NULL; - - if (sctx->last_ib_barrier_fence) { - assert(!sctx->last_ib_barrier_buf); - sctx->ws->cs_add_fence_dependency(gfx_cs, sctx->last_ib_barrier_fence, - RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY); - } - - /* 2) IB initialization. */ - - /* This needs to be done at the beginning of IBs due to possible - * TTM buffer moves in the kernel. - */ - if (sctx->chip_class >= GFX10) { /* 8 DW */ - radeon_begin(cs); - radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 6, 0)); - radeon_emit(cs, 0); /* CP_COHER_CNTL */ - radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */ - radeon_emit(cs, 0xffffff); /* CP_COHER_SIZE_HI */ - radeon_emit(cs, 0); /* CP_COHER_BASE */ - radeon_emit(cs, 0); /* CP_COHER_BASE_HI */ - radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */ - radeon_emit(cs, /* GCR_CNTL */ - S_586_GLI_INV(V_586_GLI_ALL) | S_586_GLK_INV(1) | S_586_GLV_INV(1) | - S_586_GL1_INV(1) | S_586_GL2_INV(1) | S_586_GL2_WB(1) | S_586_GLM_INV(1) | - S_586_GLM_WB(1) | S_586_SEQ(V_586_SEQ_FORWARD)); - radeon_end(); - } else { - si_emit_surface_sync(sctx, cs, - S_0085F0_TC_ACTION_ENA(1) | S_0085F0_TCL1_ACTION_ENA(1) | - S_0301F0_TC_WB_ACTION_ENA(sctx->chip_class >= GFX8) | - S_0085F0_SH_ICACHE_ACTION_ENA(1) | - S_0085F0_SH_KCACHE_ACTION_ENA(1)); - } - - si_emit_initial_compute_regs(sctx, cs); /* 39 DW */ - - radeon_begin(cs); /* 11 DW */ - radeon_set_sh_reg( - cs, R_00B860_COMPUTE_TMPRING_SIZE, - S_00B860_WAVES(sctx->scratch_waves) | S_00B860_WAVESIZE(0)); /* no scratch */ - - /* Only 1D grids are launched. 
*/ - radeon_set_sh_reg_seq(cs, R_00B820_COMPUTE_NUM_THREAD_Y, 2); - radeon_emit(cs, S_00B820_NUM_THREAD_FULL(1) | S_00B820_NUM_THREAD_PARTIAL(1)); - radeon_emit(cs, S_00B824_NUM_THREAD_FULL(1) | S_00B824_NUM_THREAD_PARTIAL(1)); - - radeon_set_sh_reg_seq(cs, R_00B814_COMPUTE_START_Y, 2); - radeon_emit(cs, 0); - radeon_emit(cs, 0); - radeon_end(); - - if (sctx->last_ib_barrier_buf) { - assert(!sctx->last_ib_barrier_fence); - radeon_add_to_buffer_list(sctx, gfx_cs, sctx->last_ib_barrier_buf, RADEON_USAGE_READ, - RADEON_PRIO_FENCE); - si_cp_wait_mem(sctx, cs, /* 7 DW */ - sctx->last_ib_barrier_buf->gpu_address + sctx->last_ib_barrier_buf_offset, - 1, 1, WAIT_REG_MEM_EQUAL); - } - - sctx->prim_discard_compute_ib_initialized = true; - assert(cs->current.cdw <= COMPUTE_PREAMBLE_SIZE); - } - - /* Allocate the output index buffer. */ - output_indexbuf_size = align(output_indexbuf_size, sctx->screen->info.tcc_cache_line_size); - assert(sctx->index_ring_offset + output_indexbuf_size <= sctx->index_ring_size_per_ib); - out_indexbuf_offset = sctx->index_ring_base + sctx->index_ring_offset; - sctx->index_ring_offset += output_indexbuf_size; - - radeon_add_to_buffer_list(sctx, gfx_cs, sctx->index_ring, RADEON_USAGE_READWRITE, - RADEON_PRIO_SHADER_RW_BUFFER); - uint64_t out_indexbuf_va = sctx->index_ring->gpu_address + out_indexbuf_offset; - - /* Prepare index buffer descriptors. */ - struct si_resource *indexbuf_desc = NULL; - unsigned indexbuf_desc_offset; - unsigned desc_size = 12 * 4; - uint32_t *desc; - - u_upload_alloc(sctx->b.const_uploader, 0, desc_size, si_optimal_tcc_alignment(sctx, desc_size), - &indexbuf_desc_offset, (struct pipe_resource **)&indexbuf_desc, (void **)&desc); - radeon_add_to_buffer_list(sctx, gfx_cs, indexbuf_desc, RADEON_USAGE_READ, - RADEON_PRIO_DESCRIPTORS); - - /* Input index buffer. */ - desc[0] = input_indexbuf_va; - desc[1] = S_008F04_BASE_ADDRESS_HI(input_indexbuf_va >> 32) | S_008F04_STRIDE(index_size); - desc[2] = index_max_size * (sctx->chip_class == GFX8 ? index_size : 1); - - if (sctx->chip_class >= GFX10) { - desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | - S_008F0C_FORMAT(index_size == 1 ? V_008F0C_GFX10_FORMAT_8_UINT - : index_size == 2 ? V_008F0C_GFX10_FORMAT_16_UINT - : V_008F0C_GFX10_FORMAT_32_UINT) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) | - S_008F0C_RESOURCE_LEVEL(1); - } else { - desc[3] = - S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | - S_008F0C_DATA_FORMAT(index_size == 1 ? V_008F0C_BUF_DATA_FORMAT_8 - : index_size == 2 ? V_008F0C_BUF_DATA_FORMAT_16 - : V_008F0C_BUF_DATA_FORMAT_32); - } - - /* Output index buffer. */ - desc[4] = out_indexbuf_va; - desc[5] = - S_008F04_BASE_ADDRESS_HI(out_indexbuf_va >> 32) | S_008F04_STRIDE(vertices_per_prim * 4); - desc[6] = num_total_prims * (sctx->chip_class == GFX8 ? 
vertices_per_prim * 4 : 1); - - if (sctx->chip_class >= GFX10) { - desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) | - S_008F0C_FORMAT(gfx10_output_indexbuf_format) | - S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) | - S_008F0C_RESOURCE_LEVEL(1); - } else { - desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) | - S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | - S_008F0C_DATA_FORMAT(output_indexbuf_format); - } - - /* Viewport state. */ - struct si_small_prim_cull_info cull_info; - si_get_small_prim_cull_info(sctx, &cull_info); - - desc[8] = fui(cull_info.scale[0]); - desc[9] = fui(cull_info.scale[1]); - desc[10] = fui(cull_info.translate[0]); - desc[11] = fui(cull_info.translate[1]); - - /* Set user data SGPRs. */ - /* This can't be >= 16 if we want the fastest launch rate. */ - unsigned user_sgprs = 10; - - uint64_t index_buffers_va = indexbuf_desc->gpu_address + indexbuf_desc_offset; - unsigned vs_const_desc = si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX); - unsigned vs_sampler_desc = si_sampler_and_image_descriptors_idx(PIPE_SHADER_VERTEX); - uint64_t vs_const_desc_va = sctx->descriptors[vs_const_desc].gpu_address; - uint64_t vs_sampler_desc_va = sctx->descriptors[vs_sampler_desc].gpu_address; - uint64_t vb_desc_va = sctx->vb_descriptors_buffer - ? sctx->vb_descriptors_buffer->gpu_address + sctx->vb_descriptors_offset - : 0; - si_resource_reference(&indexbuf_desc, NULL); - - /* Set the compute shader. */ - struct si_shader *shader = sctx->cs_prim_discard_state.current; - - if (shader != sctx->compute_ib_last_shader) { - radeon_add_to_buffer_list(sctx, gfx_cs, shader->bo, RADEON_USAGE_READ, - RADEON_PRIO_SHADER_BINARY); - uint64_t shader_va = shader->bo->gpu_address; - - assert(shader->config.scratch_bytes_per_wave == 0); - assert(shader->config.num_vgprs * WAVES_PER_TG <= 256 * 4); - - radeon_begin(cs); - radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2); - radeon_emit(cs, shader_va >> 8); - radeon_emit(cs, S_00B834_DATA(shader_va >> 40)); - - radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2); - radeon_emit( - cs, S_00B848_VGPRS((shader->config.num_vgprs - 1) / 4) | - S_00B848_SGPRS(sctx->chip_class <= GFX9 ? (shader->config.num_sgprs - 1) / 8 : 0) | - S_00B848_FLOAT_MODE(shader->config.float_mode) | S_00B848_DX10_CLAMP(1) | - S_00B848_MEM_ORDERED(sctx->chip_class >= GFX10) | - S_00B848_WGP_MODE(sctx->chip_class >= GFX10)); - radeon_emit(cs, S_00B84C_SCRATCH_EN(0 /* no scratch */) | S_00B84C_USER_SGPR(user_sgprs) | - S_00B84C_TGID_X_EN(1 /* only blockID.x is used */) | - S_00B84C_TIDIG_COMP_CNT(0 /* only threadID.x is used */) | - S_00B84C_LDS_SIZE(shader->config.lds_size)); - - radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, - ac_get_compute_resource_limits(&sctx->screen->info, WAVES_PER_TG, - MAX_WAVES_PER_SH, THREADGROUPS_PER_CU)); - radeon_end(); - sctx->compute_ib_last_shader = shader; - } - - STATIC_ASSERT(PRIMS_PER_BATCH % THREADGROUP_SIZE == 0); - - for (unsigned i = 0; i < num_draws; i++) { - unsigned count = draws[i].count; - unsigned num_prims; - - /* Determine the number of primitives per draw. 
*/ - if (info->mode == PIPE_PRIM_TRIANGLES) - num_prims = count / 3; - else if (count >= 2) - num_prims = count - 2; - else - num_prims = 0; - - if (!num_prims) - continue; - - /* Big draw calls are split into smaller dispatches and draw packets. */ - for (unsigned start_prim = 0; start_prim < num_prims; start_prim += PRIMS_PER_BATCH) { - unsigned num_subdraw_prims; - - if (start_prim + PRIMS_PER_BATCH < num_prims) { - num_subdraw_prims = PRIMS_PER_BATCH; - } else { - num_subdraw_prims = num_prims - start_prim; - } - - /* Small dispatches are executed back to back until a specific primitive - * count is reached. Then, a CS_DONE is inserted to signal the gfx IB - * to start drawing the batch. This batching adds latency to the gfx IB, - * but CS_DONE and REWIND are too slow. - */ - if (sctx->compute_num_prims_in_batch + num_subdraw_prims > PRIMS_PER_BATCH) - si_compute_signal_gfx(sctx); - - if (sctx->compute_num_prims_in_batch == 0) { - assert((gfx_cs->gpu_address >> 32) == sctx->screen->info.address32_hi); - sctx->compute_rewind_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 1) * 4; - - radeon_begin(gfx_cs); - radeon_emit(gfx_cs, PKT3(PKT3_REWIND, 0, 0)); - radeon_emit(gfx_cs, 0); - radeon_end(); - } - - sctx->compute_num_prims_in_batch += num_subdraw_prims; - - uint32_t count_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 4) * 4; - uint64_t index_va = out_indexbuf_va + start_prim * 12; - - /* Emit the draw packet into the gfx IB. */ - radeon_begin(gfx_cs); - radeon_emit(gfx_cs, PKT3(PKT3_DRAW_INDEX_2, 4, 0)); - radeon_emit(gfx_cs, num_subdraw_prims * vertices_per_prim); - radeon_emit(gfx_cs, index_va); - radeon_emit(gfx_cs, index_va >> 32); - radeon_emit(gfx_cs, 0); - radeon_emit(gfx_cs, V_0287F0_DI_SRC_SEL_DMA); - radeon_end(); - - radeon_begin_again(cs); - - /* Continue with the compute IB. */ - if (start_prim == 0) { - if (i == 0) { - /* First draw. */ - radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, user_sgprs); - radeon_emit(cs, count_va); - radeon_emit(cs, start_prim); - radeon_emit(cs, draws[i].start); - radeon_emit(cs, index_size ? draws[i].index_bias : draws[i].start); - radeon_emit(cs, index_buffers_va); - radeon_emit(cs, vb_desc_va); - radeon_emit(cs, vs_const_desc_va); - radeon_emit(cs, vs_sampler_desc_va); - radeon_emit(cs, info->start_instance); - /* small-prim culling precision (same as rasterizer precision = QUANT_MODE) */ - radeon_emit(cs, fui(cull_info.small_prim_precision)); - } else { - /* Subsequent draws. */ - radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 4); - radeon_emit(cs, count_va); - radeon_emit(cs, 0); - radeon_emit(cs, draws[i].start); - radeon_emit(cs, index_size ? draws[i].index_bias : draws[i].start); - } - } else { - /* Draw split. Only update the SGPRs that changed. */ - radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 2); - radeon_emit(cs, count_va); - radeon_emit(cs, start_prim); - } - - /* Set grid dimensions. 
*/ - unsigned start_block = start_prim / THREADGROUP_SIZE; - unsigned num_full_blocks = num_subdraw_prims / THREADGROUP_SIZE; - unsigned partial_block_size = num_subdraw_prims % THREADGROUP_SIZE; - - radeon_set_sh_reg(cs, R_00B810_COMPUTE_START_X, start_block); - radeon_set_sh_reg(cs, R_00B81C_COMPUTE_NUM_THREAD_X, - S_00B81C_NUM_THREAD_FULL(THREADGROUP_SIZE) | - S_00B81C_NUM_THREAD_PARTIAL(partial_block_size)); - - radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) | PKT3_SHADER_TYPE_S(1)); - radeon_emit(cs, start_block + num_full_blocks + !!partial_block_size); - radeon_emit(cs, 1); - radeon_emit(cs, 1); - radeon_emit(cs, S_00B800_COMPUTE_SHADER_EN(1) | S_00B800_PARTIAL_TG_EN(!!partial_block_size) | - S_00B800_ORDER_MODE(0 /* launch in order */)); - radeon_end(); - - assert(cs->current.cdw <= cs->current.max_dw); - assert(gfx_cs->current.cdw <= gfx_cs->current.max_dw); - } - } -} diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c index b7aece56463..e0e0a669341 100644 --- a/src/gallium/drivers/radeonsi/si_cp_dma.c +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -230,10 +230,8 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs, sdst->TC_L2_dirty = true; /* If it's not a framebuffer fast clear... */ - if (coher == SI_COHERENCY_SHADER) { + if (coher == SI_COHERENCY_SHADER) sctx->num_cp_dma_calls++; - si_prim_discard_signal_next_compute_ib_start(sctx); - } } /** @@ -387,10 +385,8 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, si_resource(dst)->TC_L2_dirty = true; /* If it's not a prefetch or GDS copy... */ - if (dst && src && (dst != src || dst_offset != src_offset)) { + if (dst && src && (dst != src || dst_offset != src_offset)) sctx->num_cp_dma_calls++; - si_prim_discard_signal_next_compute_ib_start(sctx); - } } void si_cp_dma_prefetch(struct si_context *sctx, struct pipe_resource *buf, diff --git a/src/gallium/drivers/radeonsi/si_debug.c b/src/gallium/drivers/radeonsi/si_debug.c index f79f49b54dd..540206c1520 100644 --- a/src/gallium/drivers/radeonsi/si_debug.c +++ b/src/gallium/drivers/radeonsi/si_debug.c @@ -344,7 +344,6 @@ struct si_log_chunk_cs { struct si_saved_cs *cs; bool dump_bo_list; unsigned gfx_begin, gfx_end; - unsigned compute_begin, compute_end; }; static void si_log_chunk_type_cs_destroy(void *data) @@ -402,7 +401,6 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f) struct si_context *ctx = chunk->ctx; struct si_saved_cs *scs = chunk->cs; int last_trace_id = -1; - int last_compute_trace_id = -1; /* We are expecting that the ddebug pipe has already * waited for the context, so this buffer should be idle. @@ -410,10 +408,8 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f) */ uint32_t *map = ctx->ws->buffer_map(ctx->ws, scs->trace_buf->buf, NULL, PIPE_MAP_UNSYNCHRONIZED | PIPE_MAP_READ); - if (map) { + if (map) last_trace_id = map[0]; - last_compute_trace_id = map[1]; - } if (chunk->gfx_end != chunk->gfx_begin) { if (chunk->gfx_begin == 0) { @@ -435,20 +431,6 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f) } } - if (chunk->compute_end != chunk->compute_begin) { - assert(ctx->prim_discard_compute_cs.priv); - - if (scs->flushed) { - ac_parse_ib(f, scs->compute.ib + chunk->compute_begin, - chunk->compute_end - chunk->compute_begin, &last_compute_trace_id, map ? 
1 : 0, - "Compute IB", ctx->chip_class, NULL, NULL); - } else { - si_parse_current_ib(f, &ctx->prim_discard_compute_cs, chunk->compute_begin, - chunk->compute_end, &last_compute_trace_id, map ? 1 : 0, "Compute IB", - ctx->chip_class); - } - } - if (chunk->dump_bo_list) { fprintf(f, "Flushing. Time: "); util_dump_ns(f, scs->time_flush); @@ -468,13 +450,8 @@ static void si_log_cs(struct si_context *ctx, struct u_log_context *log, bool du struct si_saved_cs *scs = ctx->current_saved_cs; unsigned gfx_cur = ctx->gfx_cs.prev_dw + ctx->gfx_cs.current.cdw; - unsigned compute_cur = 0; - if (ctx->prim_discard_compute_cs.priv) - compute_cur = - ctx->prim_discard_compute_cs.prev_dw + ctx->prim_discard_compute_cs.current.cdw; - - if (!dump_bo_list && gfx_cur == scs->gfx_last_dw && compute_cur == scs->compute_last_dw) + if (!dump_bo_list && gfx_cur == scs->gfx_last_dw) return; struct si_log_chunk_cs *chunk = calloc(1, sizeof(*chunk)); @@ -487,10 +464,6 @@ static void si_log_cs(struct si_context *ctx, struct u_log_context *log, bool du chunk->gfx_end = gfx_cur; scs->gfx_last_dw = gfx_cur; - chunk->compute_begin = scs->compute_last_dw; - chunk->compute_end = compute_cur; - scs->compute_last_dw = compute_cur; - u_log_chunk(log, &si_log_chunk_type_cs, chunk); } diff --git a/src/gallium/drivers/radeonsi/si_fence.c b/src/gallium/drivers/radeonsi/si_fence.c index 7b82aa3abd3..d389a758d0f 100644 --- a/src/gallium/drivers/radeonsi/si_fence.c +++ b/src/gallium/drivers/radeonsi/si_fence.c @@ -73,7 +73,7 @@ void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, unsigne EVENT_INDEX(event == V_028A90_CS_DONE || event == V_028A90_PS_DONE ? 6 : 5) | event_flags; unsigned sel = EOP_DST_SEL(dst_sel) | EOP_INT_SEL(int_sel) | EOP_DATA_SEL(data_sel); - bool compute_ib = !ctx->has_graphics || cs == &ctx->prim_discard_compute_cs; + bool compute_ib = !ctx->has_graphics; radeon_begin(cs); diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c index b9305e77115..02421e3970f 100644 --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c @@ -92,9 +92,6 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_h ctx->gfx_flush_in_progress = true; - if (radeon_emitted(&ctx->prim_discard_compute_cs, 0)) - si_compute_signal_gfx(ctx); - if (ctx->has_graphics) { if (!list_is_empty(&ctx->active_queries)) si_suspend_queries(ctx); @@ -136,29 +133,6 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_h si_log_hw_flush(ctx); } - if (si_compute_prim_discard_enabled(ctx)) { - /* The compute IB can start after the previous gfx IB starts. */ - if (radeon_emitted(&ctx->prim_discard_compute_cs, 0) && ctx->last_gfx_fence) { - ctx->ws->cs_add_fence_dependency( - &ctx->gfx_cs, ctx->last_gfx_fence, - RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY | RADEON_DEPENDENCY_START_FENCE); - } - - /* Remember the last execution barrier. It's in the IB. - * It will signal the start of the next compute IB. 
- */ - if (flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW && ctx->last_pkt3_write_data) { - *ctx->last_pkt3_write_data = PKT3(PKT3_WRITE_DATA, 3, 0); - ctx->last_pkt3_write_data = NULL; - - si_resource_reference(&ctx->last_ib_barrier_buf, ctx->barrier_buf); - ctx->last_ib_barrier_buf_offset = ctx->barrier_buf_offset; - si_resource_reference(&ctx->barrier_buf, NULL); - - ws->fence_reference(&ctx->last_ib_barrier_fence, NULL); - } - } - if (ctx->is_noop) flags |= RADEON_FLUSH_NOOP; @@ -171,17 +145,6 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_h ctx->num_gfx_cs_flushes++; - if (si_compute_prim_discard_enabled(ctx)) { - /* Remember the last execution barrier, which is the last fence - * in this case. - */ - if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) { - ctx->last_pkt3_write_data = NULL; - si_resource_reference(&ctx->last_ib_barrier_buf, NULL); - ws->fence_reference(&ctx->last_ib_barrier_fence, ctx->last_gfx_fence); - } - } - /* Check VM faults if needed. */ if (sscreen->debug_flags & DBG(CHECK_VM)) { /* Use conservative timeout 800ms, after which we won't wait any @@ -216,7 +179,7 @@ static void si_begin_gfx_cs_debug(struct si_context *ctx) pipe_reference_init(&ctx->current_saved_cs->reference, 1); ctx->current_saved_cs->trace_buf = - si_resource(pipe_buffer_create(ctx->b.screen, 0, PIPE_USAGE_STAGING, 8)); + si_resource(pipe_buffer_create(ctx->b.screen, 0, PIPE_USAGE_STAGING, 4)); if (!ctx->current_saved_cs->trace_buf) { free(ctx->current_saved_cs); ctx->current_saved_cs = NULL; @@ -368,11 +331,6 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs) bool is_secure = false; if (unlikely(radeon_uses_secure_bos(ctx->ws))) { - /* Disable features that don't work with TMZ: - * - primitive discard - */ - ctx->prim_discard_vertex_count_threshold = UINT_MAX; - is_secure = ctx->ws->cs_is_secure(&ctx->gfx_cs); si_install_draw_wrapper(ctx, si_draw_vbo_tmz_preamble); @@ -549,18 +507,6 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs) assert(!ctx->gfx_cs.prev_dw); ctx->initial_gfx_cs_size = ctx->gfx_cs.current.cdw; - ctx->prim_discard_compute_ib_initialized = false; - - /* Compute-based primitive discard: - * The index ring is divided into 2 halves. Switch between the halves - * in the same fashion as doublebuffering. - */ - if (ctx->index_ring_base) - ctx->index_ring_base = 0; - else - ctx->index_ring_base = ctx->index_ring_size_per_ib; - - ctx->index_ring_offset = 0; /* All buffer references are removed on a flush, so si_check_needs_implicit_sync * cannot determine if si_make_CB_shader_coherent() needs to be called. @@ -586,34 +532,9 @@ void si_trace_emit(struct si_context *sctx) u_log_flush(sctx->log); } -void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx) -{ - if (!si_compute_prim_discard_enabled(sctx)) - return; - - if (!sctx->barrier_buf) { - u_suballocator_alloc(&sctx->allocator_zeroed_memory, 4, 4, &sctx->barrier_buf_offset, - (struct pipe_resource **)&sctx->barrier_buf); - } - - /* Emit a placeholder to signal the next compute IB to start. - * See si_compute_prim_discard.c for explanation. - */ - uint32_t signal = 1; - si_cp_write_data(sctx, sctx->barrier_buf, sctx->barrier_buf_offset, 4, V_370_MEM, V_370_ME, - &signal); - - sctx->last_pkt3_write_data = &sctx->gfx_cs.current.buf[sctx->gfx_cs.current.cdw - 5]; - - /* Only the last occurrence of WRITE_DATA will be executed. - * The packet will be enabled in si_flush_gfx_cs. 
- */ - *sctx->last_pkt3_write_data = PKT3(PKT3_NOP, 3, 0); -} - void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned cp_coher_cntl) { - bool compute_ib = !sctx->has_graphics || cs == &sctx->prim_discard_compute_cs; + bool compute_ib = !sctx->has_graphics; assert(sctx->chip_class <= GFX9); @@ -857,14 +778,6 @@ void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs) uint32_t cp_coher_cntl = 0; const uint32_t flush_cb_db = flags & (SI_CONTEXT_FLUSH_AND_INV_CB | SI_CONTEXT_FLUSH_AND_INV_DB); - const bool is_barrier = - flush_cb_db || - /* INV_ICACHE == beginning of gfx IB. Checking - * INV_ICACHE fixes corruption for DeusExMD with - * compute-based culling, but I don't know why. - */ - flags & (SI_CONTEXT_INV_ICACHE | SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_VS_PARTIAL_FLUSH) || - (flags & SI_CONTEXT_CS_PARTIAL_FLUSH && sctx->compute_is_busy); assert(sctx->chip_class <= GFX9); @@ -1077,9 +990,6 @@ void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs) radeon_end(); } - if (is_barrier) - si_prim_discard_signal_next_compute_ib_start(sctx); - if (flags & SI_CONTEXT_START_PIPELINE_STATS && sctx->pipeline_stats_enabled != 1) { radeon_begin(cs); radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index dee46810203..c52de041304 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -95,9 +95,6 @@ static const struct debug_named_value radeonsi_debug_options[] = { {"nggc", DBG(ALWAYS_NGG_CULLING_ALL), "Always use NGG culling even when it can hurt."}, {"nggctess", DBG(ALWAYS_NGG_CULLING_TESS), "Always use NGG culling for tessellation."}, {"nonggc", DBG(NO_NGG_CULLING), "Disable NGG culling."}, - {"alwayspd", DBG(ALWAYS_PD), "Always enable the primitive discard compute shader."}, - {"pd", DBG(PD), "Enable the primitive discard compute shader for large draw calls."}, - {"nopd", DBG(NO_PD), "Disable the primitive discard compute shader."}, {"switch_on_eop", DBG(SWITCH_ON_EOP), "Program WD/IA to switch on end-of-packet."}, {"nooutoforder", DBG(NO_OUT_OF_ORDER), "Disable out-of-order rasterization"}, {"nodpbb", DBG(NO_DPBB), "Disable DPBB."}, @@ -309,12 +306,8 @@ static void si_destroy_context(struct pipe_context *context) u_suballocator_destroy(&sctx->allocator_zeroed_memory); sctx->ws->fence_reference(&sctx->last_gfx_fence, NULL); - sctx->ws->fence_reference(&sctx->last_ib_barrier_fence, NULL); si_resource_reference(&sctx->eop_bug_scratch, NULL); si_resource_reference(&sctx->eop_bug_scratch_tmz, NULL); - si_resource_reference(&sctx->index_ring, NULL); - si_resource_reference(&sctx->barrier_buf, NULL); - si_resource_reference(&sctx->last_ib_barrier_buf, NULL); si_resource_reference(&sctx->shadowed_regs, NULL); radeon_bo_reference(sctx->screen->ws, &sctx->gds, NULL); radeon_bo_reference(sctx->screen->ws, &sctx->gds_oa, NULL); @@ -618,12 +611,6 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign default: unreachable("unhandled chip class"); } - - si_initialize_prim_discard_tunables(sscreen, flags & SI_CONTEXT_FLAG_AUX, - &sctx->prim_discard_vertex_count_threshold, - &sctx->index_ring_size_per_ib); - } else { - sctx->prim_discard_vertex_count_threshold = UINT_MAX; } sctx->sample_mask = 0xffff; @@ -641,7 +628,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign sctx->b.create_video_buffer = vl_video_buffer_create; } - if (sctx->chip_class >= 
GFX9 || si_compute_prim_discard_enabled(sctx)) { + if (sctx->chip_class >= GFX9) { sctx->wait_mem_scratch = si_aligned_buffer_create(screen, SI_RESOURCE_FLAG_UNMAPPABLE | SI_RESOURCE_FLAG_DRIVER_INTERNAL, @@ -1167,15 +1154,10 @@ static struct pipe_screen *radeonsi_screen_create_impl(struct radeon_winsys *ws, sscreen->max_memory_usage_kb = sscreen->info.vram_size_kb + sscreen->info.gart_size_kb / 4 * 3; - unsigned prim_discard_vertex_count_threshold, tmp; - si_initialize_prim_discard_tunables(sscreen, false, &prim_discard_vertex_count_threshold, &tmp); - /* Compute-shader-based culling doesn't support VBOs in user SGPRs. */ - if (prim_discard_vertex_count_threshold == UINT_MAX) { - /* This decreases CPU overhead if all descriptors are in user SGPRs because we don't - * have to allocate and count references for the upload buffer. - */ - sscreen->num_vbos_in_user_sgprs = sscreen->info.chip_class >= GFX9 ? 5 : 1; - } + /* This decreases CPU overhead if all descriptors are in user SGPRs because we don't + * have to allocate and count references for the upload buffer. + */ + sscreen->num_vbos_in_user_sgprs = sscreen->info.chip_class >= GFX9 ? 5 : 1; /* Determine tessellation ring info. */ bool double_offchip_buffers = sscreen->info.chip_class >= GFX7 && diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 062fa6e34f7..44f160450f7 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -44,7 +44,6 @@ extern "C" { #endif #define ATI_VENDOR_ID 0x1002 -#define SI_PRIM_DISCARD_DEBUG 0 #define SI_NOT_QUERY 0xffffffff /* The base vertex and primitive restart can be any number, but we must pick @@ -155,11 +154,6 @@ enum si_has_ngg { NGG_ON, }; -enum si_has_prim_discard_cs { - PRIM_DISCARD_CS_OFF, - PRIM_DISCARD_CS_ON, -}; - enum si_clear_code { DCC_CLEAR_COLOR_0000 = 0x00000000, @@ -223,9 +217,6 @@ enum DBG_ALWAYS_NGG_CULLING_TESS, DBG_NO_NGG_CULLING, DBG_NO_FAST_LAUNCH, - DBG_ALWAYS_PD, - DBG_PD, - DBG_NO_PD, DBG_SWITCH_ON_EOP, DBG_NO_OUT_OF_ORDER, DBG_NO_DPBB, @@ -896,7 +887,6 @@ struct si_saved_cs { unsigned trace_id; unsigned gfx_last_dw; - unsigned compute_last_dw; bool flushed; int64_t time_flush; }; @@ -995,26 +985,6 @@ struct si_context { /* NGG streamout. */ struct pb_buffer *gds; struct pb_buffer *gds_oa; - /* Compute-based primitive discard. */ - unsigned prim_discard_vertex_count_threshold; - struct radeon_cmdbuf prim_discard_compute_cs; - struct si_shader *compute_ib_last_shader; - uint32_t compute_rewind_va; - unsigned compute_num_prims_in_batch; - /* index_ring is divided into 2 halves for doublebuffering. */ - struct si_resource *index_ring; - unsigned index_ring_base; /* offset of a per-IB portion */ - unsigned index_ring_offset; /* offset within a per-IB portion */ - unsigned index_ring_size_per_ib; /* max available size per IB */ - bool prim_discard_compute_ib_initialized; - /* For tracking the last execution barrier - it can be either - * a WRITE_DATA packet or a fence. */ - uint32_t *last_pkt3_write_data; - struct si_resource *barrier_buf; - unsigned barrier_buf_offset; - struct pipe_fence_handle *last_ib_barrier_fence; - struct si_resource *last_ib_barrier_buf; - unsigned last_ib_barrier_buf_offset; /* Atoms (direct states). 
*/ union si_state_atoms atoms; @@ -1063,7 +1033,6 @@ struct si_context { /* indexed access using pipe_shader_type (not by MESA_SHADER_*) */ struct si_shader_ctx_state shaders[SI_NUM_GRAPHICS_SHADERS]; }; - struct si_shader_ctx_state cs_prim_discard_state; struct si_cs_shader_state cs_shader_state; /* shader information */ @@ -1254,9 +1223,6 @@ struct si_context { unsigned num_resident_handles; uint64_t num_alloc_tex_transfer_bytes; unsigned last_tex_ps_draw_ratio; /* for query */ - unsigned compute_num_verts_accepted; - unsigned compute_num_verts_rejected; - unsigned compute_num_verts_ineligible; /* due to low vertex count */ unsigned context_roll; /* Queries. */ @@ -1287,7 +1253,7 @@ struct si_context { */ struct hash_table *dirty_implicit_resources; - pipe_draw_vbo_func draw_vbo[2][2][2][2]; + pipe_draw_vbo_func draw_vbo[2][2][2]; /* When b.draw_vbo is a wrapper, real_draw_vbo is the real draw_vbo function */ pipe_draw_vbo_func real_draw_vbo; @@ -1483,7 +1449,6 @@ void si_allocate_gds(struct si_context *ctx); void si_set_tracked_regs_to_clear_state(struct si_context *ctx); void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs); void si_trace_emit(struct si_context *sctx); -void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx); void si_emit_surface_sync(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned cp_coher_cntl); void gfx10_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs); @@ -1502,32 +1467,6 @@ unsigned si_end_counter(struct si_screen *sscreen, unsigned type, uint64_t begin void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf *cs); void si_init_compute_functions(struct si_context *sctx); -/* si_compute_prim_discard.c */ -enum si_prim_discard_outcome -{ - SI_PRIM_DISCARD_ENABLED, - SI_PRIM_DISCARD_DISABLED, - SI_PRIM_DISCARD_DRAW_SPLIT, - SI_PRIM_DISCARD_MULTI_DRAW_SPLIT, -}; - -void si_build_prim_discard_compute_shader(struct si_shader_context *ctx); -enum si_prim_discard_outcome -si_prepare_prim_discard_or_split_draw(struct si_context *sctx, const struct pipe_draw_info *info, - unsigned drawid_offset, - const struct pipe_draw_start_count_bias *draws, - unsigned num_draws, unsigned total_count); -void si_compute_signal_gfx(struct si_context *sctx); -void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx, - const struct pipe_draw_info *info, - const struct pipe_draw_start_count_bias *draws, - unsigned num_draws, unsigned index_size, - unsigned total_count, uint64_t input_indexbuf_va, - unsigned index_max_size); -void si_initialize_prim_discard_tunables(struct si_screen *sscreen, bool is_aux_context, - unsigned *prim_discard_vertex_count_threshold, - unsigned *index_ring_size_per_ib); - /* si_pipe.c */ void si_init_compiler(struct si_screen *sscreen, struct ac_llvm_compiler *compiler); @@ -1996,14 +1935,9 @@ static inline void radeon_add_to_gfx_buffer_list_check_mem(struct si_context *sc radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, bo, usage, priority); } -static inline bool si_compute_prim_discard_enabled(struct si_context *sctx) -{ - return sctx->prim_discard_vertex_count_threshold != UINT_MAX; -} - static inline unsigned si_get_wave_size(struct si_screen *sscreen, gl_shader_stage stage, bool ngg, bool es, - bool gs_fast_launch, bool prim_discard_cs) + bool gs_fast_launch) { if (stage == MESA_SHADER_COMPUTE) return sscreen->compute_wave_size; @@ -2011,8 +1945,7 @@ static inline unsigned si_get_wave_size(struct si_screen *sscreen, return sscreen->ps_wave_size; else if (gs_fast_launch) 
return 32; /* GS fast launch hangs with Wave64, so always use Wave32. */ - else if ((stage == MESA_SHADER_VERTEX && prim_discard_cs) || /* only Wave64 implemented */ - (stage == MESA_SHADER_VERTEX && es && !ngg) || + else if ((stage == MESA_SHADER_VERTEX && es && !ngg) || (stage == MESA_SHADER_TESS_EVAL && es && !ngg) || (stage == MESA_SHADER_GEOMETRY && !ngg)) /* legacy GS only supports Wave64 */ return 64; @@ -2025,18 +1958,14 @@ static inline unsigned si_get_shader_wave_size(struct si_shader *shader) return si_get_wave_size(shader->selector->screen, shader->selector->info.stage, shader->key.as_ngg, shader->key.as_es, - shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL, - shader->key.opt.vs_as_prim_discard_cs); + shader->key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL); } static inline void si_select_draw_vbo(struct si_context *sctx) { - bool has_prim_discard_cs = si_compute_prim_discard_enabled(sctx) && - !sctx->shader.tes.cso && !sctx->shader.gs.cso; pipe_draw_vbo_func draw_vbo = sctx->draw_vbo[!!sctx->shader.tes.cso] [!!sctx->shader.gs.cso] - [sctx->ngg] - [has_prim_discard_cs]; + [sctx->ngg]; assert(draw_vbo); if (unlikely(sctx->real_draw_vbo)) sctx->real_draw_vbo = draw_vbo; diff --git a/src/gallium/drivers/radeonsi/si_query.c b/src/gallium/drivers/radeonsi/si_query.c index eba6af47a99..8908a56554e 100644 --- a/src/gallium/drivers/radeonsi/si_query.c +++ b/src/gallium/drivers/radeonsi/si_query.c @@ -260,15 +260,6 @@ static bool si_query_sw_begin(struct si_context *sctx, struct si_query *squery) case SI_QUERY_DISK_SHADER_CACHE_MISSES: query->begin_result = sctx->screen->num_disk_shader_cache_misses; break; - case SI_QUERY_PD_NUM_PRIMS_ACCEPTED: - query->begin_result = sctx->compute_num_verts_accepted; - break; - case SI_QUERY_PD_NUM_PRIMS_REJECTED: - query->begin_result = sctx->compute_num_verts_rejected; - break; - case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE: - query->begin_result = sctx->compute_num_verts_ineligible; - break; case SI_QUERY_GPIN_ASIC_ID: case SI_QUERY_GPIN_NUM_SIMD: case SI_QUERY_GPIN_NUM_RB: @@ -429,15 +420,6 @@ static bool si_query_sw_end(struct si_context *sctx, struct si_query *squery) case SI_QUERY_DISK_SHADER_CACHE_MISSES: query->end_result = sctx->screen->num_disk_shader_cache_misses; break; - case SI_QUERY_PD_NUM_PRIMS_ACCEPTED: - query->end_result = sctx->compute_num_verts_accepted; - break; - case SI_QUERY_PD_NUM_PRIMS_REJECTED: - query->end_result = sctx->compute_num_verts_rejected; - break; - case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE: - query->end_result = sctx->compute_num_verts_ineligible; - break; case SI_QUERY_GPIN_ASIC_ID: case SI_QUERY_GPIN_NUM_SIMD: case SI_QUERY_GPIN_NUM_RB: @@ -479,11 +461,6 @@ static bool si_query_sw_get_result(struct si_context *sctx, struct si_query *squ result->u64 = (query->end_result - query->begin_result) * 100 / (query->end_time - query->begin_time); return true; - case SI_QUERY_PD_NUM_PRIMS_ACCEPTED: - case SI_QUERY_PD_NUM_PRIMS_REJECTED: - case SI_QUERY_PD_NUM_PRIMS_INELIGIBLE: - result->u64 = ((unsigned)query->end_result - (unsigned)query->begin_result) / 3; - return true; case SI_QUERY_GPIN_ASIC_ID: result->u32 = 0; return true; @@ -1758,10 +1735,6 @@ static struct pipe_driver_query_info si_driver_query_list[] = { X("GPU-surf-sync-busy", GPU_SURF_SYNC_BUSY, UINT64, AVERAGE), X("GPU-cp-dma-busy", GPU_CP_DMA_BUSY, UINT64, AVERAGE), X("GPU-scratch-ram-busy", GPU_SCRATCH_RAM_BUSY, UINT64, AVERAGE), - - X("pd-num-prims-accepted", PD_NUM_PRIMS_ACCEPTED, UINT64, AVERAGE), - X("pd-num-prims-rejected", 
PD_NUM_PRIMS_REJECTED, UINT64, AVERAGE), - X("pd-num-prims-ineligible", PD_NUM_PRIMS_INELIGIBLE, UINT64, AVERAGE), }; #undef X diff --git a/src/gallium/drivers/radeonsi/si_query.h b/src/gallium/drivers/radeonsi/si_query.h index b1654106b13..b0e11373852 100644 --- a/src/gallium/drivers/radeonsi/si_query.h +++ b/src/gallium/drivers/radeonsi/si_query.h @@ -111,9 +111,6 @@ enum SI_QUERY_GPIN_NUM_RB, SI_QUERY_GPIN_NUM_SPI, SI_QUERY_GPIN_NUM_SE, - SI_QUERY_PD_NUM_PRIMS_ACCEPTED, - SI_QUERY_PD_NUM_PRIMS_REJECTED, - SI_QUERY_PD_NUM_PRIMS_INELIGIBLE, SI_QUERY_LIVE_SHADER_CACHE_HITS, SI_QUERY_LIVE_SHADER_CACHE_MISSES, SI_QUERY_MEMORY_SHADER_CACHE_HITS, diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index ee4ed59b096..92b3cb98fcc 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -419,12 +419,6 @@ void si_init_shader_args(struct si_shader_context *ctx, bool ngg_cull_shader) /* VGPRs */ declare_vs_input_vgprs(ctx, &num_prolog_vgprs); - - /* Return values */ - if (shader->key.opt.vs_as_prim_discard_cs) { - for (i = 0; i < 4; i++) - ac_add_return(&ctx->args, AC_ARG_VGPR); - } break; case MESA_SHADER_TESS_CTRL: /* GFX6-GFX8 */ @@ -1070,8 +1064,6 @@ const char *si_get_shader_name(const struct si_shader *shader) return "Vertex Shader as ES"; else if (shader->key.as_ls) return "Vertex Shader as LS"; - else if (shader->key.opt.vs_as_prim_discard_cs) - return "Vertex Shader as Primitive Discard CS"; else if (shader->key.as_ngg) return "Vertex Shader as ESGS"; else @@ -1183,12 +1175,6 @@ static void si_dump_shader_key(const struct si_shader *shader, FILE *f) fprintf(f, " as_ls = %u\n", key->as_ls); fprintf(f, " as_ngg = %u\n", key->as_ngg); fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->mono.u.vs_export_prim_id); - fprintf(f, " opt.vs_as_prim_discard_cs = %u\n", key->opt.vs_as_prim_discard_cs); - fprintf(f, " opt.cs_prim_type = %s\n", tgsi_primitive_names[key->opt.cs_prim_type]); - fprintf(f, " opt.cs_indexed = %u\n", key->opt.cs_indexed); - fprintf(f, " opt.cs_provoking_vertex_first = %u\n", key->opt.cs_provoking_vertex_first); - fprintf(f, " opt.cs_cull_front = %u\n", key->opt.cs_cull_front); - fprintf(f, " opt.cs_cull_back = %u\n", key->opt.cs_cull_back); break; case MESA_SHADER_TESS_CTRL: @@ -1317,7 +1303,6 @@ void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_ key->vs_prolog.as_ls = shader_out->key.as_ls; key->vs_prolog.as_es = shader_out->key.as_es; key->vs_prolog.as_ngg = shader_out->key.as_ngg; - key->vs_prolog.as_prim_discard_cs = shader_out->key.opt.vs_as_prim_discard_cs; if (ngg_cull_shader) { key->vs_prolog.gs_fast_launch_tri_list = @@ -1342,8 +1327,7 @@ void si_get_vs_prolog_key(const struct si_shader_info *info, unsigned num_input_ /* Only one of these combinations can be set. as_ngg can be set with as_es. */ assert(key->vs_prolog.as_ls + key->vs_prolog.as_ngg + - (key->vs_prolog.as_es && !key->vs_prolog.as_ngg) + key->vs_prolog.as_prim_discard_cs <= - 1); + (key->vs_prolog.as_es && !key->vs_prolog.as_ngg) <= 1); /* Enable loading the InstanceID VGPR. */ uint16_t input_mask = u_bit_consecutive(0, info->num_inputs); @@ -1557,7 +1541,6 @@ si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list, (key->vs_prolog.gs_fast_launch_tri_list ? SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST : 0) | (key->vs_prolog.gs_fast_launch_tri_strip ? 
SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP : 0) | SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(key->vs_prolog.gs_fast_launch_index_size_packed); - shader.key.opt.vs_as_prim_discard_cs = key->vs_prolog.as_prim_discard_cs; break; case MESA_SHADER_TESS_CTRL: assert(!prolog); @@ -1581,8 +1564,7 @@ si_get_shader_part(struct si_screen *sscreen, struct si_shader_part **list, si_llvm_context_init(&ctx, sscreen, compiler, si_get_wave_size(sscreen, stage, shader.key.as_ngg, shader.key.as_es, - shader.key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL, - shader.key.opt.vs_as_prim_discard_cs)); + shader.key.opt.ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL)); ctx.shader = &shader; ctx.stage = stage; diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 8b787185464..8ddaeaab7de 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -446,7 +446,6 @@ struct si_shader_selector { ubyte const_and_shader_buf_descriptors_index; ubyte sampler_and_images_descriptors_index; bool vs_needs_prolog; - bool prim_discard_cs_allowed; ubyte cs_shaderbufs_sgpr_index; ubyte cs_num_shaderbufs_in_user_sgprs; ubyte cs_images_sgpr_index; @@ -577,7 +576,6 @@ union si_shader_part_key { unsigned as_ls : 1; unsigned as_es : 1; unsigned as_ngg : 1; - unsigned as_prim_discard_cs : 1; unsigned gs_fast_launch_tri_list : 1; /* for NGG culling */ unsigned gs_fast_launch_tri_strip : 1; /* for NGG culling */ unsigned gs_fast_launch_index_size_packed : 2; @@ -684,14 +682,6 @@ struct si_shader_key { */ unsigned prefer_mono : 1; - /* Primitive discard compute shader. */ - unsigned vs_as_prim_discard_cs : 1; - unsigned cs_prim_type : 4; - unsigned cs_indexed : 1; - unsigned cs_provoking_vertex_first : 1; - unsigned cs_cull_front : 1; - unsigned cs_cull_back : 1; - /* VS and TCS have the same number of patch vertices. */ unsigned same_patch_vertices:1; diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c index 8854584e059..c975581fe4f 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c @@ -804,9 +804,6 @@ void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef *part !same_thread_count && si_is_multi_part_shader(ctx->shader)) ac_build_endif(&ctx->ac, 6507); - /* Return the value from the last part. It's non-void only for the prim - * discard compute shader. - */ if (LLVMGetTypeKind(LLVMTypeOf(ret)) == LLVMVoidTypeKind) LLVMBuildRetVoid(builder); else @@ -1116,9 +1113,6 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler * parts[num_parts++] = main_fn; si_build_wrapper_function(&ctx, parts, num_parts, first_is_prolog ? 1 : 0, 0, false); - - if (ctx.shader->key.opt.vs_as_prim_discard_cs) - si_build_prim_discard_compute_shader(&ctx); } else if (shader->is_monolithic && ctx.stage == MESA_SHADER_TESS_EVAL && ngg_cull_main_fn) { LLVMValueRef parts[3], prolog, main_fn = ctx.main_fn; @@ -1289,8 +1283,7 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler * } /* Make sure the input is a pointer and not integer followed by inttoptr. */ - if (!shader->key.opt.vs_as_prim_discard_cs) - assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) == LLVMPointerTypeKind); + assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) == LLVMPointerTypeKind); /* Compile to bytecode. 
*/ if (!si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler, &ctx.ac, debug, diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c index 0cfd441488a..b4a3b8a8aad 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c @@ -431,7 +431,7 @@ struct si_shader *si_generate_gs_copy_shader(struct si_screen *sscreen, si_llvm_context_init(&ctx, sscreen, compiler, si_get_wave_size(sscreen, MESA_SHADER_VERTEX, - false, false, false, false)); + false, false, false)); ctx.shader = shader; ctx.stage = MESA_SHADER_VERTEX; diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c index 73dff3f2203..b6bfa6fe09d 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c @@ -793,32 +793,6 @@ void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi) FREE(outputs); } -static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi *abi) -{ - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - struct si_shader_info *info = &ctx->shader->selector->info; - LLVMValueRef *addrs = abi->outputs; - LLVMValueRef pos[4] = {}; - - assert(info->num_outputs <= AC_LLVM_MAX_OUTPUTS); - - for (unsigned i = 0; i < info->num_outputs; i++) { - if (info->output_semantic[i] != VARYING_SLOT_POS) - continue; - - for (unsigned chan = 0; chan < 4; chan++) - pos[chan] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); - break; - } - assert(pos[0] != NULL); - - /* Return the position output. */ - LLVMValueRef ret = ctx->return_value; - for (unsigned chan = 0; chan < 4; chan++) - ret = LLVMBuildInsertValue(ctx->ac.builder, ret, pos[chan], chan, ""); - ctx->return_value = ret; -} - /** * Build the vertex shader prolog function. * @@ -1121,8 +1095,6 @@ void si_llvm_init_vs_callbacks(struct si_shader_context *ctx, bool ngg_cull_shad ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue; else if (shader->key.as_es) ctx->abi.emit_outputs = si_llvm_emit_es_epilogue; - else if (shader->key.opt.vs_as_prim_discard_cs) - ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue; else if (ngg_cull_shader) ctx->abi.emit_outputs = gfx10_emit_ngg_culling_epilogue; else if (shader->key.as_ngg) diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index 9646fc36195..f4a6ef77b25 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -971,7 +971,7 @@ static void si_emit_draw_registers(struct si_context *sctx, } \ } while (0) -template +template ALWAYS_INLINE static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw_info *info, unsigned drawid_base, @@ -980,7 +980,7 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw unsigned num_draws, unsigned total_count, struct pipe_resource *indexbuf, unsigned index_size, unsigned index_offset, unsigned instance_count, - bool dispatch_prim_discard_cs, unsigned original_index_size) + unsigned original_index_size) { struct radeon_cmdbuf *cs = &sctx->gfx_cs; @@ -1042,22 +1042,19 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw sctx->last_index_size = index_size; } - /* If !ALLOW_PRIM_DISCARD_CS, index_size == original_index_size. 
*/ - if (!ALLOW_PRIM_DISCARD_CS || original_index_size) { - index_max_size = (indexbuf->width0 - index_offset) >> util_logbase2(original_index_size); - /* Skip draw calls with 0-sized index buffers. - * They cause a hang on some chips, like Navi10-14. - */ - if (!index_max_size) { - radeon_end(); - return; - } - - index_va = si_resource(indexbuf)->gpu_address + index_offset; - - radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(indexbuf), RADEON_USAGE_READ, - RADEON_PRIO_INDEX_BUFFER); + index_max_size = (indexbuf->width0 - index_offset) >> util_logbase2(index_size); + /* Skip draw calls with 0-sized index buffers. + * They cause a hang on some chips, like Navi10-14. + */ + if (!index_max_size) { + radeon_end(); + return; } + + index_va = si_resource(indexbuf)->gpu_address + index_offset; + + radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(indexbuf), RADEON_USAGE_READ, + RADEON_PRIO_INDEX_BUFFER); } else { /* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE, * so the state must be re-emitted before the next indexed draw. @@ -1190,16 +1187,6 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw bool increment_draw_id = num_draws > 1 && set_draw_id && info->increment_draw_id; if (index_size) { - if (ALLOW_PRIM_DISCARD_CS && dispatch_prim_discard_cs) { - radeon_end(); - - si_dispatch_prim_discard_cs_and_draw(sctx, info, draws, num_draws, - original_index_size, total_count, index_va, - index_max_size); - EMIT_SQTT_END_DRAW; - return; - } - /* NOT_EOP allows merging multiple draws into 1 wave, but only user VGPRs * can be changed between draws, and GS fast launch must be disabled. * NOT_EOP doesn't work on gfx9 and older. @@ -1629,100 +1616,12 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i info->restart_index, min_vertex_count); } -static bool si_all_vs_resources_read_only(struct si_context *sctx, struct pipe_resource *indexbuf) -{ - struct radeon_winsys *ws = sctx->ws; - struct radeon_cmdbuf *cs = &sctx->gfx_cs; - struct si_descriptors *buffers = - &sctx->descriptors[si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX)]; - struct si_shader_selector *vs = sctx->shader.vs.cso; - struct si_vertex_elements *velems = sctx->vertex_elements; - unsigned num_velems = velems->count; - unsigned num_images = vs->info.base.num_images; - - /* Index buffer. */ - if (indexbuf && ws->cs_is_buffer_referenced(cs, si_resource(indexbuf)->buf, RADEON_USAGE_WRITE)) - goto has_write_reference; - - /* Vertex buffers. */ - for (unsigned i = 0; i < num_velems; i++) { - if (!((1 << i) & velems->first_vb_use_mask)) - continue; - - unsigned vb_index = velems->vertex_buffer_index[i]; - struct pipe_resource *res = sctx->vertex_buffer[vb_index].buffer.resource; - if (!res) - continue; - - if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, RADEON_USAGE_WRITE)) - goto has_write_reference; - } - - /* Constant and shader buffers. */ - for (unsigned i = 0; i < buffers->num_active_slots; i++) { - unsigned index = buffers->first_active_slot + i; - struct pipe_resource *res = sctx->const_and_shader_buffers[PIPE_SHADER_VERTEX].buffers[index]; - if (!res) - continue; - - if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, RADEON_USAGE_WRITE)) - goto has_write_reference; - } - - /* Samplers. 
*/ - if (vs->info.base.textures_used[0]) { - unsigned num_samplers = BITSET_LAST_BIT(vs->info.base.textures_used); - - for (unsigned i = 0; i < num_samplers; i++) { - struct pipe_sampler_view *view = sctx->samplers[PIPE_SHADER_VERTEX].views[i]; - if (!view) - continue; - - if (ws->cs_is_buffer_referenced(cs, si_resource(view->texture)->buf, RADEON_USAGE_WRITE)) - goto has_write_reference; - } - } - - /* Images. */ - if (num_images) { - for (unsigned i = 0; i < num_images; i++) { - struct pipe_resource *res = sctx->images[PIPE_SHADER_VERTEX].views[i].resource; - if (!res) - continue; - - if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, RADEON_USAGE_WRITE)) - goto has_write_reference; - } - } - - return true; - -has_write_reference: - /* If the current gfx IB has enough packets, flush it to remove write - * references to buffers. - */ - if (cs->prev_dw + cs->current.cdw > 2048) { - si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); - assert(si_all_vs_resources_read_only(sctx, indexbuf)); - return true; - } - return false; -} - -static ALWAYS_INLINE bool pd_msg(const char *s) -{ - if (SI_PRIM_DISCARD_DEBUG) - printf("PD failed: %s\n", s); - return false; -} - #define DRAW_CLEANUP do { \ if (index_size && indexbuf != info->index.resource) \ pipe_resource_reference(&indexbuf, NULL); \ } while (0) -template +template static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info, unsigned drawid_offset, @@ -1910,70 +1809,8 @@ static void si_draw_vbo(struct pipe_context *ctx, info->primitive_restart && (!sctx->screen->options.prim_restart_tri_strips_only || (prim != PIPE_PRIM_TRIANGLE_STRIP && prim != PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY)); - bool dispatch_prim_discard_cs = false; unsigned original_index_size = index_size; - /* Determine if we can use the primitive discard compute shader. */ - /* TODO: this requires that primitives can be drawn out of order, so check depth/stencil/blend states. */ - if (ALLOW_PRIM_DISCARD_CS && - (total_direct_count > sctx->prim_discard_vertex_count_threshold - ? (sctx->compute_num_verts_rejected += total_direct_count, true) - : /* Add, then return true. */ - (sctx->compute_num_verts_ineligible += total_direct_count, - false)) && /* Add, then return false. */ - (!primitive_restart || pd_msg("primitive restart")) && - /* Supported prim types. */ - (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) | (1 << PIPE_PRIM_TRIANGLE_STRIP)) && - (instance_count == 1 || pd_msg("instancing")) && - ((drawid_offset == 0 && (num_draws == 1 || !info->increment_draw_id)) || - !sctx->shader.vs.cso->info.uses_drawid || pd_msg("draw_id > 0")) && - (!sctx->render_cond || pd_msg("render condition")) && - /* Forced enablement ignores pipeline statistics queries. 
*/ - (sctx->screen->debug_flags & (DBG(PD) | DBG(ALWAYS_PD)) || - (!sctx->num_pipeline_stat_queries && !sctx->streamout.prims_gen_query_enabled) || - pd_msg("pipestat or primgen query")) && - (!sctx->vertex_elements->instance_divisor_is_fetched || pd_msg("loads instance divisors")) && - (!sctx->shader.ps.cso->info.uses_primid || pd_msg("PS uses PrimID")) && - !rs->polygon_mode_enabled && -#if SI_PRIM_DISCARD_DEBUG /* same as cso->prim_discard_cs_allowed */ - (!sctx->shader.vs.cso->info.uses_bindless_images || pd_msg("uses bindless images")) && - (!sctx->shader.vs.cso->info.uses_bindless_samplers || pd_msg("uses bindless samplers")) && - (!sctx->shader.vs.cso->info.base.writes_memory || pd_msg("writes memory")) && - (!sctx->shader.vs.cso->info.writes_viewport_index || pd_msg("writes viewport index")) && - !sctx->shader.vs.cso->info.base.vs.window_space_position && - !sctx->shader.vs.cso->so.num_outputs && -#else - (sctx->shader.vs.cso->prim_discard_cs_allowed || - pd_msg("VS shader uses unsupported features")) && -#endif - /* Check that all buffers are used for read only, because compute - * dispatches can run ahead. */ - (si_all_vs_resources_read_only(sctx, index_size ? indexbuf : NULL) || - pd_msg("write reference"))) { - switch (si_prepare_prim_discard_or_split_draw(sctx, info, drawid_offset, draws, num_draws, - total_direct_count)) { - case SI_PRIM_DISCARD_ENABLED: - original_index_size = index_size; - dispatch_prim_discard_cs = true; - - /* The compute shader changes/lowers the following: */ - prim = PIPE_PRIM_TRIANGLES; - index_size = 4; - instance_count = 1; - sctx->compute_num_verts_rejected -= total_direct_count; - sctx->compute_num_verts_accepted += total_direct_count; - break; - case SI_PRIM_DISCARD_DISABLED: - break; - case SI_PRIM_DISCARD_DRAW_SPLIT: - case SI_PRIM_DISCARD_MULTI_DRAW_SPLIT: - sctx->compute_num_verts_rejected -= total_direct_count; - /* The multi draw was split into multiple ones and executed. Return. */ - DRAW_CLEANUP; - return; - } - } - /* Set the rasterization primitive type. * * This must be done after si_decompress_textures, which can call @@ -2005,7 +1842,7 @@ static void si_draw_vbo(struct pipe_context *ctx, if (GFX_VERSION >= GFX10) { struct si_shader_selector *hw_vs = si_get_vs_inline(sctx, HAS_TESS, HAS_GS)->cso; - if (NGG && !HAS_GS && !dispatch_prim_discard_cs && + if (NGG && !HAS_GS && /* Tessellation sets ngg_cull_vert_threshold to UINT_MAX if the prim type * is not triangles, so this check is only needed without tessellation. */ (HAS_TESS || sctx->current_rast_prim == PIPE_PRIM_TRIANGLES) && @@ -2154,10 +1991,9 @@ static void si_draw_vbo(struct pipe_context *ctx, } assert(sctx->dirty_atoms == 0); - si_emit_draw_packets + si_emit_draw_packets (sctx, info, drawid_offset, indirect, draws, num_draws, total_direct_count, indexbuf, - index_size, index_offset, instance_count, dispatch_prim_discard_cs, - original_index_size); + index_size, index_offset, instance_count, original_index_size); /* <-- CUs are busy here. */ /* Start prefetches after the draw has been started. Both will run @@ -2193,10 +2029,9 @@ static void si_draw_vbo(struct pipe_context *ctx, } assert(sctx->dirty_atoms == 0); - si_emit_draw_packets + si_emit_draw_packets (sctx, info, drawid_offset, indirect, draws, num_draws, total_direct_count, indexbuf, - index_size, index_offset, instance_count, dispatch_prim_discard_cs, - original_index_size); + index_size, index_offset, instance_count, original_index_size); /* Prefetch the remaining shaders after the draw has been * started. 
*/ @@ -2281,40 +2116,27 @@ static void si_draw_rectangle(struct blitter_context *blitter, void *vertex_elem pipe->draw_vbo(pipe, &info, 0, NULL, &draw, 1); } -template +template static void si_init_draw_vbo(struct si_context *sctx) { - /* Prim discard CS is only useful on gfx7+ because gfx6 doesn't have async compute. */ - if (ALLOW_PRIM_DISCARD_CS && GFX_VERSION < GFX8) - return; - - if (ALLOW_PRIM_DISCARD_CS && (HAS_TESS || HAS_GS)) - return; - if (NGG && GFX_VERSION < GFX10) return; - sctx->draw_vbo[HAS_TESS][HAS_GS][NGG][ALLOW_PRIM_DISCARD_CS] = - si_draw_vbo; -} - -template -static void si_init_draw_vbo_all_internal_options(struct si_context *sctx) -{ - si_init_draw_vbo(sctx); - si_init_draw_vbo(sctx); - si_init_draw_vbo(sctx); - si_init_draw_vbo(sctx); + sctx->draw_vbo[HAS_TESS][HAS_GS][NGG] = + si_draw_vbo; } template static void si_init_draw_vbo_all_pipeline_options(struct si_context *sctx) { - si_init_draw_vbo_all_internal_options(sctx); - si_init_draw_vbo_all_internal_options(sctx); - si_init_draw_vbo_all_internal_options(sctx); - si_init_draw_vbo_all_internal_options(sctx); + si_init_draw_vbo(sctx); + si_init_draw_vbo(sctx); + si_init_draw_vbo(sctx); + si_init_draw_vbo(sctx); + si_init_draw_vbo(sctx); + si_init_draw_vbo(sctx); + si_init_draw_vbo(sctx); + si_init_draw_vbo(sctx); } static void si_invalid_draw_vbo(struct pipe_context *pipe, diff --git a/src/gallium/drivers/radeonsi/si_state_msaa.c b/src/gallium/drivers/radeonsi/si_state_msaa.c index 5412a87f0a1..8ffe2901970 100644 --- a/src/gallium/drivers/radeonsi/si_state_msaa.c +++ b/src/gallium/drivers/radeonsi/si_state_msaa.c @@ -81,8 +81,8 @@ * Right half: {1,3,5,7,9,11,13,15} */ -/* Important note: We have to use the standard DX positions, because - * the primitive discard compute shader relies on them. +/* Important note: We have to use the standard DX positions because shader-based culling + * relies on them. */ /* 1x MSAA */ diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 00352ddeae3..dc1d5f795a6 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -70,7 +70,7 @@ void si_get_ir_cache_key(struct si_shader_selector *sel, bool ngg, bool es, shader_variant_flags |= 1 << 0; if (sel->nir) shader_variant_flags |= 1 << 1; - if (si_get_wave_size(sel->screen, sel->info.stage, ngg, es, false, false) == 32) + if (si_get_wave_size(sel->screen, sel->info.stage, ngg, es, false) == 32) shader_variant_flags |= 1 << 2; if (sel->info.stage == MESA_SHADER_FRAGMENT && /* Derivatives imply helper invocations so check for needs_quad_helper_invocations. */ @@ -78,11 +78,9 @@ void si_get_ir_cache_key(struct si_shader_selector *sel, bool ngg, bool es, sel->info.base.fs.uses_discard && sel->screen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL)) shader_variant_flags |= 1 << 3; - if (sel->info.stage == MESA_SHADER_VERTEX) { - /* This varies depending on whether compute-based culling is enabled. */ - assert(sel->screen->num_vbos_in_user_sgprs <= 7); - shader_variant_flags |= MIN2(sel->screen->num_vbos_in_user_sgprs, 7) << 4; - } + + /* bit gap */ + if (sel->screen->options.no_infinite_interp) shader_variant_flags |= 1 << 7; if (sel->screen->options.clamp_div_by_zero) @@ -2291,10 +2289,8 @@ current_not_ready: /* Compile the main shader part if it doesn't exist. This can happen * if the initial guess was wrong. - * - * The prim discard CS doesn't need the main shader part. 
*/ - if (!is_pure_monolithic && !key->opt.vs_as_prim_discard_cs) { + if (!is_pure_monolithic) { bool ok = true; /* Make sure the main shader part is present. This is needed @@ -2348,8 +2344,7 @@ current_not_ready: shader->is_monolithic = is_pure_monolithic || memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0; - /* The prim discard CS is always optimized. */ - shader->is_optimized = (!is_pure_monolithic || key->opt.vs_as_prim_discard_cs) && + shader->is_optimized = !is_pure_monolithic && memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0; /* If it's an optimized shader, compile it asynchronously. */ @@ -2706,12 +2701,6 @@ static void *si_create_shader_selector(struct pipe_context *ctx, sel->vs_needs_prolog = sel->info.stage == MESA_SHADER_VERTEX && sel->info.num_inputs && !sel->info.base.vs.blit_sgprs_amd; - sel->prim_discard_cs_allowed = - sel->info.stage == MESA_SHADER_VERTEX && !sel->info.uses_bindless_images && - !sel->info.uses_bindless_samplers && !sel->info.base.writes_memory && - !sel->info.writes_viewport_index && - !sel->info.base.vs.window_space_position && !sel->so.num_outputs; - if (sel->info.stage == MESA_SHADER_VERTEX || sel->info.stage == MESA_SHADER_TESS_CTRL || sel->info.stage == MESA_SHADER_TESS_EVAL || diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c index 0792055ccca..747fe281d27 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c @@ -771,9 +771,6 @@ static unsigned amdgpu_ib_max_submit_dwords(enum ib_type ib_type) * http://www.phoronix.com/scan.php?page=article&item=mesa-111-si&num=1 */ return 20 * 1024; - case IB_PARALLEL_COMPUTE: - /* Always chain this IB. */ - return UINT_MAX; default: unreachable("bad ib_type"); } @@ -908,9 +905,6 @@ static bool amdgpu_init_cs_context(struct amdgpu_winsys *ws, assert(0); } - cs->ib[IB_PARALLEL_COMPUTE].ip_type = AMDGPU_HW_IP_COMPUTE; - cs->ib[IB_PARALLEL_COMPUTE].flags = AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE; - cs->last_added_bo = NULL; return true; } @@ -938,8 +932,6 @@ static void amdgpu_cs_context_cleanup(struct amdgpu_winsys *ws, struct amdgpu_cs cleanup_fence_list(&cs->fence_dependencies); cleanup_fence_list(&cs->syncobj_dependencies); cleanup_fence_list(&cs->syncobj_to_signal); - cleanup_fence_list(&cs->compute_fence_dependencies); - cleanup_fence_list(&cs->compute_start_fence_dependencies); cs->num_real_buffers = 0; cs->num_slab_buffers = 0; @@ -957,8 +949,6 @@ static void amdgpu_destroy_cs_context(struct amdgpu_winsys *ws, struct amdgpu_cs FREE(cs->fence_dependencies.list); FREE(cs->syncobj_dependencies.list); FREE(cs->syncobj_to_signal.list); - FREE(cs->compute_fence_dependencies.list); - FREE(cs->compute_start_fence_dependencies.list); } @@ -997,7 +987,6 @@ amdgpu_cs_create(struct radeon_cmdbuf *rcs, amdgpu_cs_chunk_fence_info_to_data(&fence_info, (void*)&cs->fence_chunk); cs->main.ib_type = IB_MAIN; - cs->compute_ib.ib_type = IB_PARALLEL_COMPUTE; if (!amdgpu_init_cs_context(ctx->ws, &cs->csc1, ring_type)) { FREE(cs); @@ -1035,37 +1024,6 @@ amdgpu_cs_create(struct radeon_cmdbuf *rcs, return true; } -static bool -amdgpu_cs_add_parallel_compute_ib(struct radeon_cmdbuf *compute_cs, - struct radeon_cmdbuf *gfx_cs, - bool uses_gds_ordered_append) -{ - struct amdgpu_cs *cs = amdgpu_cs(gfx_cs); - struct amdgpu_winsys *ws = cs->ws; - - if (cs->ring_type != RING_GFX) - return false; - - /* only one secondary IB can be added */ - if (cs->compute_ib.ib_mapped) - return false; - - /* Allocate the compute IB. 
*/ - if (!amdgpu_get_new_ib(ws, compute_cs, &cs->compute_ib, cs)) - return false; - - if (uses_gds_ordered_append) { - cs->csc1.ib[IB_PARALLEL_COMPUTE].flags |= - AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID; - cs->csc2.ib[IB_PARALLEL_COMPUTE].flags |= - AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID; - } - - cs->compute_ib.rcs = compute_cs; - compute_cs->priv = cs; - return true; -} - static bool amdgpu_cs_setup_preemption(struct radeon_cmdbuf *rcs, const uint32_t *preamble_ib, unsigned preamble_num_dw) @@ -1128,7 +1086,7 @@ static bool amdgpu_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw, bool force_chaining) { struct amdgpu_cs *cs = amdgpu_cs(rcs); - struct amdgpu_ib *ib = rcs == cs->main.rcs ? &cs->main : &cs->compute_ib; + struct amdgpu_ib *ib = &cs->main; unsigned cs_epilog_dw = amdgpu_cs_epilog_dws(cs); unsigned need_byte_size = (dw + cs_epilog_dw) * 4; @@ -1286,18 +1244,6 @@ static void amdgpu_cs_add_fence_dependency(struct radeon_cmdbuf *rws, util_queue_fence_wait(&fence->submitted); - if (dependency_flags & RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY) { - /* Syncobjs are not needed here. */ - assert(!amdgpu_fence_is_syncobj(fence)); - - if (acs->ws->info.has_scheduled_fence_dependency && - dependency_flags & RADEON_DEPENDENCY_START_FENCE) - add_fence_to_list(&cs->compute_start_fence_dependencies, fence); - else - add_fence_to_list(&cs->compute_fence_dependencies, fence); - return; - } - /* Start fences are not needed here. */ assert(!(dependency_flags & RADEON_DEPENDENCY_START_FENCE)); @@ -1589,66 +1535,6 @@ static void amdgpu_cs_submit_ib(void *job, void *gdata, int thread_index) num_chunks++; } - /* Submit the parallel compute IB first. */ - if (cs->ib[IB_PARALLEL_COMPUTE].ib_bytes > 0) { - unsigned old_num_chunks = num_chunks; - - /* Add compute fence dependencies. */ - unsigned num_dependencies = cs->compute_fence_dependencies.num; - if (num_dependencies) { - struct drm_amdgpu_cs_chunk_dep *dep_chunk = - alloca(num_dependencies * sizeof(*dep_chunk)); - - for (unsigned i = 0; i < num_dependencies; i++) { - struct amdgpu_fence *fence = - (struct amdgpu_fence*)cs->compute_fence_dependencies.list[i]; - - assert(util_queue_fence_is_signalled(&fence->submitted)); - amdgpu_cs_chunk_fence_to_dep(&fence->fence, &dep_chunk[i]); - } - - chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_DEPENDENCIES; - chunks[num_chunks].length_dw = sizeof(dep_chunk[0]) / 4 * num_dependencies; - chunks[num_chunks].chunk_data = (uintptr_t)dep_chunk; - num_chunks++; - } - - /* Add compute start fence dependencies. */ - unsigned num_start_dependencies = cs->compute_start_fence_dependencies.num; - if (num_start_dependencies) { - struct drm_amdgpu_cs_chunk_dep *dep_chunk = - alloca(num_start_dependencies * sizeof(*dep_chunk)); - - for (unsigned i = 0; i < num_start_dependencies; i++) { - struct amdgpu_fence *fence = - (struct amdgpu_fence*)cs->compute_start_fence_dependencies.list[i]; - - assert(util_queue_fence_is_signalled(&fence->submitted)); - amdgpu_cs_chunk_fence_to_dep(&fence->fence, &dep_chunk[i]); - } - - chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES; - chunks[num_chunks].length_dw = sizeof(dep_chunk[0]) / 4 * num_start_dependencies; - chunks[num_chunks].chunk_data = (uintptr_t)dep_chunk; - num_chunks++; - } - - /* Convert from dwords to bytes. 
*/ - cs->ib[IB_PARALLEL_COMPUTE].ib_bytes *= 4; - chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB; - chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4; - chunks[num_chunks].chunk_data = (uintptr_t)&cs->ib[IB_PARALLEL_COMPUTE]; - num_chunks++; - - r = acs->noop ? 0 : amdgpu_cs_submit_raw2(ws->dev, acs->ctx->ctx, bo_list, - num_chunks, chunks, NULL); - if (r) - goto finalize; - - /* Back off the compute chunks. */ - num_chunks = old_num_chunks; - } - /* Syncobj signals. */ unsigned num_syncobj_to_signal = cs->syncobj_to_signal.num; if (num_syncobj_to_signal) { @@ -1706,7 +1592,7 @@ static void amdgpu_cs_submit_ib(void *job, void *gdata, int thread_index) r = acs->noop ? 0 : amdgpu_cs_submit_raw2(ws->dev, acs->ctx->ctx, bo_list, num_chunks, chunks, &seq_no); } -finalize: + if (r) { if (r == -ENOMEM) fprintf(stderr, "amdgpu: Not enough memory for command submission.\n"); @@ -1798,12 +1684,6 @@ static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs, } if (cs->ring_type == RING_GFX) ws->gfx_ib_size_counter += (rcs->prev_dw + rcs->current.cdw) * 4; - - /* Also pad secondary IBs. */ - if (cs->compute_ib.ib_mapped) { - while (cs->compute_ib.rcs->current.cdw & ib_pad_dw_mask) - radeon_emit(cs->compute_ib.rcs, PKT3_NOP_PAD); - } break; case RING_UVD: case RING_UVD_ENC: @@ -1839,9 +1719,6 @@ static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs, /* Set IB sizes. */ amdgpu_ib_finalize(ws, rcs, &cs->main); - if (cs->compute_ib.ib_mapped) - amdgpu_ib_finalize(ws, cs->compute_ib.rcs, &cs->compute_ib); - /* Create a fence. */ amdgpu_fence_reference(&cur->fence, NULL); if (cs->next_fence) { @@ -1897,8 +1774,6 @@ static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs, memset(cs->csc->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist)); amdgpu_get_new_ib(ws, rcs, &cs->main, cs); - if (cs->compute_ib.ib_mapped) - amdgpu_get_new_ib(ws, cs->compute_ib.rcs, &cs->compute_ib, cs); if (cs->preamble_ib_bo) { amdgpu_cs_add_buffer(rcs, cs->preamble_ib_bo, RADEON_USAGE_READ, 0, @@ -1929,9 +1804,6 @@ static void amdgpu_cs_destroy(struct radeon_cmdbuf *rcs) radeon_bo_reference(&cs->ws->dummy_ws.base, &cs->preamble_ib_bo, NULL); radeon_bo_reference(&cs->ws->dummy_ws.base, &cs->main.big_ib_buffer, NULL); FREE(rcs->prev); - radeon_bo_reference(&cs->ws->dummy_ws.base, &cs->compute_ib.big_ib_buffer, NULL); - if (cs->compute_ib.rcs) - FREE(cs->compute_ib.rcs->prev); amdgpu_destroy_cs_context(cs->ws, &cs->csc1); amdgpu_destroy_cs_context(cs->ws, &cs->csc2); amdgpu_fence_reference(&cs->next_fence, NULL); @@ -1954,7 +1826,6 @@ void amdgpu_cs_init_functions(struct amdgpu_screen_winsys *ws) ws->base.ctx_destroy = amdgpu_ctx_destroy; ws->base.ctx_query_reset_status = amdgpu_ctx_query_reset_status; ws->base.cs_create = amdgpu_cs_create; - ws->base.cs_add_parallel_compute_ib = amdgpu_cs_add_parallel_compute_ib; ws->base.cs_setup_preemption = amdgpu_cs_setup_preemption; ws->base.cs_destroy = amdgpu_cs_destroy; ws->base.cs_add_buffer = amdgpu_cs_add_buffer; diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h index 77bde4a070b..4568a6e9b1f 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h @@ -58,7 +58,6 @@ struct amdgpu_cs_buffer { enum ib_type { IB_PREAMBLE, IB_MAIN, - IB_PARALLEL_COMPUTE, IB_NUM, }; @@ -115,10 +114,6 @@ struct amdgpu_cs_context { struct amdgpu_fence_list syncobj_dependencies; struct amdgpu_fence_list syncobj_to_signal; - /* The compute IB uses the dependencies above + these: */ - struct 
amdgpu_fence_list compute_fence_dependencies; - struct amdgpu_fence_list compute_start_fence_dependencies; - struct pipe_fence_handle *fence; /* the error returned from cs_flush for non-async submissions */ @@ -132,7 +127,6 @@ struct amdgpu_cs_context { struct amdgpu_cs { struct amdgpu_ib main; /* must be first because this is inherited */ - struct amdgpu_ib compute_ib; /* optional parallel compute IB */ struct amdgpu_winsys *ws; struct amdgpu_ctx *ctx; enum ring_type ring_type;
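/* Editor's note (not part of the patch): the si_pipe.h and si_state_draw.cpp hunks
 * above shrink the draw_vbo dispatch table from draw_vbo[2][2][2][2] to
 * draw_vbo[2][2][2] once the prim-discard axis is dropped. The sketch below is a
 * minimal, self-contained illustration of that kind of boolean-indexed function
 * table; all names and types here are simplified stand-ins, not radeonsi's
 * actual API. */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

typedef void (*draw_vbo_func)(void);

/* One slot per (has_tess, has_gs, ngg) combination, mirroring the reduced
 * three-dimensional table in the patch. */
static draw_vbo_func draw_vbo[2][2][2];

static void draw_legacy(void) { puts("legacy VS draw"); }
static void draw_tess(void)   { puts("tessellation draw"); }
static void draw_gs(void)     { puts("geometry-shader draw"); }
static void draw_ngg(void)    { puts("NGG draw"); }

static void init_draw_vbo_table(void)
{
   /* The driver fills every slot with a template instantiation; a few
    * representative entries are enough for this illustration. */
   draw_vbo[0][0][0] = draw_legacy;
   draw_vbo[1][0][0] = draw_tess;
   draw_vbo[0][1][0] = draw_gs;
   draw_vbo[0][0][1] = draw_ngg;
}

/* Analogous in spirit to si_select_draw_vbo(): index by current pipeline state. */
static draw_vbo_func select_draw_vbo(bool has_tess, bool has_gs, bool ngg)
{
   draw_vbo_func fn = draw_vbo[has_tess][has_gs][ngg];
   assert(fn && "combination not initialized in this sketch");
   return fn;
}

int main(void)
{
   init_draw_vbo_table();
   select_draw_vbo(false, false, true)();  /* prints "NGG draw" */
   return 0;
}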