diff --git a/meson.build b/meson.build index b93077ed837..ade6546b0af 100644 --- a/meson.build +++ b/meson.build @@ -299,7 +299,7 @@ if ['x86_64'].contains(host_machine.cpu_family()) and \ get_option('intel-clc') != 'system' # Require intel-clc with Anv & Iris (for internal shaders) with_intel_clc = get_option('intel-clc') == 'enabled' or \ - with_intel_vk + with_intel_vk or with_gallium_iris else with_intel_clc = false endif diff --git a/src/gallium/drivers/iris/driinfo_iris.h b/src/gallium/drivers/iris/driinfo_iris.h index aead4bc1186..d7310993a01 100644 --- a/src/gallium/drivers/iris/driinfo_iris.h +++ b/src/gallium/drivers/iris/driinfo_iris.h @@ -13,6 +13,7 @@ DRI_CONF_SECTION_PERFORMANCE DRI_CONF_ADAPTIVE_SYNC(true) DRI_CONF_OPT_E(bo_reuse, 1, 0, 1, "Buffer object reuse",) DRI_CONF_OPT_B(intel_tbimr, true, "Enable TBIMR tiled rendering") + DRI_CONF_OPT_I(generated_indirect_threshold, 100, 0, INT32_MAX, "Generated indirect draw threshold") DRI_CONF_SECTION_END DRI_CONF_SECTION_QUALITY diff --git a/src/gallium/drivers/iris/iris_batch.h b/src/gallium/drivers/iris/iris_batch.h index f0cfe4fb031..29cb3a93d2c 100644 --- a/src/gallium/drivers/iris/iris_batch.h +++ b/src/gallium/drivers/iris/iris_batch.h @@ -243,6 +243,12 @@ iris_batch_bytes_used(struct iris_batch *batch) return batch->map_next - batch->map; } +static inline uint64_t +iris_batch_current_address_u64(struct iris_batch *batch) +{ + return batch->bo->address + (batch->map_next - batch->map); +} + /** * Ensure the current command buffer has \param size bytes of space * remaining. If not, this creates a secondary batch buffer and emits diff --git a/src/gallium/drivers/iris/iris_binder.c b/src/gallium/drivers/iris/iris_binder.c index 4d821ee7c18..2d5c3f05a51 100644 --- a/src/gallium/drivers/iris/iris_binder.c +++ b/src/gallium/drivers/iris/iris_binder.c @@ -119,6 +119,23 @@ iris_binder_reserve(struct iris_context *ice, return binder_insert(binder, size); } +/** + * Reserve and record binder space for generation shader (FS stage only). + */ +void +iris_binder_reserve_gen(struct iris_context *ice) +{ + struct iris_binder *binder = &ice->state.binder; + + binder->bt_offset[MESA_SHADER_FRAGMENT] = + iris_binder_reserve(ice, sizeof(uint32_t)); + + iris_record_state_size(ice->state.sizes, + binder->bo->address + + binder->bt_offset[MESA_SHADER_FRAGMENT], + sizeof(uint32_t)); +} + /** * Reserve and record binder space for 3D pipeline shader stages. * diff --git a/src/gallium/drivers/iris/iris_binder.h b/src/gallium/drivers/iris/iris_binder.h index 78d38d162f3..8f17df2c250 100644 --- a/src/gallium/drivers/iris/iris_binder.h +++ b/src/gallium/drivers/iris/iris_binder.h @@ -59,6 +59,7 @@ void iris_init_binder(struct iris_context *ice); void iris_destroy_binder(struct iris_binder *binder); uint32_t iris_binder_reserve(struct iris_context *ice, unsigned size); void iris_binder_reserve_3d(struct iris_context *ice); +void iris_binder_reserve_gen(struct iris_context *ice); void iris_binder_reserve_compute(struct iris_context *ice); #endif diff --git a/src/gallium/drivers/iris/iris_context.h b/src/gallium/drivers/iris/iris_context.h index 2ce50c96d07..1ae30663502 100644 --- a/src/gallium/drivers/iris/iris_context.h +++ b/src/gallium/drivers/iris/iris_context.h @@ -714,6 +714,28 @@ struct iris_context { * drawid and is_indexed_draw. They will go in their own vertex element. 
 */
       struct iris_state_ref derived_draw_params;
+
+      struct {
+         /**
+          * Generation fragment shader
+          */
+         struct iris_compiled_shader *shader;
+
+         /**
+          * Ring buffer in which the indirect draw commands are generated
+          */
+         struct iris_bo *ring_bo;
+
+         /**
+          * Allocated iris_gen_indirect_params
+          */
+         struct iris_state_ref params;
+
+         /**
+          * Vertices used to dispatch the generation fragment shader
+          */
+         struct iris_state_ref vertices;
+      } generation;
    } draw;
 
    struct {
@@ -930,6 +952,60 @@ struct iris_context {
    } state;
 };
 
+/**
+ * Push constant data handed over to the indirect draw generation shader
+ */
+struct iris_gen_indirect_params {
+   /**
+    * Address of iris_context::draw::generation::ring_bo
+    */
+   uint64_t generated_cmds_addr;
+   /**
+    * Address of the indirect data to draw with
+    */
+   uint64_t indirect_data_addr;
+   /**
+    * Address inside iris_context::draw::generation::ring_bo where the
+    * draw ids are written
+    */
+   uint64_t draw_id_addr;
+   /**
+    * Address of the indirect count (can be null, in which case
+    * max_draw_count is used)
+    */
+   uint64_t draw_count_addr;
+   /**
+    * Address to jump to in order to generate more draws
+    */
+   uint64_t gen_addr;
+   /**
+    * Address to jump to in order to end the generated draws
+    */
+   uint64_t end_addr;
+   /**
+    * Stride between the indirect draw data
+    */
+   uint32_t indirect_data_stride;
+   /**
+    * Base index of the current generated draws in the ring buffer
+    * (increments by ring_count)
+    */
+   uint32_t draw_base;
+   /**
+    * Maximum number of generated draws if draw_count_addr is null
+    */
+   uint32_t max_draw_count;
+   /**
+    * bits  0-7:  ANV_GENERATED_FLAG_*
+    * bits  8-15: vertex buffer MOCS
+    * bits 16-23: stride between generated commands (in dwords)
+    * bits 24-31: number of bound vertex buffers
+    */
+   uint32_t flags;
+   /**
+    * Number of items to generate in the ring buffer
+    */
+   uint32_t ring_count;
+};
+
 #define perf_debug(dbg, ...) \
do { \
      if (INTEL_DEBUG(DEBUG_PERF)) \
         dbg_printf(__VA_ARGS__); \
@@ -1134,6 +1210,9 @@ bool iris_blorp_upload_shader(struct blorp_batch *blorp_batch,
                               uint32_t stage,
                               uint32_t *kernel_out,
                               void *prog_data_out);
+void iris_ensure_indirect_generation_shader(struct iris_batch *batch);
+
+
 /* iris_resolve.c */
 
 void iris_predraw_resolve_inputs(struct iris_context *ice,
diff --git a/src/gallium/drivers/iris/iris_draw.c b/src/gallium/drivers/iris/iris_draw.c
index 597a18c5c0e..c59ce07132a 100644
--- a/src/gallium/drivers/iris/iris_draw.c
+++ b/src/gallium/drivers/iris/iris_draw.c
@@ -196,6 +196,14 @@ iris_simple_draw_vbo(struct iris_context *ice,
    batch->screen->vtbl.upload_render_state(ice, batch, draw, drawid_offset,
                                            indirect, sc);
 }
 
+static inline bool
+iris_use_draw_indirect_generation(const struct iris_screen *screen,
+                                  const struct pipe_draw_indirect_info *dindirect)
+{
+   return dindirect != NULL &&
+          dindirect->draw_count >= screen->driconf.generated_indirect_threshold;
+}
+
 static void
 iris_indirect_draw_vbo(struct iris_context *ice,
                        const struct pipe_draw_info *dinfo,
@@ -204,6 +212,7 @@ iris_indirect_draw_vbo(struct iris_context *ice,
                        const struct pipe_draw_start_count_bias *draw)
 {
    struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
+   struct iris_screen *screen = batch->screen;
    struct pipe_draw_info info = *dinfo;
    struct pipe_draw_indirect_info indirect = *dindirect;
    const bool use_predicate =
@@ -217,7 +226,14 @@ iris_indirect_draw_vbo(struct iris_context *ice,
 
       iris_update_draw_parameters(ice, &info, drawid_offset, &indirect, draw);
 
-      batch->screen->vtbl.upload_indirect_render_state(ice, &info, &indirect, draw);
+      screen->vtbl.upload_indirect_render_state(ice, &info, &indirect, draw);
+   } else if (iris_use_draw_indirect_generation(screen, &indirect)) {
+      iris_batch_maybe_flush(batch, 1500);
+
+      iris_update_draw_parameters(ice, &info, drawid_offset, &indirect, draw);
+
+      screen->vtbl.upload_indirect_shader_render_state(
+         ice, &info, &indirect, draw);
    } else {
       iris_emit_buffer_barrier_for(batch, iris_resource_bo(indirect.buffer),
                                    IRIS_DOMAIN_VF_READ);
@@ -231,7 +247,7 @@ iris_indirect_draw_vbo(struct iris_context *ice,
 
       if (use_predicate) {
         /* Upload MI_PREDICATE_RESULT to GPR15. */
-         batch->screen->vtbl.load_register_reg64(batch, CS_GPR(15), MI_PREDICATE_RESULT);
+         screen->vtbl.load_register_reg64(batch, CS_GPR(15), MI_PREDICATE_RESULT);
       }
 
       for (int i = 0; i < indirect.draw_count; i++) {
@@ -245,7 +261,7 @@ iris_indirect_draw_vbo(struct iris_context *ice,
 
       if (use_predicate) {
          /* Restore MI_PREDICATE_RESULT. */
-         batch->screen->vtbl.load_register_reg64(batch, MI_PREDICATE_RESULT, CS_GPR(15));
+         screen->vtbl.load_register_reg64(batch, MI_PREDICATE_RESULT, CS_GPR(15));
       }
    }
 
@@ -307,7 +323,19 @@ iris_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info,
       iris_predraw_flush_buffers(ice, batch, stage);
    }
 
-   iris_binder_reserve_3d(ice);
+   /* If we're going to use the generation shader, we need to allocate a
+    * binding table entry for it on <= Gfx9: that platform does not have
+    * a null-rendertarget bit in the send message to the render cache, so
+    * the EOT message might pollute later writes to the actual RT of the
+    * draws.
+    *
+    * The generation code will call iris_binder_reserve_3d() after the
+    * generation draw call.
+    */
+   if (iris_use_draw_indirect_generation(screen, indirect) && devinfo->ver <= 9)
+      iris_binder_reserve_gen(ice);
+   else
+      iris_binder_reserve_3d(ice);
 
    batch->screen->vtbl.update_binder_address(batch, &ice->state.binder);
 
diff --git a/src/gallium/drivers/iris/iris_genx_macros.h b/src/gallium/drivers/iris/iris_genx_macros.h
index 91ab6926fec..ae0e37e1961 100644
--- a/src/gallium/drivers/iris/iris_genx_macros.h
+++ b/src/gallium/drivers/iris/iris_genx_macros.h
@@ -162,3 +162,10 @@ rw_bo(struct iris_bo *bo, uint64_t offset, enum iris_domain access)
    return (struct iris_address) { .bo = bo, .offset = offset,
                                   .access = access };
 }
+
+UNUSED static struct iris_address
+iris_address_add(struct iris_address addr, uint64_t offset)
+{
+   addr.offset += offset;
+   return addr;
+}
diff --git a/src/gallium/drivers/iris/iris_genx_protos.h b/src/gallium/drivers/iris/iris_genx_protos.h
index 7a7a136bd04..d5911538598 100644
--- a/src/gallium/drivers/iris/iris_genx_protos.h
+++ b/src/gallium/drivers/iris/iris_genx_protos.h
@@ -74,3 +74,12 @@ void genX(math_add32_gpr0)(struct iris_context *ice,
 void genX(math_div32_gpr0)(struct iris_context *ice,
                            struct iris_batch *batch,
                            uint32_t D);
+
+/* iris_indirect_gen.c */
+void genX(init_screen_gen_state)(struct iris_screen *screen);
+struct iris_gen_indirect_params *
+genX(emit_indirect_generate)(struct iris_batch *batch,
+                             const struct pipe_draw_info *draw,
+                             const struct pipe_draw_indirect_info *indirect,
+                             const struct pipe_draw_start_count_bias *sc,
+                             struct iris_address *out_params_addr);
diff --git a/src/gallium/drivers/iris/iris_indirect_gen.c b/src/gallium/drivers/iris/iris_indirect_gen.c
new file mode 100644
index 00000000000..9383f3e9559
--- /dev/null
+++ b/src/gallium/drivers/iris/iris_indirect_gen.c
@@ -0,0 +1,650 @@
+/* Copyright © 2023 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdio.h>
+#include <errno.h>
+
+#ifdef HAVE_VALGRIND
+#include <valgrind.h>
+#include <memcheck.h>
+#define VG(x) x
+#else
+#define VG(x)
+#endif
+
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "pipe/p_screen.h"
+#include "util/u_upload_mgr.h"
+#include "compiler/nir/nir_builder.h"
+#include "compiler/nir/nir_serialize.h"
+#include "intel/compiler/brw_compiler.h"
+#include "intel/common/intel_aux_map.h"
+#include "intel/common/intel_l3_config.h"
+#include "intel/common/intel_sample_positions.h"
+#include "intel/ds/intel_tracepoints.h"
+#include "iris_batch.h"
+#include "iris_context.h"
+#include "iris_defines.h"
+#include "iris_pipe.h"
+#include "iris_resource.h"
+#include "iris_utrace.h"
+
+#include "iris_genx_macros.h"
+#include "intel/common/intel_genX_state.h"
+
+#include "drm-uapi/i915_drm.h"
+
+#include "libintel_shaders.h"
+
+#if GFX_VERx10 == 80
+# include "intel_gfx8_shaders_code.h"
+#elif GFX_VERx10 == 90
+# include "intel_gfx9_shaders_code.h"
+#elif GFX_VERx10 == 110
+# include "intel_gfx11_shaders_code.h"
+#elif GFX_VERx10 == 120
+# include "intel_gfx12_shaders_code.h"
+#elif GFX_VERx10 == 125
+# include "intel_gfx125_shaders_code.h"
+#elif GFX_VERx10 == 200
+# include "intel_gfx20_shaders_code.h"
+#else
+# error "Unsupported generation"
+#endif
+
+#define load_param(b, bit_size, struct_name, field_name)       \
+   nir_load_uniform(b, 1, bit_size, nir_imm_int(b, 0),         \
+                    .base = offsetof(struct_name, field_name), \
+                    .range = bit_size / 8)
+
+static nir_def *
+load_fragment_index(nir_builder *b)
+{
+   nir_def *pos_in = nir_f2i32(b, nir_trim_vector(b, nir_load_frag_coord(b), 2));
+   return nir_iadd(b,
+                   nir_imul_imm(b, nir_channel(b, pos_in, 1),
8192), + nir_channel(b, pos_in, 0)); +} + +static nir_shader * +load_shader_lib(struct iris_screen *screen, void *mem_ctx) +{ + const nir_shader_compiler_options *nir_options = + screen->compiler->nir_options[MESA_SHADER_KERNEL]; + + struct blob_reader blob; + blob_reader_init(&blob, (void *)genX(intel_shaders_nir), + sizeof(genX(intel_shaders_nir))); + return nir_deserialize(mem_ctx, nir_options, &blob); +} + +static unsigned +iris_call_generation_shader(struct iris_screen *screen, nir_builder *b) +{ + genX(libiris_write_draw)( + b, + load_param(b, 64, struct iris_gen_indirect_params, generated_cmds_addr), + load_param(b, 64, struct iris_gen_indirect_params, indirect_data_addr), + load_param(b, 64, struct iris_gen_indirect_params, draw_id_addr), + load_param(b, 32, struct iris_gen_indirect_params, indirect_data_stride), + load_param(b, 64, struct iris_gen_indirect_params, draw_count_addr), + load_param(b, 32, struct iris_gen_indirect_params, draw_base), + load_param(b, 32, struct iris_gen_indirect_params, max_draw_count), + load_param(b, 32, struct iris_gen_indirect_params, flags), + load_param(b, 32, struct iris_gen_indirect_params, ring_count), + load_param(b, 64, struct iris_gen_indirect_params, gen_addr), + load_param(b, 64, struct iris_gen_indirect_params, end_addr), + load_fragment_index(b)); + return sizeof(struct iris_gen_indirect_params); +} + +void +genX(init_screen_gen_state)(struct iris_screen *screen) +{ + screen->vtbl.load_shader_lib = load_shader_lib; + screen->vtbl.call_generation_shader = iris_call_generation_shader; +} + +/** + * Stream out temporary/short-lived state. + * + * This allocates space, pins the BO, and includes the BO address in the + * returned offset (which works because all state lives in 32-bit memory + * zones). 
+ */ +static void * +upload_state(struct iris_batch *batch, + struct u_upload_mgr *uploader, + struct iris_state_ref *ref, + unsigned size, + unsigned alignment) +{ + void *p = NULL; + u_upload_alloc(uploader, 0, size, alignment, &ref->offset, &ref->res, &p); + iris_use_pinned_bo(batch, iris_resource_bo(ref->res), false, IRIS_DOMAIN_NONE); + return p; +} + +static uint32_t * +stream_state(struct iris_batch *batch, + struct u_upload_mgr *uploader, + struct pipe_resource **out_res, + unsigned size, + unsigned alignment, + uint32_t *out_offset) +{ + void *ptr = NULL; + + u_upload_alloc(uploader, 0, size, alignment, out_offset, out_res, &ptr); + + struct iris_bo *bo = iris_resource_bo(*out_res); + iris_use_pinned_bo(batch, bo, false, IRIS_DOMAIN_NONE); + + iris_record_state_size(batch->state_sizes, + bo->address + *out_offset, size); + + *out_offset += iris_bo_offset_from_base_address(bo); + + return ptr; +} + +static void +emit_indirect_generate_draw(struct iris_batch *batch, + struct iris_address params_addr, + unsigned params_size, + unsigned ring_count) +{ + struct iris_screen *screen = batch->screen; + struct iris_context *ice = batch->ice; + struct isl_device *isl_dev = &screen->isl_dev; + const struct intel_device_info *devinfo = screen->devinfo; + + /* State emission */ + uint32_t ves_dws[1 + 2 * GENX(VERTEX_ELEMENT_STATE_length)]; + iris_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS), ves_dws, ve) { + ve.DWordLength = 1 + GENX(VERTEX_ELEMENT_STATE_length) * 2 - + GENX(3DSTATE_VERTEX_ELEMENTS_length_bias); + } + iris_pack_state(GENX(VERTEX_ELEMENT_STATE), &ves_dws[1], ve) { + ve.VertexBufferIndex = 1; + ve.Valid = true; + ve.SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT; + ve.SourceElementOffset = 0; + ve.Component0Control = VFCOMP_STORE_SRC; + ve.Component1Control = VFCOMP_STORE_0; + ve.Component2Control = VFCOMP_STORE_0; + ve.Component3Control = VFCOMP_STORE_0; + } + iris_pack_state(GENX(VERTEX_ELEMENT_STATE), &ves_dws[3], ve) { + ve.VertexBufferIndex = 0; + ve.Valid = true; + ve.SourceElementFormat = ISL_FORMAT_R32G32B32_FLOAT; + ve.SourceElementOffset = 0; + ve.Component0Control = VFCOMP_STORE_SRC; + ve.Component1Control = VFCOMP_STORE_SRC; + ve.Component2Control = VFCOMP_STORE_SRC; + ve.Component3Control = VFCOMP_STORE_1_FP; + } + + iris_batch_emit(batch, ves_dws, sizeof(ves_dws)); + + iris_emit_cmd(batch, GENX(3DSTATE_VF_STATISTICS), vf); + iris_emit_cmd(batch, GENX(3DSTATE_VF_SGVS), sgvs) { + sgvs.InstanceIDEnable = true; + sgvs.InstanceIDComponentNumber = COMP_1; + sgvs.InstanceIDElementOffset = 0; + } +#if GFX_VER >= 11 + iris_emit_cmd(batch, GENX(3DSTATE_VF_SGVS_2), sgvs); +#endif + iris_emit_cmd(batch, GENX(3DSTATE_VF_INSTANCING), vfi) { + vfi.InstancingEnable = false; + vfi.VertexElementIndex = 0; + } + iris_emit_cmd(batch, GENX(3DSTATE_VF_INSTANCING), vfi) { + vfi.InstancingEnable = false; + vfi.VertexElementIndex = 1; + } + + iris_emit_cmd(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) { + topo.PrimitiveTopologyType = _3DPRIM_RECTLIST; + } + + ice->shaders.urb.cfg.size[MESA_SHADER_VERTEX] = 1; + ice->shaders.urb.cfg.size[MESA_SHADER_TESS_CTRL] = 1; + ice->shaders.urb.cfg.size[MESA_SHADER_TESS_EVAL] = 1; + ice->shaders.urb.cfg.size[MESA_SHADER_GEOMETRY] = 1; + genX(emit_urb_config)(batch, + false /* has_tess_eval */, + false /* has_geometry */); + + iris_emit_cmd(batch, GENX(3DSTATE_PS_BLEND), ps_blend) { + ps_blend.HasWriteableRT = true; + } + + iris_emit_cmd(batch, GENX(3DSTATE_WM_DEPTH_STENCIL), wm); + +#if GFX_VER >= 12 + iris_emit_cmd(batch, GENX(3DSTATE_DEPTH_BOUNDS), db) 
{ + db.DepthBoundsTestEnable = false; + db.DepthBoundsTestMinValue = 0.0; + db.DepthBoundsTestMaxValue = 1.0; + } +#endif + + iris_emit_cmd(batch, GENX(3DSTATE_MULTISAMPLE), ms); + iris_emit_cmd(batch, GENX(3DSTATE_SAMPLE_MASK), sm) { + sm.SampleMask = 0x1; + } + + iris_emit_cmd(batch, GENX(3DSTATE_VS), vs); + iris_emit_cmd(batch, GENX(3DSTATE_HS), hs); + iris_emit_cmd(batch, GENX(3DSTATE_TE), te); + iris_emit_cmd(batch, GENX(3DSTATE_DS), DS); + + iris_emit_cmd(batch, GENX(3DSTATE_STREAMOUT), so); + + iris_emit_cmd(batch, GENX(3DSTATE_GS), gs); + + iris_emit_cmd(batch, GENX(3DSTATE_CLIP), clip) { + clip.PerspectiveDivideDisable = true; + } + + iris_emit_cmd(batch, GENX(3DSTATE_SF), sf) { +#if GFX_VER >= 12 + sf.DerefBlockSize = ice->state.urb_deref_block_size; +#endif + } + + iris_emit_cmd(batch, GENX(3DSTATE_RASTER), raster) { + raster.CullMode = CULLMODE_NONE; + } + + const struct brw_wm_prog_data *wm_prog_data = (void *) + ice->draw.generation.shader->prog_data; + + iris_emit_cmd(batch, GENX(3DSTATE_SBE), sbe) { + sbe.VertexURBEntryReadOffset = 1; + sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs; + sbe.VertexURBEntryReadLength = MAX2((wm_prog_data->num_varying_inputs + 1) / 2, 1); + sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs; + sbe.ForceVertexURBEntryReadLength = true; + sbe.ForceVertexURBEntryReadOffset = true; +#if GFX_VER >= 9 + for (unsigned i = 0; i < 32; i++) + sbe.AttributeActiveComponentFormat[i] = ACF_XYZW; +#endif + } + + iris_emit_cmd(batch, GENX(3DSTATE_WM), wm) { + if (wm_prog_data->has_side_effects || wm_prog_data->uses_kill) + wm.ForceThreadDispatchEnable = ForceON; + } + + iris_emit_cmd(batch, GENX(3DSTATE_PS), ps) { + intel_set_ps_dispatch_state(&ps, devinfo, wm_prog_data, + 1 /* rasterization_samples */, + 0 /* msaa_flags */); + + ps.VectorMaskEnable = wm_prog_data->uses_vmask; + + ps.BindingTableEntryCount = GFX_VER == 9 ? 
1 : 0; +#if GFX_VER < 20 + ps.PushConstantEnable = wm_prog_data->base.nr_params > 0 || + wm_prog_data->base.ubo_ranges[0].length; +#endif + + ps.DispatchGRFStartRegisterForConstantSetupData0 = + brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0); + ps.DispatchGRFStartRegisterForConstantSetupData1 = + brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1); +#if GFX_VER < 20 + ps.DispatchGRFStartRegisterForConstantSetupData2 = + brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2); +#endif + + ps.KernelStartPointer0 = KSP(ice->draw.generation.shader) + + brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0); + ps.KernelStartPointer1 = KSP(ice->draw.generation.shader) + + brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1); +#if GFX_VER < 20 + ps.KernelStartPointer2 = KSP(ice->draw.generation.shader) + + brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2); +#endif + + ps.MaximumNumberofThreadsPerPSD = devinfo->max_threads_per_psd - 1; + } + + iris_emit_cmd(batch, GENX(3DSTATE_PS_EXTRA), psx) { + psx.PixelShaderValid = true; +#if GFX_VER < 20 + psx.AttributeEnable = wm_prog_data->num_varying_inputs > 0; +#endif + psx.PixelShaderIsPerSample = wm_prog_data->persample_dispatch; + psx.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode; +#if GFX_VER >= 9 +#if GFX_VER >= 20 + assert(!wm_prog_data->pulls_bary); +#else + psx.PixelShaderPullsBary = wm_prog_data->pulls_bary; +#endif + psx.PixelShaderComputesStencil = wm_prog_data->computed_stencil; +#endif + psx.PixelShaderHasUAV = GFX_VER == 8; + } + + iris_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), cc) { + uint32_t cc_vp_address; + uint32_t *cc_vp_map = + stream_state(batch, ice->state.dynamic_uploader, + &ice->state.last_res.cc_vp, + 4 * GENX(CC_VIEWPORT_length), 32, &cc_vp_address); + + iris_pack_state(GENX(CC_VIEWPORT), cc_vp_map, ccv) { + ccv.MinimumDepth = 0.0f; + ccv.MaximumDepth = 1.0f; + } + cc.CCViewportPointer = cc_vp_address; + } + +#if GFX_VER >= 12 + /* Disable Primitive Replication. */ + iris_emit_cmd(batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr); +#endif + +#if GFX_VERx10 == 125 + /* DG2: Wa_22011440098 + * MTL: Wa_18022330953 + * + * In 3D mode, after programming push constant alloc command immediately + * program push constant command(ZERO length) without any commit between + * them. + * + * Note that Wa_16011448509 isn't needed here as all address bits are zero. 
+ */ + iris_emit_cmd(batch, GENX(3DSTATE_CONSTANT_ALL), c) { + /* Update empty push constants for all stages (bitmask = 11111b) */ + c.ShaderUpdateEnable = 0x1f; + c.MOCS = iris_mocs(NULL, isl_dev, 0); + } +#endif + + float x0 = 0.0f, x1 = MIN2(ring_count, 8192); + float y0 = 0.0f, y1 = DIV_ROUND_UP(ring_count, 8192); + float z = 0.0f; + + float *vertices = + upload_state(batch, ice->state.dynamic_uploader, + &ice->draw.generation.vertices, + ALIGN(9 * sizeof(float), 8), 8); + + vertices[0] = x1; vertices[1] = y1; vertices[2] = z; /* v0 */ + vertices[3] = x0; vertices[4] = y1; vertices[5] = z; /* v1 */ + vertices[6] = x0; vertices[7] = y0; vertices[8] = z; /* v2 */ + + + uint32_t vbs_dws[1 + GENX(VERTEX_BUFFER_STATE_length)]; + iris_pack_command(GENX(3DSTATE_VERTEX_BUFFERS), vbs_dws, vbs) { + vbs.DWordLength = ARRAY_SIZE(vbs_dws) - + GENX(3DSTATE_VERTEX_BUFFERS_length_bias); + } + _iris_pack_state(batch, GENX(VERTEX_BUFFER_STATE), &vbs_dws[1], vb) { + vb.VertexBufferIndex = 0; + vb.AddressModifyEnable = true; + vb.BufferStartingAddress = ro_bo(iris_resource_bo(ice->draw.generation.vertices.res), + ice->draw.generation.vertices.offset); + vb.BufferPitch = 3 * sizeof(float); + vb.BufferSize = 9 * sizeof(float); + vb.MOCS = iris_mocs(NULL, isl_dev, ISL_SURF_USAGE_VERTEX_BUFFER_BIT); +#if GFX_VER >= 12 + vb.L3BypassDisable = true; +#endif + } + iris_batch_emit(batch, vbs_dws, sizeof(vbs_dws)); + +#if GFX_VERx10 > 120 + uint32_t const_dws[GENX(3DSTATE_CONSTANT_ALL_length) + + GENX(3DSTATE_CONSTANT_ALL_DATA_length)]; + + iris_pack_command(GENX(3DSTATE_CONSTANT_ALL), const_dws, all) { + all.DWordLength = ARRAY_SIZE(const_dws) - + GENX(3DSTATE_CONSTANT_ALL_length_bias); + all.ShaderUpdateEnable = 1 << MESA_SHADER_FRAGMENT; + all.MOCS = isl_mocs(isl_dev, 0, false); + all.PointerBufferMask = 0x1; + } + _iris_pack_state(batch, GENX(3DSTATE_CONSTANT_ALL_DATA), + &const_dws[GENX(3DSTATE_CONSTANT_ALL_length)], data) { + data.PointerToConstantBuffer = params_addr; + data.ConstantBufferReadLength = DIV_ROUND_UP(params_size, 32); + } + iris_batch_emit(batch, const_dws, sizeof(const_dws)); +#else + /* The Skylake PRM contains the following restriction: + * + * "The driver must ensure The following case does not occur without a + * flush to the 3D engine: 3DSTATE_CONSTANT_* with buffer 3 read length + * equal to zero committed followed by a 3DSTATE_CONSTANT_* with buffer + * 0 read length not equal to zero committed." + * + * To avoid this, we program the highest slot. + */ + iris_emit_cmd(batch, GENX(3DSTATE_CONSTANT_PS), c) { +#if GFX_VER > 8 + c.MOCS = iris_mocs(NULL, isl_dev, ISL_SURF_USAGE_CONSTANT_BUFFER_BIT); +#endif + c.ConstantBody.ReadLength[3] = DIV_ROUND_UP(params_size, 32); + c.ConstantBody.Buffer[3] = params_addr; + } +#endif + +#if GFX_VER <= 9 + /* Gfx9 requires 3DSTATE_BINDING_TABLE_POINTERS_XS to be re-emitted in + * order to commit constants. TODO: Investigate "Disable Gather at Set + * Shader" to go back to legacy mode... + * + * The null writes of the generation shader also appear to disturb the next + * RT writes, so we choose to reemit the binding table to a null RT on Gfx8 + * too. 
+ */ + struct iris_binder *binder = &ice->state.binder; + iris_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_PS), ptr) { + ptr.PointertoPSBindingTable = + binder->bt_offset[MESA_SHADER_FRAGMENT] >> IRIS_BT_OFFSET_SHIFT; + } + uint32_t *bt_map = binder->map + binder->bt_offset[MESA_SHADER_FRAGMENT]; + uint32_t surf_base_offset = binder->bo->address; + bt_map[0] = ice->state.null_fb.offset - surf_base_offset; +#endif + + genX(maybe_emit_breakpoint)(batch, true); + + iris_emit_cmd(batch, GENX(3DPRIMITIVE), prim) { + prim.VertexAccessType = SEQUENTIAL; + prim.PrimitiveTopologyType = _3DPRIM_RECTLIST; + prim.VertexCountPerInstance = 3; + prim.InstanceCount = 1; + } + + + /* We've smashed all state compared to what the normal 3D pipeline + * rendering tracks for GL. + */ + + uint64_t skip_bits = (IRIS_DIRTY_POLYGON_STIPPLE | + IRIS_DIRTY_SO_BUFFERS | + IRIS_DIRTY_SO_DECL_LIST | + IRIS_DIRTY_LINE_STIPPLE | + IRIS_ALL_DIRTY_FOR_COMPUTE | + IRIS_DIRTY_SCISSOR_RECT | + IRIS_DIRTY_VF); + /* Wa_14016820455 + * On Gfx 12.5 platforms, the SF_CL_VIEWPORT pointer can be invalidated + * likely by a read cache invalidation when clipping is disabled, so we + * don't skip its dirty bit here, in order to reprogram it. + */ + if (GFX_VERx10 != 125) + skip_bits |= IRIS_DIRTY_SF_CL_VIEWPORT; + + uint64_t skip_stage_bits = (IRIS_ALL_STAGE_DIRTY_FOR_COMPUTE | + IRIS_STAGE_DIRTY_UNCOMPILED_VS | + IRIS_STAGE_DIRTY_UNCOMPILED_TCS | + IRIS_STAGE_DIRTY_UNCOMPILED_TES | + IRIS_STAGE_DIRTY_UNCOMPILED_GS | + IRIS_STAGE_DIRTY_UNCOMPILED_FS | + IRIS_STAGE_DIRTY_SAMPLER_STATES_VS | + IRIS_STAGE_DIRTY_SAMPLER_STATES_TCS | + IRIS_STAGE_DIRTY_SAMPLER_STATES_TES | + IRIS_STAGE_DIRTY_SAMPLER_STATES_GS); + + if (!ice->shaders.prog[MESA_SHADER_TESS_EVAL]) { + /* Generation disabled tessellation, but it was already off anyway */ + skip_stage_bits |= IRIS_STAGE_DIRTY_TCS | + IRIS_STAGE_DIRTY_TES | + IRIS_STAGE_DIRTY_CONSTANTS_TCS | + IRIS_STAGE_DIRTY_CONSTANTS_TES | + IRIS_STAGE_DIRTY_BINDINGS_TCS | + IRIS_STAGE_DIRTY_BINDINGS_TES; + } + + if (!ice->shaders.prog[MESA_SHADER_GEOMETRY]) { + /* Generation disabled geometry shaders, but it was already off + * anyway + */ + skip_stage_bits |= IRIS_STAGE_DIRTY_GS | + IRIS_STAGE_DIRTY_CONSTANTS_GS | + IRIS_STAGE_DIRTY_BINDINGS_GS; + } + + ice->state.dirty |= ~skip_bits; + ice->state.stage_dirty |= ~skip_stage_bits; + + for (int i = 0; i < ARRAY_SIZE(ice->shaders.urb.cfg.size); i++) + ice->shaders.urb.cfg.size[i] = 0; + +#if GFX_VER <= 9 + /* Now reupdate the binding tables with the new offsets for the actual + * application shaders. 
+ */ + iris_binder_reserve_3d(ice); + screen->vtbl.update_binder_address(batch, binder); +#endif +} + +#define RING_SIZE (128 * 1024) + +static void +ensure_ring_bo(struct iris_context *ice, struct iris_screen *screen) +{ + struct iris_bufmgr *bufmgr = screen->bufmgr; + + if (ice->draw.generation.ring_bo != NULL) + return; + + ice->draw.generation.ring_bo = + iris_bo_alloc(bufmgr, "gen ring", + RING_SIZE, 8, IRIS_MEMZONE_OTHER, + BO_ALLOC_NO_SUBALLOC); + iris_get_backing_bo(ice->draw.generation.ring_bo)->real.kflags |= EXEC_OBJECT_CAPTURE; + +} + +struct iris_gen_indirect_params * +genX(emit_indirect_generate)(struct iris_batch *batch, + const struct pipe_draw_info *draw, + const struct pipe_draw_indirect_info *indirect, + const struct pipe_draw_start_count_bias *sc, + struct iris_address *out_params_addr) +{ + struct iris_screen *screen = batch->screen; + struct iris_context *ice = batch->ice; + + iris_ensure_indirect_generation_shader(batch); + ensure_ring_bo(ice, screen); + + const size_t struct_stride = draw->index_size > 0 ? + sizeof(uint32_t) * 5 : + sizeof(uint32_t) * 4; + unsigned cmd_stride = 0; + if (ice->state.vs_uses_draw_params || + ice->state.vs_uses_derived_draw_params) { + cmd_stride += 4; /* 3DSTATE_VERTEX_BUFFERS */ + + if (ice->state.vs_uses_draw_params) + cmd_stride += 4 * GENX(VERTEX_BUFFER_STATE_length); + + if (ice->state.vs_uses_derived_draw_params) + cmd_stride += 4 * GENX(VERTEX_BUFFER_STATE_length); + } + cmd_stride += 4 * GENX(3DPRIMITIVE_length); + + const unsigned setup_dws = +#if GFX_VER >= 12 + GENX(MI_ARB_CHECK_length) + +#endif + GENX(MI_BATCH_BUFFER_START_length); + const unsigned ring_count = + (RING_SIZE - 4 * setup_dws) / + (cmd_stride + 4 * 2 /* draw_id, is_indexed_draw */); + + uint32_t params_size = align(sizeof(struct iris_gen_indirect_params), 32); + struct iris_gen_indirect_params *params = + upload_state(batch, ice->ctx.const_uploader, + &ice->draw.generation.params, + params_size, 64); + *out_params_addr = + ro_bo(iris_resource_bo(ice->draw.generation.params.res), + ice->draw.generation.params.offset); + + iris_use_pinned_bo(batch, + iris_resource_bo(indirect->buffer), + false, IRIS_DOMAIN_NONE); + if (indirect->indirect_draw_count) { + iris_use_pinned_bo(batch, + iris_resource_bo(indirect->indirect_draw_count), + false, IRIS_DOMAIN_NONE); + } + iris_use_pinned_bo(batch, ice->draw.generation.ring_bo, + false, IRIS_DOMAIN_NONE); + + *params = (struct iris_gen_indirect_params) { + .generated_cmds_addr = ice->draw.generation.ring_bo->address, + .ring_count = ring_count, + .draw_id_addr = ice->draw.generation.ring_bo->address + + ring_count * cmd_stride + + 4 * GENX(MI_BATCH_BUFFER_START_length), + .draw_count_addr = indirect->indirect_draw_count ? + (iris_resource_bo(indirect->indirect_draw_count)->address + + indirect->indirect_draw_count_offset) : 0, + .indirect_data_addr = iris_resource_bo(indirect->buffer)->address + + indirect->offset, + .indirect_data_stride = indirect->stride == 0 ? + struct_stride : indirect->stride, + .max_draw_count = indirect->draw_count, + .flags = (draw->index_size > 0 ? ANV_GENERATED_FLAG_INDEXED : 0) | + (ice->state.predicate == IRIS_PREDICATE_STATE_USE_BIT ? + ANV_GENERATED_FLAG_PREDICATED : 0) | + (ice->state.vs_uses_draw_params ? + ANV_GENERATED_FLAG_BASE : 0) | + (ice->state.vs_uses_derived_draw_params ? 
+ ANV_GENERATED_FLAG_DRAWID : 0) | + (iris_mocs(NULL, &screen->isl_dev, + ISL_SURF_USAGE_VERTEX_BUFFER_BIT) << 8) | + ((cmd_stride / 4) << 16) | + util_bitcount64(ice->state.bound_vertex_buffers) << 24, + }; + + genX(maybe_emit_breakpoint)(batch, true); + + emit_indirect_generate_draw(batch, *out_params_addr, params_size, + MIN2(ring_count, indirect->draw_count)); + + genX(emit_3dprimitive_was)(batch, indirect, ice->state.prim_mode, sc->count); + genX(maybe_emit_breakpoint)(batch, false); + + + return params; +} diff --git a/src/gallium/drivers/iris/iris_program_cache.c b/src/gallium/drivers/iris/iris_program_cache.c index ef02586774a..71e91abe58e 100644 --- a/src/gallium/drivers/iris/iris_program_cache.c +++ b/src/gallium/drivers/iris/iris_program_cache.c @@ -39,6 +39,7 @@ #include "compiler/nir/nir.h" #include "compiler/nir/nir_builder.h" #include "intel/compiler/brw_compiler.h" +#include "intel/compiler/brw_nir.h" #include "iris_context.h" #include "iris_resource.h" @@ -290,3 +291,140 @@ iris_destroy_program_cache(struct iris_context *ice) ralloc_free(ice->shaders.cache); } + +static void +link_libintel_shaders(nir_shader *nir, const nir_shader *libintel) +{ + nir_link_shader_functions(nir, libintel); + NIR_PASS_V(nir, nir_inline_functions); + NIR_PASS_V(nir, nir_remove_non_entrypoints); + NIR_PASS_V(nir, nir_lower_vars_to_explicit_types, nir_var_function_temp, + glsl_get_cl_type_size_align); + NIR_PASS_V(nir, nir_opt_deref); + NIR_PASS_V(nir, nir_lower_vars_to_ssa); + NIR_PASS_V(nir, nir_lower_explicit_io, + nir_var_shader_temp | nir_var_function_temp | nir_var_mem_shared | + nir_var_mem_global, + nir_address_format_62bit_generic); +} + +void +iris_ensure_indirect_generation_shader(struct iris_batch *batch) +{ + struct iris_context *ice = batch->ice; + if (ice->draw.generation.shader) + return; + + struct iris_screen *screen = (struct iris_screen *)ice->ctx.screen; + const struct { + char name[40]; + } key = { + .name = "iris-generation-shader", + }; + ice->draw.generation.shader = + iris_find_cached_shader(ice, IRIS_CACHE_BLORP, sizeof(key), &key); + if (ice->draw.generation.shader != NULL) + return; + + struct brw_compiler *compiler = screen->compiler; + const nir_shader_compiler_options *nir_options = + compiler->nir_options[MESA_SHADER_FRAGMENT]; + + nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, + nir_options, + "iris-indirect-generate"); + + uint32_t uniform_size = + screen->vtbl.call_generation_shader(screen, &b); + + nir_shader *nir = b.shader; + + void *mem_ctx = ralloc_context(NULL); + link_libintel_shaders(nir, screen->vtbl.load_shader_lib(screen, mem_ctx)); + + NIR_PASS_V(nir, nir_lower_vars_to_ssa); + NIR_PASS_V(nir, nir_opt_cse); + NIR_PASS_V(nir, nir_opt_gcm, true); + NIR_PASS_V(nir, nir_opt_peephole_select, 1, false, false); + + NIR_PASS_V(nir, nir_lower_variable_initializers, ~0); + + NIR_PASS_V(nir, nir_split_var_copies); + NIR_PASS_V(nir, nir_split_per_member_structs); + + struct brw_nir_compiler_opts opts = {}; + brw_preprocess_nir(compiler, nir, &opts); + + NIR_PASS_V(nir, nir_propagate_invariant, false); + + NIR_PASS_V(nir, nir_lower_input_attachments, + &(nir_input_attachment_options) { + .use_fragcoord_sysval = true, + .use_layer_id_sysval = true, + }); + + /* Reset sizes before gathering information */ + nir->global_mem_size = 0; + nir->scratch_size = 0; + nir->info.shared_size = 0; + nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir)); + + NIR_PASS_V(nir, nir_copy_prop); + NIR_PASS_V(nir, nir_opt_constant_folding); + 
NIR_PASS_V(nir, nir_opt_dce);
+
+   /* Do vectorizing here. For some reason, when trying to do it in the
+    * backend, this just isn't working.
+    */
+   nir_load_store_vectorize_options options = {
+      .modes = nir_var_mem_ubo | nir_var_mem_ssbo | nir_var_mem_global,
+      .callback = brw_nir_should_vectorize_mem,
+      .robust_modes = (nir_variable_mode)0,
+   };
+   NIR_PASS_V(nir, nir_opt_load_store_vectorize, &options);
+
+   nir->num_uniforms = uniform_size;
+
+   union brw_any_prog_key prog_key;
+   memset(&prog_key, 0, sizeof(prog_key));
+
+   struct brw_wm_prog_data *prog_data = ralloc_size(NULL, sizeof(*prog_data));
+   memset(prog_data, 0, sizeof(*prog_data));
+   prog_data->base.nr_params = nir->num_uniforms / 4;
+
+   brw_nir_analyze_ubo_ranges(compiler, nir, prog_data->base.ubo_ranges);
+
+   struct brw_compile_stats stats[3];
+   struct brw_compile_fs_params params = {
+      .base = {
+         .nir = nir,
+         .log_data = &ice->dbg,
+         .debug_flag = DEBUG_WM,
+         .stats = stats,
+         .mem_ctx = mem_ctx,
+      },
+      .key = &prog_key.wm,
+      .prog_data = prog_data,
+   };
+   const unsigned *program = brw_compile_fs(compiler, &params);
+
+   struct iris_binding_table bt;
+   memset(&bt, 0, sizeof(bt));
+
+   struct iris_compiled_shader *shader =
+      iris_create_shader_variant(screen, ice->shaders.cache,
+                                 IRIS_CACHE_BLORP,
+                                 sizeof(key), &key);
+   iris_finalize_program(shader, &prog_data->base, NULL, NULL, 0, 0, 0, &bt);
+
+   iris_upload_shader(screen, NULL, shader, ice->shaders.cache,
+                      ice->shaders.uploader_driver,
+                      IRIS_CACHE_BLORP, sizeof(key), &key, program);
+
+   ralloc_free(mem_ctx);
+
+   struct iris_bo *bo = iris_resource_bo(shader->assembly.res);
+   iris_use_pinned_bo(batch, bo, false, IRIS_DOMAIN_NONE);
+
+   ice->draw.generation.shader = shader;
+}
diff --git a/src/gallium/drivers/iris/iris_screen.c b/src/gallium/drivers/iris/iris_screen.c
index 334fbee39c2..19a39bc0d47 100644
--- a/src/gallium/drivers/iris/iris_screen.c
+++ b/src/gallium/drivers/iris/iris_screen.c
@@ -891,6 +891,8 @@ iris_screen_create(int fd, const struct pipe_screen_config *config)
       driQueryOptionb(config->options, "intel_enable_wa_14018912822");
    screen->driconf.enable_tbimr =
       driQueryOptionb(config->options, "intel_tbimr");
+   screen->driconf.generated_indirect_threshold =
+      driQueryOptioni(config->options, "generated_indirect_threshold");
 
    screen->precompile = debug_get_bool_option("shader_precompile", true);
 
@@ -941,6 +943,7 @@ iris_screen_create(int fd, const struct pipe_screen_config *config)
    iris_init_screen_program_functions(pscreen);
 
    genX_call(screen->devinfo, init_screen_state, screen);
+   genX_call(screen->devinfo, init_screen_gen_state, screen);
 
    glsl_type_singleton_init_or_ref();
 
diff --git a/src/gallium/drivers/iris/iris_screen.h b/src/gallium/drivers/iris/iris_screen.h
index a6c4b5d0c3d..3e234dcc68b 100644
--- a/src/gallium/drivers/iris/iris_screen.h
+++ b/src/gallium/drivers/iris/iris_screen.h
@@ -45,6 +45,9 @@ struct iris_fs_prog_key;
 struct iris_cs_prog_key;
 enum iris_program_cache_id;
 
+typedef struct nir_builder nir_builder;
+typedef struct nir_shader nir_shader;
+
 struct u_trace;
 
 #define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
@@ -74,6 +77,10 @@ struct iris_vtable {
                                         const struct pipe_draw_info *draw,
                                         const struct pipe_draw_indirect_info *indirect,
                                         const struct pipe_draw_start_count_bias *sc);
+   void (*upload_indirect_shader_render_state)(struct iris_context *ice,
+                                               const struct pipe_draw_info *draw,
+                                               const struct pipe_draw_indirect_info *indirect,
+                                               const struct pipe_draw_start_count_bias *sc);
    void (*update_binder_address)(struct iris_batch *batch,
                                 struct iris_binder *binder);
    void (*upload_compute_state)(struct iris_context *ice,
@@ -151,6 +158,9 @@ struct iris_vtable {
                              struct iris_cs_prog_key *key);
    void (*lost_genx_state)(struct iris_context *ice, struct iris_batch *batch);
    void (*disable_rhwo_optimization)(struct iris_batch *batch, bool disable);
+
+   nir_shader *(*load_shader_lib)(struct iris_screen *screen, void *mem_ctx);
+   unsigned (*call_generation_shader)(struct iris_screen *screen, nir_builder *b);
 };
 
 struct iris_address {
@@ -195,6 +205,7 @@ struct iris_screen {
       float lower_depth_range_rate;
       bool intel_enable_wa_14018912822;
       bool enable_tbimr;
+      unsigned generated_indirect_threshold;
    } driconf;
 
    /** Does the kernel support various features (KERNEL_HAS_* bitfield)? */
diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c
index 3efb9275b61..1f7fb5fcd13 100644
--- a/src/gallium/drivers/iris/iris_state.c
+++ b/src/gallium/drivers/iris/iris_state.c
@@ -8536,6 +8536,189 @@ iris_upload_indirect_render_state(struct iris_context *ice,
 #endif /* GFX_VERx10 >= 125 */
 }
 
+static void
+iris_upload_indirect_shader_render_state(struct iris_context *ice,
+                                         const struct pipe_draw_info *draw,
+                                         const struct pipe_draw_indirect_info *indirect,
+                                         const struct pipe_draw_start_count_bias *sc)
+{
+   assert(indirect);
+
+   struct iris_batch *batch = &ice->batches[IRIS_BATCH_RENDER];
+   UNUSED struct iris_screen *screen = batch->screen;
+   UNUSED const struct intel_device_info *devinfo = screen->devinfo;
+
+   if (ice->state.dirty & IRIS_DIRTY_VERTEX_BUFFER_FLUSHES)
+      flush_vbos(ice, batch);
+
+   iris_batch_sync_region_start(batch);
+
+   /* Always pin the binder. If we're emitting new binding table pointers,
+    * we need it. If not, we're probably inheriting old tables via the
+    * context, and need it anyway. Since true zero-bindings cases are
+    * practically non-existent, just pin it and avoid last_res tracking.
+    */
+   iris_use_pinned_bo(batch, ice->state.binder.bo, false,
+                      IRIS_DOMAIN_NONE);
+
+   if (!batch->contains_draw) {
+      if (GFX_VER == 12) {
+         /* Re-emit constants when starting a new batch buffer in order to
+          * work around push constant corruption on context switch.
+          *
+          * XXX - Provide hardware spec quotation when available.
+          */
+         ice->state.stage_dirty |= (IRIS_STAGE_DIRTY_CONSTANTS_VS  |
+                                    IRIS_STAGE_DIRTY_CONSTANTS_TCS |
+                                    IRIS_STAGE_DIRTY_CONSTANTS_TES |
+                                    IRIS_STAGE_DIRTY_CONSTANTS_GS  |
+                                    IRIS_STAGE_DIRTY_CONSTANTS_FS);
+      }
+      batch->contains_draw = true;
+   }
+
+   if (!batch->contains_draw_with_next_seqno) {
+      iris_restore_render_saved_bos(ice, batch, draw);
+      batch->contains_draw_with_next_seqno = true;
+   }
+
+   if (draw->index_size > 0)
+      iris_emit_index_buffer(ice, batch, draw, sc);
+
+   /* Make sure we have enough space to keep all the commands in the
+    * single BO (because of the jumps).
+    */
+   iris_require_command_space(batch, 2000);
+
+#ifndef NDEBUG
+   struct iris_bo *command_bo = batch->bo;
+#endif
+
+   /* Jump point to generate more draws if we run out of space in the
+    * ring buffer.
+    */
+   uint64_t gen_addr = iris_batch_current_address_u64(batch);
+
+   iris_handle_always_flush_cache(batch);
+
+#if GFX_VER == 9
+   iris_emit_pipe_control_flush(batch, "before generation",
+                                PIPE_CONTROL_VF_CACHE_INVALIDATE);
+#endif
+
+   struct iris_address params_addr;
+   struct iris_gen_indirect_params *params =
+      genX(emit_indirect_generate)(batch, draw, indirect, sc,
+                                   &params_addr);
+
+   iris_emit_pipe_control_flush(batch, "after generation flush",
+                                ((ice->state.vs_uses_draw_params ||
+                                  ice->state.vs_uses_derived_draw_params) ?
+ PIPE_CONTROL_VF_CACHE_INVALIDATE : 0) | + PIPE_CONTROL_STALL_AT_SCOREBOARD | + PIPE_CONTROL_DATA_CACHE_FLUSH | + PIPE_CONTROL_CS_STALL); + + trace_intel_begin_draw(&batch->trace); + + /* Always pin the binder. If we're emitting new binding table pointers, + * we need it. If not, we're probably inheriting old tables via the + * context, and need it anyway. Since true zero-bindings cases are + * practically non-existent, just pin it and avoid last_res tracking. + */ + iris_use_pinned_bo(batch, ice->state.binder.bo, false, + IRIS_DOMAIN_NONE); + + /* Wa_1306463417 - Send HS state for every primitive on gfx11. + * Wa_16011107343 (same for gfx12) + * We implement this by setting TCS dirty on each draw. + */ + if ((INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343) && + ice->shaders.prog[MESA_SHADER_TESS_CTRL]) { + ice->state.stage_dirty |= IRIS_STAGE_DIRTY_TCS; + } + + iris_upload_dirty_render_state(ice, batch, draw, true); + + iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_DRAW, draw, indirect, sc); + + genX(maybe_emit_breakpoint)(batch, true); + +#if GFX_VER >= 12 + iris_emit_cmd(batch, GENX(MI_ARB_CHECK), arb) { + arb.PreParserDisableMask = true; + arb.PreParserDisable = true; + } +#endif + + iris_emit_cmd(batch, GENX(MI_BATCH_BUFFER_START), bbs) { + bbs.AddressSpaceIndicator = ASI_PPGTT; + bbs.BatchBufferStartAddress = (struct iris_address) { + .bo = ice->draw.generation.ring_bo, + }; + } + + /* Run the ring buffer one more time with the next set of commands */ + uint64_t inc_addr = iris_batch_current_address_u64(batch); + { + iris_emit_pipe_control_flush(batch, + "post generated draws wait", + PIPE_CONTROL_STALL_AT_SCOREBOARD | + PIPE_CONTROL_CS_STALL); + + struct mi_builder b; + mi_builder_init(&b, batch->screen->devinfo, batch); + + struct iris_address draw_base_addr = iris_address_add( + params_addr, + offsetof(struct iris_gen_indirect_params, draw_base)); + + const uint32_t mocs = + iris_mocs(draw_base_addr.bo, &screen->isl_dev, 0); + mi_builder_set_mocs(&b, mocs); + + mi_store(&b, mi_mem32(draw_base_addr), + mi_iadd(&b, mi_mem32(draw_base_addr), + mi_imm(params->ring_count))); + + iris_emit_pipe_control_flush(batch, + "post generation base increment", + PIPE_CONTROL_CS_STALL | + PIPE_CONTROL_CONST_CACHE_INVALIDATE); + + iris_emit_cmd(batch, GENX(MI_BATCH_BUFFER_START), bbs) { + bbs.AddressSpaceIndicator = ASI_PPGTT; + bbs.BatchBufferStartAddress = (struct iris_address) { + .offset = gen_addr, + }; + } + } + + /* Exit of the ring buffer */ + uint64_t end_addr = iris_batch_current_address_u64(batch); + +#ifndef NDEBUG + assert(command_bo == batch->bo); +#endif + + genX(emit_3dprimitive_was)(batch, indirect, ice->state.prim_mode, sc->count); + genX(maybe_emit_breakpoint)(batch, false); + + iris_emit_pipe_control_flush(batch, + "post generated draws wait", + PIPE_CONTROL_STALL_AT_SCOREBOARD | + PIPE_CONTROL_CS_STALL); + + params->gen_addr = inc_addr; + params->end_addr = end_addr; + + iris_batch_sync_region_end(batch); + + uint32_t count = (sc) ? sc->count : 0; + count *= draw->instance_count ? 
draw->instance_count : 1; + trace_intel_end_draw(&batch->trace, count); +} + static void iris_load_indirect_location(struct iris_context *ice, struct iris_batch *batch, @@ -8916,6 +9099,8 @@ iris_destroy_state(struct iris_context *ice) pipe_resource_reference(&ice->draw.draw_params.res, NULL); pipe_resource_reference(&ice->draw.derived_draw_params.res, NULL); + pipe_resource_reference(&ice->draw.generation.params.res, NULL); + pipe_resource_reference(&ice->draw.generation.vertices.res, NULL); /* Loop over all VBOs, including ones for draw parameters */ for (unsigned i = 0; i < ARRAY_SIZE(genx->vertex_buffers); i++) { @@ -9974,6 +10159,7 @@ genX(init_screen_state)(struct iris_screen *screen) screen->vtbl.init_copy_context = iris_init_copy_context; screen->vtbl.upload_render_state = iris_upload_render_state; screen->vtbl.upload_indirect_render_state = iris_upload_indirect_render_state; + screen->vtbl.upload_indirect_shader_render_state = iris_upload_indirect_shader_render_state; screen->vtbl.update_binder_address = iris_update_binder_address; screen->vtbl.upload_compute_state = iris_upload_compute_state; screen->vtbl.emit_raw_pipe_control = iris_emit_raw_pipe_control; diff --git a/src/gallium/drivers/iris/meson.build b/src/gallium/drivers/iris/meson.build index 0bbe45125ba..0ebed526c13 100644 --- a/src/gallium/drivers/iris/meson.build +++ b/src/gallium/drivers/iris/meson.build @@ -74,7 +74,7 @@ iris_per_hw_ver_libs = [] foreach v : ['80', '90', '110', '120', '125', '200'] iris_per_hw_ver_libs += static_library( 'iris_per_hw_ver@0@'.format(v), - ['iris_blorp.c', 'iris_query.c', 'iris_state.c', gen_xml_pack], + ['iris_blorp.c', 'iris_query.c', 'iris_state.c', 'iris_indirect_gen.c', gen_xml_pack], include_directories : [inc_include, inc_src, inc_mapi, inc_mesa, inc_gallium, inc_gallium_aux, inc_intel], c_args : [ no_override_init_args, sse2_args, @@ -82,7 +82,7 @@ foreach v : ['80', '90', '110', '120', '125', '200'] ], gnu_symbol_visibility : 'hidden', dependencies : [dep_libdrm, dep_valgrind, idep_genxml, idep_nir_headers, - idep_intel_driver_ds_headers, ], + idep_intel_driver_ds_headers, idep_intel_shaders, ], ) endforeach
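
A note on the push constant interface: load_param() in iris_indirect_gen.c
addresses each field of iris_gen_indirect_params by its byte offset, handed
to nir_load_uniform as .base with .range = bit_size / 8. A minimal sketch of
that offset-based layout, using a hypothetical reduced struct rather than the
real driver type:

   #include <stddef.h>
   #include <stdint.h>
   #include <stdio.h>

   /* Hypothetical reduced version of iris_gen_indirect_params; only the
    * offsetof()-based addressing scheme is the point here.
    */
   struct params {
      uint64_t generated_cmds_addr;
      uint64_t indirect_data_addr;
      uint32_t indirect_data_stride;
      uint32_t ring_count;
   };

   int main(void)
   {
      /* What load_param(b, 64, struct params, generated_cmds_addr)
       * would pass as .base / .range:
       */
      printf("generated_cmds_addr: base=%zu range=8\n",
             offsetof(struct params, generated_cmds_addr));
      printf("indirect_data_stride: base=%zu range=4\n",
             offsetof(struct params, indirect_data_stride));
      return 0;
   }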
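The generation pass dispatches exactly one fragment per ring slot:
emit_indirect_generate_draw() rasterizes a RECTLIST of
MIN2(ring_count, 8192) x DIV_ROUND_UP(ring_count, 8192) pixels, and
load_fragment_index() linearizes gl_FragCoord back into a slot index as
y * 8192 + x. A small sketch of that round trip; 8192 is the row width
hard-coded in the shader, and rejection of indices past the real draw count
is assumed to happen inside the shader, which receives ring_count:

   #include <assert.h>
   #include <stdio.h>

   #define MIN2(a, b)         ((a) < (b) ? (a) : (b))
   #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

   /* Mirrors load_fragment_index(): y * 8192 + x. */
   static unsigned fragment_index(unsigned x, unsigned y)
   {
      return y * 8192 + x;
   }

   int main(void)
   {
      const unsigned ring_count = 20000; /* arbitrary example */
      const unsigned width  = MIN2(ring_count, 8192);
      const unsigned height = DIV_ROUND_UP(ring_count, 8192);

      /* Every slot in [0, ring_count) maps to exactly one pixel of the
       * width x height rectangle.
       */
      for (unsigned i = 0; i < ring_count; i++) {
         unsigned x = i % 8192, y = i / 8192;
         assert(x < width && y < height);
         assert(fragment_index(x, y) == i);
      }

      printf("rect %ux%u covers %u draw slots\n", width, height, ring_count);
      return 0;
   }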
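The ring sizing arithmetic in genX(emit_indirect_generate) can be checked
with a host-side sketch of the same formula. The DWord lengths below are
illustrative assumptions rather than genxml values, for a draw that uses
both draw params and derived draw params on a Gfx12-style part:

   #include <stdio.h>

   #define RING_SIZE (128 * 1024)

   int main(void)
   {
      /* Illustrative command sizes in DWords (assumed, not genxml). */
      const unsigned vbs_state_len = 4; /* VERTEX_BUFFER_STATE */
      const unsigned prim_len      = 7; /* 3DPRIMITIVE */
      const unsigned bbs_len       = 3; /* MI_BATCH_BUFFER_START */
      const unsigned arb_len       = 1; /* MI_ARB_CHECK (Gfx12+) */

      /* Mirrors cmd_stride: 3DSTATE_VERTEX_BUFFERS header DWord, one
       * VERTEX_BUFFER_STATE per draw-params buffer, then the
       * 3DPRIMITIVE itself, all in bytes.
       */
      unsigned cmd_stride = 4;          /* 3DSTATE_VERTEX_BUFFERS */
      cmd_stride += 4 * vbs_state_len;  /* draw params */
      cmd_stride += 4 * vbs_state_len;  /* derived draw params */
      cmd_stride += 4 * prim_len;       /* 3DPRIMITIVE */

      /* Tail of the ring: the jump back to the batch, plus the
       * MI_ARB_CHECK on Gfx12+.
       */
      const unsigned setup_dws = arb_len + bbs_len;

      /* Each draw also stores 8 bytes of draw_id/is_indexed_draw data.
       * With these numbers: cmd_stride = 64 bytes, ring_count = 1820.
       */
      const unsigned ring_count =
         (RING_SIZE - 4 * setup_dws) / (cmd_stride + 4 * 2);

      printf("cmd_stride = %u bytes, ring_count = %u draws per pass\n",
             cmd_stride, ring_count);
      return 0;
   }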
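iris_gen_indirect_params::flags multiplexes four byte-wide fields, per the
struct comment and the packing expression at the end of
genX(emit_indirect_generate). A decode sketch; the struct, helper, and the
example flag values (INDEXED = 1, PREDICATED = 2) are hypothetical, not
driver API:

   #include <stdint.h>

   /* Hypothetical decode of iris_gen_indirect_params::flags:
    *   bits  0-7:  ANV_GENERATED_FLAG_* bits
    *   bits  8-15: vertex buffer MOCS
    *   bits 16-23: generated command stride / 4 (DWords)
    *   bits 24-31: number of bound vertex buffers
    */
   struct gen_flags_fields {
      uint8_t generated_flags;
      uint8_t vb_mocs;
      uint8_t cmd_stride_dws;
      uint8_t vb_count;
   };

   static struct gen_flags_fields
   decode_gen_flags(uint32_t flags)
   {
      return (struct gen_flags_fields) {
         .generated_flags = (uint8_t)(flags & 0xff),
         .vb_mocs         = (uint8_t)((flags >> 8) & 0xff),
         .cmd_stride_dws  = (uint8_t)((flags >> 16) & 0xff),
         .vb_count        = (uint8_t)((flags >> 24) & 0xff),
      };
   }

   int main(void)
   {
      /* Indexed + predicated draw, MOCS 0x4, 16-DWord stride, 2 VBs
       * (flag bit values assumed for illustration).
       */
      uint32_t flags = 0x3 | (0x4 << 8) | (16 << 16) | (2u << 24);
      struct gen_flags_fields f = decode_gen_flags(flags);
      return (f.vb_count == 2 && f.cmd_stride_dws == 16) ? 0 : 1;
   }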
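Control-flow wise, iris_upload_indirect_shader_render_state() builds a loop
between the batch and the ring: the batch jumps into ring_bo, the ring's
trailing MI_BATCH_BUFFER_START returns to params->gen_addr for another
generation pass after MI math bumps draw_base by ring_count, and the
generation shader steers the final jump to params->end_addr once the draws
are exhausted. A hypothetical CPU-side model of that loop, with the
addresses replaced by plain control flow:

   #include <stdio.h>

   int main(void)
   {
      const unsigned draw_count = 5000; /* example indirect draw count */
      const unsigned ring_count = 1820; /* draws per pass (see above) */

      unsigned draw_base = 0;
      unsigned pass = 0;

      while (draw_base < draw_count) {    /* jump back to gen_addr */
         unsigned n = draw_count - draw_base;
         if (n > ring_count)
            n = ring_count;               /* last pass is partial */

         printf("pass %u: draws [%u, %u)\n", pass, draw_base, draw_base + n);

         draw_base += ring_count;         /* mi_store/mi_iadd on draw_base */
         pass++;
      }
      /* shader patches the ring's jump to end_addr: loop exits */
      return 0;
   }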