diff --git a/src/gallium/frontends/lavapipe/lvp_execute.c b/src/gallium/frontends/lavapipe/lvp_execute.c
index 086347ad36c..3bad295199d 100644
--- a/src/gallium/frontends/lavapipe/lvp_execute.c
+++ b/src/gallium/frontends/lavapipe/lvp_execute.c
@@ -83,6 +83,7 @@ struct rendering_state {
    bool constbuf_dirty[PIPE_SHADER_TYPES];
    bool pcbuf_dirty[PIPE_SHADER_TYPES];
    bool has_pcbuf[PIPE_SHADER_TYPES];
+   bool inlines_dirty[PIPE_SHADER_TYPES];
    bool vp_dirty;
    bool scissor_dirty;
    bool ib_dirty;
@@ -177,6 +178,8 @@ struct rendering_state {
    uint32_t num_so_targets;
    struct pipe_stream_output_target *so_targets[PIPE_MAX_SO_BUFFERS];
    uint32_t so_offsets[PIPE_MAX_SO_BUFFERS];
+
+   struct lvp_pipeline *pipeline[2];
 };
 
 ALWAYS_INLINE static void
@@ -256,6 +259,95 @@ update_pcbuf(struct rendering_state *state, enum pipe_shader_type pstage)
    state->pcbuf_dirty[pstage] = false;
 }
 
+static void
+update_inline_shader_state(struct rendering_state *state, enum pipe_shader_type sh, bool pcbuf_dirty, bool constbuf_dirty)
+{
+   bool is_compute = sh == PIPE_SHADER_COMPUTE;
+   uint32_t inline_uniforms[MAX_INLINABLE_UNIFORMS];
+   unsigned stage = tgsi_processor_to_shader_stage(sh);
+   state->inlines_dirty[sh] = false;
+   if (!state->pipeline[is_compute]->inlines[stage].can_inline)
+      return;
+   struct lvp_pipeline *pipeline = state->pipeline[is_compute];
+   /* these buffers have already been flushed in llvmpipe, so they're safe to read */
+   nir_shader *nir = nir_shader_clone(pipeline->pipeline_nir[stage], pipeline->pipeline_nir[stage]);
+   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+   unsigned ssa_alloc = impl->ssa_alloc;
+   unsigned count = pipeline->inlines[stage].count[0];
+   if (count && pcbuf_dirty) {
+      unsigned push_size = get_pcbuf_size(state, sh);
+      for (unsigned i = 0; i < count; i++) {
+         unsigned offset = pipeline->inlines[stage].uniform_offsets[0][i];
+         if (offset < push_size) {
+            memcpy(&inline_uniforms[i], &state->push_constants[offset], sizeof(uint32_t));
+         } else {
+            for (unsigned j = 0; j < state->uniform_blocks[sh].count; j++) {
+               if (offset < push_size + state->uniform_blocks[sh].size[j]) {
+                  unsigned ubo_offset = offset - push_size;
+                  uint8_t *block = state->uniform_blocks[sh].block[j];
+                  memcpy(&inline_uniforms[i], &block[ubo_offset], sizeof(uint32_t));
+                  break;
+               }
+               push_size += state->uniform_blocks[sh].size[j];
+            }
+         }
+      }
+      NIR_PASS_V(nir, lvp_inline_uniforms, pipeline, inline_uniforms, 0);
+   }
+   if (constbuf_dirty) {
+      struct pipe_box box = {0};
+      u_foreach_bit(slot, pipeline->inlines[stage].can_inline) {
+         unsigned count = pipeline->inlines[stage].count[slot];
+         struct pipe_constant_buffer *cbuf = &state->const_buffer[sh][slot - 1];
+         struct pipe_resource *pres = cbuf->buffer;
+         box.x = cbuf->buffer_offset;
+         box.width = cbuf->buffer_size - cbuf->buffer_offset;
+         struct pipe_transfer *xfer;
+         uint8_t *map = state->pctx->buffer_map(state->pctx, pres, 0, PIPE_MAP_READ, &box, &xfer);
+         for (unsigned i = 0; i < count; i++) {
+            unsigned offset = pipeline->inlines[stage].uniform_offsets[slot][i];
+            memcpy(&inline_uniforms[i], map + offset, sizeof(uint32_t));
+         }
+         state->pctx->buffer_unmap(state->pctx, xfer);
+         NIR_PASS_V(nir, lvp_inline_uniforms, pipeline, inline_uniforms, slot);
+      }
+   }
+   lvp_shader_optimize(nir);
+   impl = nir_shader_get_entrypoint(nir);
+   void *shader_state;
+   if (ssa_alloc - impl->ssa_alloc < ssa_alloc / 2 &&
+       !pipeline->inlines[stage].must_inline) {
+      /* not enough change; don't inline further */
+      pipeline->inlines[stage].can_inline = 0;
+      ralloc_free(nir);
+      pipeline->shader_cso[sh] = lvp_pipeline_compile(pipeline, nir_shader_clone(NULL, pipeline->pipeline_nir[stage]));
+      shader_state = pipeline->shader_cso[sh];
+   } else {
+      shader_state = lvp_pipeline_compile(pipeline, nir);
+   }
+   switch (sh) {
+   case PIPE_SHADER_VERTEX:
+      state->pctx->bind_vs_state(state->pctx, shader_state);
+      break;
+   case PIPE_SHADER_TESS_CTRL:
+      state->pctx->bind_tcs_state(state->pctx, shader_state);
+      break;
+   case PIPE_SHADER_TESS_EVAL:
+      state->pctx->bind_tes_state(state->pctx, shader_state);
+      break;
+   case PIPE_SHADER_GEOMETRY:
+      state->pctx->bind_gs_state(state->pctx, shader_state);
+      break;
+   case PIPE_SHADER_FRAGMENT:
+      state->pctx->bind_fs_state(state->pctx, shader_state);
+      break;
+   case PIPE_SHADER_COMPUTE:
+      state->pctx->bind_compute_state(state->pctx, shader_state);
+      break;
+   default: break;
+   }
+}
+
 static void emit_compute_state(struct rendering_state *state)
 {
    if (state->iv_dirty[PIPE_SHADER_COMPUTE]) {
@@ -265,9 +357,11 @@ static void emit_compute_state(struct rendering_state *state)
       state->iv_dirty[PIPE_SHADER_COMPUTE] = false;
    }
 
+   bool pcbuf_dirty = state->pcbuf_dirty[PIPE_SHADER_COMPUTE];
    if (state->pcbuf_dirty[PIPE_SHADER_COMPUTE])
       update_pcbuf(state, PIPE_SHADER_COMPUTE);
 
+   bool constbuf_dirty = state->constbuf_dirty[PIPE_SHADER_COMPUTE];
    if (state->constbuf_dirty[PIPE_SHADER_COMPUTE]) {
       for (unsigned i = 0; i < state->num_const_bufs[PIPE_SHADER_COMPUTE]; i++)
          state->pctx->set_constant_buffer(state->pctx, PIPE_SHADER_COMPUTE,
@@ -275,6 +369,9 @@ static void emit_compute_state(struct rendering_state *state)
       state->constbuf_dirty[PIPE_SHADER_COMPUTE] = false;
    }
 
+   if (state->inlines_dirty[PIPE_SHADER_COMPUTE])
+      update_inline_shader_state(state, PIPE_SHADER_COMPUTE, pcbuf_dirty, constbuf_dirty);
+
    if (state->sb_dirty[PIPE_SHADER_COMPUTE]) {
       state->pctx->set_shader_buffers(state->pctx, PIPE_SHADER_COMPUTE,
                                       0, state->num_shader_buffers[PIPE_SHADER_COMPUTE],
@@ -379,9 +476,11 @@ static void emit_state(struct rendering_state *state)
       cso_set_vertex_elements(state->cso, &state->velem);
       state->ve_dirty = false;
    }
-
+   bool constbuf_dirty[PIPE_SHADER_TYPES] = {false};
+   bool pcbuf_dirty[PIPE_SHADER_TYPES] = {false};
    for (sh = 0; sh < PIPE_SHADER_COMPUTE; sh++) {
+      constbuf_dirty[sh] = state->constbuf_dirty[sh];
      if (state->constbuf_dirty[sh]) {
         for (unsigned idx = 0; idx < state->num_const_bufs[sh]; idx++)
            state->pctx->set_constant_buffer(state->pctx, sh,
@@ -391,10 +490,16 @@ static void emit_state(struct rendering_state *state)
    }
 
    for (sh = 0; sh < PIPE_SHADER_COMPUTE; sh++) {
+      pcbuf_dirty[sh] = state->pcbuf_dirty[sh];
      if (state->pcbuf_dirty[sh])
         update_pcbuf(state, sh);
    }
 
+   for (sh = 0; sh < PIPE_SHADER_COMPUTE; sh++) {
+      if (state->inlines_dirty[sh])
+         update_inline_shader_state(state, sh, pcbuf_dirty[sh], constbuf_dirty[sh]);
+   }
+
    for (sh = 0; sh < PIPE_SHADER_COMPUTE; sh++) {
      if (state->sb_dirty[sh]) {
         state->pctx->set_shader_buffers(state->pctx, sh,
@@ -462,7 +567,9 @@ static void handle_compute_pipeline(struct vk_cmd_queue_entry *cmd,
    state->dispatch_info.block[0] = pipeline->pipeline_nir[MESA_SHADER_COMPUTE]->info.workgroup_size[0];
    state->dispatch_info.block[1] = pipeline->pipeline_nir[MESA_SHADER_COMPUTE]->info.workgroup_size[1];
    state->dispatch_info.block[2] = pipeline->pipeline_nir[MESA_SHADER_COMPUTE]->info.workgroup_size[2];
-   state->pctx->bind_compute_state(state->pctx, pipeline->shader_cso[PIPE_SHADER_COMPUTE]);
+   state->inlines_dirty[PIPE_SHADER_COMPUTE] = pipeline->inlines[MESA_SHADER_COMPUTE].can_inline;
+   if (!pipeline->inlines[MESA_SHADER_COMPUTE].can_inline)
+      state->pctx->bind_compute_state(state->pctx, pipeline->shader_cso[PIPE_SHADER_COMPUTE]);
 }
 
 static void
@@ -620,24 +727,34 @@ static void handle_graphics_pipeline(struct vk_cmd_queue_entry *cmd,
       const VkPipelineShaderStageCreateInfo *sh = &pipeline->graphics_create_info.pStages[i];
       switch (sh->stage) {
       case VK_SHADER_STAGE_FRAGMENT_BIT:
-         state->pctx->bind_fs_state(state->pctx, pipeline->shader_cso[PIPE_SHADER_FRAGMENT]);
+         state->inlines_dirty[PIPE_SHADER_FRAGMENT] = pipeline->inlines[MESA_SHADER_FRAGMENT].can_inline;
+         if (!pipeline->inlines[MESA_SHADER_FRAGMENT].can_inline)
+            state->pctx->bind_fs_state(state->pctx, pipeline->shader_cso[PIPE_SHADER_FRAGMENT]);
          has_stage[PIPE_SHADER_FRAGMENT] = true;
          break;
       case VK_SHADER_STAGE_VERTEX_BIT:
-         state->pctx->bind_vs_state(state->pctx, pipeline->shader_cso[PIPE_SHADER_VERTEX]);
+         state->inlines_dirty[PIPE_SHADER_VERTEX] = pipeline->inlines[MESA_SHADER_VERTEX].can_inline;
+         if (!pipeline->inlines[MESA_SHADER_VERTEX].can_inline)
+            state->pctx->bind_vs_state(state->pctx, pipeline->shader_cso[PIPE_SHADER_VERTEX]);
          has_stage[PIPE_SHADER_VERTEX] = true;
          break;
       case VK_SHADER_STAGE_GEOMETRY_BIT:
-         state->pctx->bind_gs_state(state->pctx, pipeline->shader_cso[PIPE_SHADER_GEOMETRY]);
+         state->inlines_dirty[PIPE_SHADER_GEOMETRY] = pipeline->inlines[MESA_SHADER_GEOMETRY].can_inline;
+         if (!pipeline->inlines[MESA_SHADER_GEOMETRY].can_inline)
+            state->pctx->bind_gs_state(state->pctx, pipeline->shader_cso[PIPE_SHADER_GEOMETRY]);
          state->gs_output_lines = pipeline->gs_output_lines ? GS_OUTPUT_LINES : GS_OUTPUT_NOT_LINES;
          has_stage[PIPE_SHADER_GEOMETRY] = true;
          break;
       case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
-         state->pctx->bind_tcs_state(state->pctx, pipeline->shader_cso[PIPE_SHADER_TESS_CTRL]);
+         state->inlines_dirty[PIPE_SHADER_TESS_CTRL] = pipeline->inlines[MESA_SHADER_TESS_CTRL].can_inline;
+         if (!pipeline->inlines[MESA_SHADER_TESS_CTRL].can_inline)
+            state->pctx->bind_tcs_state(state->pctx, pipeline->shader_cso[PIPE_SHADER_TESS_CTRL]);
          has_stage[PIPE_SHADER_TESS_CTRL] = true;
         break;
      case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
-         state->pctx->bind_tes_state(state->pctx, pipeline->shader_cso[PIPE_SHADER_TESS_EVAL]);
+         state->inlines_dirty[PIPE_SHADER_TESS_EVAL] = pipeline->inlines[MESA_SHADER_TESS_EVAL].can_inline;
+         if (!pipeline->inlines[MESA_SHADER_TESS_EVAL].can_inline)
+            state->pctx->bind_tes_state(state->pctx, pipeline->shader_cso[PIPE_SHADER_TESS_EVAL]);
          has_stage[PIPE_SHADER_TESS_EVAL] = true;
          break;
       default:
@@ -995,6 +1112,7 @@ static void handle_pipeline(struct vk_cmd_queue_entry *cmd,
    else
       handle_graphics_pipeline(cmd, state);
    state->push_size[pipeline->is_compute_pipeline] = pipeline->layout->push_constant_size;
+   state->pipeline[pipeline->is_compute_pipeline] = pipeline;
 }
 
 static void handle_vertex_buffers2(struct vk_cmd_queue_entry *cmd,
@@ -1306,6 +1424,7 @@ static void handle_descriptor(struct rendering_state *state,
       assert(descriptor->uniform);
       state->uniform_blocks[p_stage].block[idx] = descriptor->uniform;
       state->pcbuf_dirty[p_stage] = true;
+      state->inlines_dirty[p_stage] = true;
       break;
    }
    case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
@@ -1339,6 +1458,7 @@ static void handle_descriptor(struct rendering_state *state,
       if (state->num_const_bufs[p_stage] <= idx)
          state->num_const_bufs[p_stage] = idx + 1;
       state->constbuf_dirty[p_stage] = true;
+      state->inlines_dirty[p_stage] = true;
       break;
    }
    case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
@@ -2699,6 +2819,12 @@ static void handle_push_constants(struct vk_cmd_queue_entry *cmd,
    state->pcbuf_dirty[PIPE_SHADER_TESS_CTRL] |= (stage_flags & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT) > 0;
    state->pcbuf_dirty[PIPE_SHADER_TESS_EVAL] |= (stage_flags & VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT) > 0;
    state->pcbuf_dirty[PIPE_SHADER_COMPUTE] |= (stage_flags & VK_SHADER_STAGE_COMPUTE_BIT) > 0;
+   state->inlines_dirty[PIPE_SHADER_VERTEX] |= (stage_flags & VK_SHADER_STAGE_VERTEX_BIT) > 0;
+   state->inlines_dirty[PIPE_SHADER_FRAGMENT] |= (stage_flags & VK_SHADER_STAGE_FRAGMENT_BIT) > 0;
+   state->inlines_dirty[PIPE_SHADER_GEOMETRY] |= (stage_flags & VK_SHADER_STAGE_GEOMETRY_BIT) > 0;
+   state->inlines_dirty[PIPE_SHADER_TESS_CTRL] |= (stage_flags & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT) > 0;
+   state->inlines_dirty[PIPE_SHADER_TESS_EVAL] |= (stage_flags & VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT) > 0;
+   state->inlines_dirty[PIPE_SHADER_COMPUTE] |= (stage_flags & VK_SHADER_STAGE_COMPUTE_BIT) > 0;
 }
 
 static void lvp_execute_cmd_buffer(struct lvp_cmd_buffer *cmd_buffer,
diff --git a/src/gallium/frontends/lavapipe/lvp_inline_uniforms.c b/src/gallium/frontends/lavapipe/lvp_inline_uniforms.c
new file mode 100644
index 00000000000..c133b3db51d
--- /dev/null
+++ b/src/gallium/frontends/lavapipe/lvp_inline_uniforms.c
@@ -0,0 +1,449 @@
+/*
+ * Copyright © 2020 Advanced Micro Devices, Inc.
+ * Copyright © 2022 Valve Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/* enhanced version of nir_inline_uniforms that can inline from any uniform buffer
+ * see nir_inline_uniforms.c for more details
+ */
+
+#include "nir_builder.h"
+#include "nir_loop_analyze.h"
+#include "lvp_private.h"
+
+static bool
+src_only_uses_uniforms(const nir_src *src, int component,
+                       uint32_t *uni_offsets, uint8_t *num_offsets)
+{
+   if (!src->is_ssa)
+      return false;
+
+   assert(component < src->ssa->num_components);
+
+   nir_instr *instr = src->ssa->parent_instr;
+
+   switch (instr->type) {
+   case nir_instr_type_alu: {
+      nir_alu_instr *alu = nir_instr_as_alu(instr);
+
+      /* Vector ops only need to check the corresponding component. */
+      if (nir_op_is_vec(alu->op)) {
+         nir_alu_src *alu_src = alu->src + component;
+         return src_only_uses_uniforms(&alu_src->src, alu_src->swizzle[0],
+                                       uni_offsets, num_offsets);
+      }
+
+      /* Return true if all sources return true. */
+      for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++) {
+         nir_alu_src *alu_src = alu->src + i;
+         int input_sizes = nir_op_infos[alu->op].input_sizes[i];
+
+         if (input_sizes == 0) {
+            /* For ops which have no input size, each component of the dest is
+             * determined only by the same component of the srcs.
+             */
+            if (!src_only_uses_uniforms(&alu_src->src, alu_src->swizzle[component],
+                                        uni_offsets, num_offsets))
+               return false;
+         } else {
+            /* For ops which have an input size, all components of the dest are
+             * determined by all components of the srcs (except vec ops).
+             */
+            for (unsigned j = 0; j < input_sizes; j++) {
+               if (!src_only_uses_uniforms(&alu_src->src, alu_src->swizzle[j],
+                                           uni_offsets, num_offsets))
+                  return false;
+            }
+         }
+      }
+      return true;
+   }
+
+   case nir_instr_type_intrinsic: {
+      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+      /* Return true if the intrinsic loads with a constant offset. */
+      if (intr->intrinsic == nir_intrinsic_load_ubo &&
+          nir_src_is_const(intr->src[0]) &&
+          nir_src_is_const(intr->src[1]) &&
+          /* TODO: Can't handle other bit sizes for now. */
+          intr->dest.ssa.bit_size == 32) {
+         uint32_t offset = nir_src_as_uint(intr->src[1]) + component * 4;
+
+         /* Already recorded by another load */
+         uint32_t ubo = nir_src_as_uint(intr->src[0]);
+         for (int i = 0; uni_offsets && i < num_offsets[ubo]; i++) {
+            if (uni_offsets[ubo * MAX_INLINABLE_UNIFORMS + i] == offset)
+               return true;
+         }
+
+         /* Exceeded the inlinable-uniform limit */
+         if (num_offsets && num_offsets[ubo] == MAX_INLINABLE_UNIFORMS)
+            return false;
+
+         /* Record the uniform offset. */
+         if (uni_offsets)
+            uni_offsets[ubo * MAX_INLINABLE_UNIFORMS + num_offsets[ubo]++] = offset;
+         return true;
+      }
+      return false;
+   }
+
+   case nir_instr_type_load_const:
+      /* Always return true for constants. */
+      return true;
+
+   default:
+      return false;
+   }
+}
+
+static bool
+is_induction_variable(const nir_src *src, int component, nir_loop_info *info,
+                      uint32_t *uni_offsets, uint8_t *num_offsets)
+{
+   if (!src->is_ssa)
+      return false;
+
+   assert(component < src->ssa->num_components);
+
+   /* Return true for an induction variable (i.e. i in a for loop) */
+   for (int i = 0; i < info->num_induction_vars; i++) {
+      nir_loop_induction_variable *var = info->induction_vars + i;
+      if (var->def == src->ssa) {
+         /* The induction variable must have a constant initial value (i.e. i = 0),
+          * a constant update value (i.e. i++) and a constant end condition
+          * (i.e. i < 10), so that we know the exact loop count for unrolling
+          * the loop.
+          *
+          * Also add the uniforms that need to be inlined for this induction
+          * variable's initial and update values to become constant, for example:
+          *
+          *     for (i = init; i < count; i += step)
+          *
+          * We collect the uniforms "init" and "step" here.
+          */
+         if (var->init_src) {
+            if (!src_only_uses_uniforms(var->init_src, component,
+                                        uni_offsets, num_offsets))
+               return false;
+         }
+
+         if (var->update_src) {
+            nir_alu_src *alu_src = var->update_src;
+            if (!src_only_uses_uniforms(&alu_src->src,
+                                        alu_src->swizzle[component],
+                                        uni_offsets, num_offsets))
+               return false;
+         }
+
+         return true;
+      }
+   }
+
+   return false;
+}
+
+static void
+add_inlinable_uniforms(const nir_src *cond, nir_loop_info *info,
+                       uint32_t *uni_offsets, uint8_t *num_offsets)
+{
+   uint8_t new_num[PIPE_MAX_CONSTANT_BUFFERS];
+   memcpy(new_num, num_offsets, sizeof(new_num));
+   /* The if condition is always scalar SSA, so the component is 0. */
+   unsigned component = 0;
+
+   /* Allow the induction variable, which implies this is a loop terminator. */
+   if (info) {
+      nir_ssa_scalar cond_scalar = {cond->ssa, 0};
+
+      /* Limit the terminator condition to the case loop unrolling supports,
+       * which is a simple comparison (i.e. "i < count" is supported, but
+       * "i + 1 < count" is not).
+       */
+      if (nir_is_supported_terminator_condition(cond_scalar)) {
+         nir_alu_instr *alu = nir_instr_as_alu(cond->ssa->parent_instr);
+
+         /* One side of the comparison is the induction variable, the other
+          * side uses only uniforms.
+          */
+         for (int i = 0; i < 2; i++) {
+            if (is_induction_variable(&alu->src[i].src, alu->src[i].swizzle[0],
+                                      info, uni_offsets, new_num)) {
+               cond = &alu->src[1 - i].src;
+               component = alu->src[1 - i].swizzle[0];
+               break;
+            }
+         }
+      }
+   }
+
+   /* Only update the uniform count when all uniforms in the expression
+    * can be inlined. Partially inlined uniforms can't eliminate the if/loop.
+    *
+    * For example, if the number of uniforms that can be inlined in a shader
+    * is limited to 4 and we have already added 3 uniforms, then for
+    *
+    *     if (uniform0 + uniform1 == 10)
+    *
+    * only uniform0 could be inlined before exceeding the limit of 4, but the
+    * if statement can only be eliminated when both uniform0 and uniform1 are
+    * inlined.
+    *
+    * The same applies to loops whose induction variable init and update also
+    * contain uniforms, like
+    *
+    *     for (i = uniform0; i < uniform1; i += uniform2)
+    *
+    * the loop can only be unrolled if uniform0, uniform1 and uniform2 are all
+    * inlined at once.
+    */
+   if (src_only_uses_uniforms(cond, component, uni_offsets, new_num))
+      memcpy(num_offsets, new_num, sizeof(new_num));
+}
+
+static bool
+is_src_uniform_load(nir_src src)
+{
+   if (nir_src_bit_size(src) != 32 || nir_src_num_components(src) != 1 || nir_src_is_const(src))
+      return false;
+   return src_only_uses_uniforms(&src, 0, NULL, NULL);
+}
+
+static void
+process_node(nir_cf_node *node, nir_loop_info *info,
+             uint32_t *uni_offsets, uint8_t *num_offsets,
+             struct set *stores)
+{
+   switch (node->type) {
+   case nir_cf_node_if: {
+      nir_if *if_node = nir_cf_node_as_if(node);
+      const nir_src *cond = &if_node->condition;
+      add_inlinable_uniforms(cond, info, uni_offsets, num_offsets);
+
+      /* Do not pass the loop info down, so induction variables are only
+       * allowed in the loop terminator "if":
+       *
+       *     for (i = 0; true; i++)
+       *        if (i == count)
+       *           if (i == num)
+       *
+       *           break
+       *
+       * so "num" won't be inlined because this "if" is not a terminator.
+       */
+      info = NULL;
+
+      foreach_list_typed(nir_cf_node, nested_node, node, &if_node->then_list)
+         process_node(nested_node, info, uni_offsets, num_offsets, stores);
+      foreach_list_typed(nir_cf_node, nested_node, node, &if_node->else_list)
+         process_node(nested_node, info, uni_offsets, num_offsets, stores);
+      break;
+   }
+
+   case nir_cf_node_loop: {
+      nir_loop *loop = nir_cf_node_as_loop(node);
+
+      /* Replace the loop info; there is no nested loop info currently:
+       *
+       *     for (i = 0; i < count0; i++)
+       *        for (j = 0; j < count1; j++)
+       *           if (i == num)
+       *
+       * so "num" won't be inlined because "i" is an induction variable of
+       * the outer loop.
+       */
+      info = loop->info;
+
+      foreach_list_typed(nir_cf_node, nested_node, node, &loop->body) {
+         bool is_terminator = false;
+         list_for_each_entry(nir_loop_terminator, terminator,
+                             &info->loop_terminator_list,
+                             loop_terminator_link) {
+            if (nested_node == &terminator->nif->cf_node) {
+               is_terminator = true;
+               break;
+            }
+         }
+
+         /* Allow induction variables for the terminator "if" only:
+          *
+          *     for (i = 0; i < count; i++)
+          *        if (i == num)
+          *
+          * so "num" won't be inlined if this "if" is not a terminator.
+          */
+         nir_loop_info *use_info = is_terminator ? info : NULL;
+         process_node(nested_node, use_info, uni_offsets, num_offsets, stores);
+      }
+      break;
+   }
+
+   case nir_cf_node_block: {
+      nir_block *block = nir_cf_node_as_block(node);
+      nir_foreach_instr(instr, block) {
+         if (instr->type == nir_instr_type_intrinsic) {
+            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+            if (intr->intrinsic == nir_intrinsic_store_deref && is_src_uniform_load(intr->src[1]))
+               _mesa_set_add(stores, &intr->src[1]);
+         }
+      }
+      break;
+   }
+   default:
+      break;
+   }
+}
+
+bool
+lvp_find_inlinable_uniforms(struct lvp_pipeline *pipeline, nir_shader *shader)
+{
+   bool ret = false;
+   struct set *stores = _mesa_set_create(shader, _mesa_hash_pointer, _mesa_key_pointer_equal);
+   nir_foreach_function(function, shader) {
+      if (function->impl) {
+         nir_metadata_require(function->impl, nir_metadata_loop_analysis, nir_var_all);
+
+         foreach_list_typed(nir_cf_node, node, node, &function->impl->body)
+            process_node(node, NULL, (uint32_t*)pipeline->inlines[shader->info.stage].uniform_offsets, pipeline->inlines[shader->info.stage].count, stores);
+      }
+   }
+   const unsigned threshold = 5;
+   set_foreach(stores, entry) {
+      const nir_src *src = entry->key;
+      unsigned counter = 0;
+      list_for_each_entry(nir_src, rsrc, &src->ssa->uses, use_link) {
+         counter++;
+         if (counter >= threshold)
+            break;
+      }
+      if (counter >= threshold) {
+         uint8_t new_num[PIPE_MAX_CONSTANT_BUFFERS];
+         memcpy(new_num, pipeline->inlines[shader->info.stage].count, sizeof(new_num));
+         if (src_only_uses_uniforms(src, 0, (uint32_t*)pipeline->inlines[shader->info.stage].uniform_offsets, new_num)) {
+            ret = true;
+            memcpy(pipeline->inlines[shader->info.stage].count, new_num, sizeof(new_num));
+         }
+      }
+   }
+   for (unsigned i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
+      if (pipeline->inlines[shader->info.stage].count[i]) {
+         pipeline->inlines[shader->info.stage].can_inline |= BITFIELD_BIT(i);
+         break;
+      }
+   }
+   return ret;
+}
+
+void
+lvp_inline_uniforms(nir_shader *shader, const struct lvp_pipeline *pipeline, const uint32_t *uniform_values, uint32_t ubo)
+{
+   if (!pipeline->inlines[shader->info.stage].can_inline)
+      return;
+
+   nir_foreach_function(function, shader) {
+      if (function->impl) {
+         nir_builder b;
+         nir_builder_init(&b, function->impl);
+         nir_foreach_block(block, function->impl) {
+            nir_foreach_instr_safe(instr, block) {
+               if (instr->type != nir_instr_type_intrinsic)
+                  continue;
+
+               nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+               /* Only replace loads with constant offsets. */
+               if (intr->intrinsic == nir_intrinsic_load_ubo &&
+                   nir_src_is_const(intr->src[0]) &&
+                   nir_src_as_uint(intr->src[0]) == ubo &&
+                   nir_src_is_const(intr->src[1]) &&
+                   /* TODO: Can't handle other bit sizes for now. */
+                   intr->dest.ssa.bit_size == 32) {
+                  int num_components = intr->dest.ssa.num_components;
+                  uint32_t offset = nir_src_as_uint(intr->src[1]);
+                  const unsigned num_uniforms = pipeline->inlines[shader->info.stage].count[ubo];
+                  const unsigned *uniform_dw_offsets = pipeline->inlines[shader->info.stage].uniform_offsets[ubo];
+
+                  if (num_components == 1) {
+                     /* Just replace the uniform load with a constant load. */
+                     for (unsigned i = 0; i < num_uniforms; i++) {
+                        if (offset == uniform_dw_offsets[i]) {
+                           b.cursor = nir_before_instr(&intr->instr);
+                           nir_ssa_def *def = nir_imm_int(&b, uniform_values[i]);
+                           nir_ssa_def_rewrite_uses(&intr->dest.ssa, def);
+                           nir_instr_remove(&intr->instr);
+                           break;
+                        }
+                     }
+                  } else {
+                     /* Lower the vector uniform load to scalar loads and replace
+                      * each found component with a constant load.
+                      */
+                     uint32_t max_offset = offset + num_components;
+                     nir_ssa_def *components[NIR_MAX_VEC_COMPONENTS] = {0};
+                     bool found = false;
+
+                     b.cursor = nir_before_instr(&intr->instr);
+
+                     /* Find the components to replace. */
+                     for (unsigned i = 0; i < num_uniforms; i++) {
+                        uint32_t uni_offset = uniform_dw_offsets[i];
+                        if (uni_offset >= offset && uni_offset < max_offset) {
+                           int index = uni_offset - offset;
+                           components[index] = nir_imm_int(&b, uniform_values[i]);
+                           found = true;
+                        }
+                     }
+
+                     if (!found)
+                        continue;
+
+                     /* Create per-component uniform loads. */
+                     for (unsigned i = 0; i < num_components; i++) {
+                        if (!components[i]) {
+                           uint32_t scalar_offset = (offset + i) * 4;
+                           components[i] = nir_load_ubo(&b, 1, intr->dest.ssa.bit_size,
+                                                        intr->src[0].ssa,
+                                                        nir_imm_int(&b, scalar_offset));
+                           nir_intrinsic_instr *load =
+                              nir_instr_as_intrinsic(components[i]->parent_instr);
+                           nir_intrinsic_set_align(load, NIR_ALIGN_MUL_MAX, scalar_offset);
+                           nir_intrinsic_set_range_base(load, scalar_offset);
+                           nir_intrinsic_set_range(load, 4);
+                        }
+                     }
+
+                     /* Replace the original uniform load. */
+                     nir_ssa_def_rewrite_uses(&intr->dest.ssa,
+                                              nir_vec(&b, components, num_components));
+                     nir_instr_remove(&intr->instr);
+                  }
+               }
+            }
+         }
+
+         nir_metadata_preserve(function->impl, nir_metadata_block_index |
+                                               nir_metadata_dominance);
+      }
+   }
+}
diff --git a/src/gallium/frontends/lavapipe/lvp_pipeline.c b/src/gallium/frontends/lavapipe/lvp_pipeline.c
index d25ed343c2f..798c6b904e5 100644
--- a/src/gallium/frontends/lavapipe/lvp_pipeline.c
+++ b/src/gallium/frontends/lavapipe/lvp_pipeline.c
@@ -1060,6 +1060,9 @@ lvp_shader_compile_to_ir(struct lvp_pipeline *pipeline,
    nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs,
                                nir->info.stage);
 
+   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+   if (impl->ssa_alloc > 100) // skip for small shaders
+      pipeline->inlines[stage].must_inline = lvp_find_inlinable_uniforms(pipeline, nir);
    pipeline->pipeline_nir[stage] = nir;
 
    return VK_SUCCESS;
@@ -1180,15 +1183,12 @@ lvp_pipeline_compile_stage(struct lvp_pipeline *pipeline, nir_shader *nir)
    return NULL;
 }
 
-static VkResult
-lvp_pipeline_compile(struct lvp_pipeline *pipeline,
-                     gl_shader_stage stage)
+void *
+lvp_pipeline_compile(struct lvp_pipeline *pipeline, nir_shader *nir)
 {
    struct lvp_device *device = pipeline->device;
-   device->physical_device->pscreen->finalize_nir(device->physical_device->pscreen, pipeline->pipeline_nir[stage]);
-   nir_shader *nir = nir_shader_clone(NULL, pipeline->pipeline_nir[stage]);
-   pipeline->shader_cso[pipe_shader_type_from_mesa(stage)] = lvp_pipeline_compile_stage(pipeline, nir);
-   return VK_SUCCESS;
+   device->physical_device->pscreen->finalize_nir(device->physical_device->pscreen, nir);
+   return lvp_pipeline_compile_stage(pipeline, nir);
 }
 
 #ifndef NDEBUG
@@ -1450,7 +1450,10 @@ lvp_graphics_pipeline_init(struct lvp_pipeline *pipeline,
       const VkPipelineShaderStageCreateInfo *sinfo = &pipeline->graphics_create_info.pStages[i];
       gl_shader_stage stage = vk_to_mesa_shader_stage(sinfo->stage);
-      lvp_pipeline_compile(pipeline, stage);
+      enum pipe_shader_type pstage = pipe_shader_type_from_mesa(stage);
+      if (!pipeline->inlines[stage].can_inline)
+         pipeline->shader_cso[pstage] = lvp_pipeline_compile(pipeline,
+                                                             nir_shader_clone(NULL, pipeline->pipeline_nir[stage]));
       if (stage == MESA_SHADER_FRAGMENT)
          has_fragment_shader = true;
    }
@@ -1571,7 +1574,8 @@ lvp_compute_pipeline_init(struct lvp_pipeline *pipeline,
    if (result != VK_SUCCESS)
       return result;
 
-   lvp_pipeline_compile(pipeline, MESA_SHADER_COMPUTE);
+   if (!pipeline->inlines[MESA_SHADER_COMPUTE].can_inline)
+      pipeline->shader_cso[PIPE_SHADER_COMPUTE] = lvp_pipeline_compile(pipeline, nir_shader_clone(NULL, pipeline->pipeline_nir[MESA_SHADER_COMPUTE]));
 
    return VK_SUCCESS;
 }
diff --git a/src/gallium/frontends/lavapipe/lvp_private.h b/src/gallium/frontends/lavapipe/lvp_private.h
index ce3f553a604..0ac180c993a 100644
--- a/src/gallium/frontends/lavapipe/lvp_private.h
+++ b/src/gallium/frontends/lavapipe/lvp_private.h
@@ -434,6 +434,12 @@ struct lvp_pipeline {
    bool force_min_sample;
    nir_shader *pipeline_nir[MESA_SHADER_STAGES];
    void *shader_cso[PIPE_SHADER_TYPES];
+   struct {
+      uint32_t uniform_offsets[PIPE_MAX_CONSTANT_BUFFERS][MAX_INLINABLE_UNIFORMS];
+      uint8_t count[PIPE_MAX_CONSTANT_BUFFERS];
+      bool must_inline;
+      uint32_t can_inline; // bitmask
+   } inlines[MESA_SHADER_STAGES];
    gl_shader_stage last_vertex;
    struct pipe_stream_output_info stream_output;
    VkGraphicsPipelineCreateInfo graphics_create_info;
@@ -625,6 +631,12 @@ void
 lvp_shader_optimize(nir_shader *nir);
 void *
 lvp_pipeline_compile_stage(struct lvp_pipeline *pipeline, nir_shader *nir);
+bool
+lvp_find_inlinable_uniforms(struct lvp_pipeline *pipeline, nir_shader *shader);
+void
+lvp_inline_uniforms(nir_shader *shader, const struct lvp_pipeline *pipeline, const uint32_t *uniform_values, uint32_t ubo);
+void *
+lvp_pipeline_compile(struct lvp_pipeline *pipeline, nir_shader *base_nir);
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/gallium/frontends/lavapipe/meson.build b/src/gallium/frontends/lavapipe/meson.build
index a63f3220c31..85ff200fb06 100644
--- a/src/gallium/frontends/lavapipe/meson.build
+++ b/src/gallium/frontends/lavapipe/meson.build
@@ -18,6 +18,7 @@ liblvp_files = files(
   'lvp_util.c',
   'lvp_image.c',
   'lvp_formats.c',
+  'lvp_inline_uniforms.c',
   'lvp_lower_vulkan_resource.c',
   'lvp_lower_vulkan_resource.h',
   'lvp_lower_input_attachments.c',
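
For reference only (not part of the patch): the sketch below is a minimal, self-contained illustration of the per-UBO bookkeeping that src_only_uses_uniforms() and lvp_find_inlinable_uniforms() build up in pipeline->inlines[stage] — recording constant UBO load offsets with deduplication, capping them at MAX_INLINABLE_UNIFORMS, and setting a can_inline bit per UBO. The DEMO_* constants and the record_offset() helper are stand-ins invented for the example; they are not lavapipe or NIR API.

/* Standalone sketch; compile with: cc -std=c11 inline_demo.c && ./a.out
 * (assumed file name; values are illustrative only)
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_MAX_CONSTANT_BUFFERS 16   /* stand-in for PIPE_MAX_CONSTANT_BUFFERS */
#define DEMO_MAX_INLINABLE_UNIFORMS 4  /* stand-in for MAX_INLINABLE_UNIFORMS */

struct demo_inlines {
   /* offsets of inlinable loads, one row per UBO, like lvp_pipeline::inlines */
   uint32_t uniform_offsets[DEMO_MAX_CONSTANT_BUFFERS][DEMO_MAX_INLINABLE_UNIFORMS];
   uint8_t count[DEMO_MAX_CONSTANT_BUFFERS];
   uint32_t can_inline; /* bitmask of UBOs with at least one recorded offset */
};

/* Hypothetical helper: record a constant-offset load from "ubo" at byte offset
 * "offset", skipping duplicates and refusing once the per-UBO cap is reached.
 * Returns true if the offset is (or already was) recorded as inlinable. */
static bool
record_offset(struct demo_inlines *in, uint32_t ubo, uint32_t offset)
{
   for (unsigned i = 0; i < in->count[ubo]; i++) {
      if (in->uniform_offsets[ubo][i] == offset)
         return true; /* already recorded */
   }
   if (in->count[ubo] == DEMO_MAX_INLINABLE_UNIFORMS)
      return false; /* exceeded the inlinable-uniform limit */
   in->uniform_offsets[ubo][in->count[ubo]++] = offset;
   in->can_inline |= 1u << ubo;
   return true;
}

int
main(void)
{
   struct demo_inlines in = {0};

   /* Offsets as they might appear on load_ubo intrinsics with constant offsets:
    * 16 is a duplicate, 64 arrives after the cap of 4 is already reached. */
   uint32_t offsets[] = { 0, 16, 16, 32, 48, 64 };
   for (unsigned i = 0; i < sizeof(offsets) / sizeof(offsets[0]); i++)
      printf("offset %2u -> %s\n", (unsigned)offsets[i],
             record_offset(&in, 0, offsets[i]) ? "inlinable" : "dropped");

   printf("UBO 0: %u offsets recorded, can_inline mask 0x%x\n",
          (unsigned)in.count[0], (unsigned)in.can_inline);
   return 0;
}

In the real patch this bookkeeping is filled at pipeline-creation time (lvp_find_inlinable_uniforms), and update_inline_shader_state() later reads the current push-constant/UBO contents at those recorded offsets and re-runs lvp_inline_uniforms plus lvp_shader_optimize to specialize the shader.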