mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-27 01:50:10 +01:00
Rework it to take all active/enabled shader stages in one shot, to simplify things and drop the xs_configs table. This lets us use the variant reg packers directly to better deal with register changes across generations. Signed-off-by: Rob Clark <rob.clark@oss.qualcomm.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39029>
5063 lines
190 KiB
C++
5063 lines
190 KiB
C++
/*
|
|
* Copyright © 2016 Red Hat.
|
|
* Copyright © 2016 Bas Nieuwenhuizen
|
|
* SPDX-License-Identifier: MIT
|
|
*
|
|
* based in part on anv driver which is:
|
|
* Copyright © 2015 Intel Corporation
|
|
*/
|
|
|
|
#include "tu_pipeline.h"
|
|
|
|
#include "common/freedreno_guardband.h"
|
|
|
|
#include "ir3/ir3_nir.h"
|
|
#include "nir/nir.h"
|
|
#include "nir/nir_builder.h"
|
|
#include "nir/nir_serialize.h"
|
|
#include "spirv/nir_spirv.h"
|
|
#include "util/u_debug.h"
|
|
#include "util/mesa-sha1.h"
|
|
#include "util/shader_stats.h"
|
|
#include "vk_nir.h"
|
|
#include "vk_pipeline.h"
|
|
#include "vk_render_pass.h"
|
|
#include "vk_util.h"
|
|
|
|
#include "tu_cmd_buffer.h"
|
|
#include "tu_cs.h"
|
|
#include "tu_device.h"
|
|
#include "tu_knl.h"
|
|
#include "tu_formats.h"
|
|
#include "tu_lrz.h"
|
|
#include "tu_pass.h"
|
|
#include "tu_rmv.h"
|
|
|
|
/* Emit IB that preloads the descriptors that the shader uses */
|
|
|
|
static void
|
|
emit_load_state(struct tu_cs *cs, unsigned opcode, enum a6xx_state_type st,
|
|
enum a6xx_state_block sb, unsigned base, unsigned offset,
|
|
unsigned count)
|
|
{
|
|
/* Note: just emit one packet, even if count overflows NUM_UNIT. It's not
|
|
* clear if emitting more packets will even help anything. Presumably the
|
|
* descriptor cache is relatively small, and these packets stop doing
|
|
* anything when there are too many descriptors.
|
|
*/
|
|
tu_cs_emit_pkt7(cs, opcode, 3);
|
|
tu_cs_emit(cs,
|
|
CP_LOAD_STATE6_0_STATE_TYPE(st) |
|
|
CP_LOAD_STATE6_0_STATE_SRC(SS6_BINDLESS) |
|
|
CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
|
|
CP_LOAD_STATE6_0_NUM_UNIT(MIN2(count, 1024-1)));
|
|
tu_cs_emit_qw(cs, offset | (base << 28));
|
|
}
|
|
|
|
static unsigned
|
|
tu6_load_state_size(struct tu_pipeline *pipeline,
|
|
struct tu_pipeline_layout *layout)
|
|
{
|
|
const unsigned load_state_size = 4;
|
|
unsigned size = 0;
|
|
for (unsigned i = 0; i < layout->num_sets; i++) {
|
|
if (!(pipeline->active_desc_sets & (1u << i)))
|
|
continue;
|
|
|
|
struct tu_descriptor_set_layout *set_layout = layout->set[i].layout;
|
|
for (unsigned j = 0; j < set_layout->binding_count; j++) {
|
|
struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j];
|
|
unsigned count = 0;
|
|
/* See comment in tu6_emit_load_state(). */
|
|
VkShaderStageFlags stages = pipeline->active_stages & binding->shader_stages;
|
|
unsigned stage_count = util_bitcount(stages);
|
|
|
|
if (!binding->array_size)
|
|
continue;
|
|
|
|
switch (binding->type) {
|
|
case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
|
|
case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
|
|
case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
|
|
case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
|
|
case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR:
|
|
/* UAV-backed resources only need one packet for all graphics stages */
|
|
if (stage_count)
|
|
count += 1;
|
|
break;
|
|
case VK_DESCRIPTOR_TYPE_SAMPLER:
|
|
case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
|
|
case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
|
|
case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
|
|
case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
|
|
/* Textures and UBO's needs a packet for each stage */
|
|
count = stage_count;
|
|
break;
|
|
case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
|
|
/* Because of how we pack combined images and samplers, we
|
|
* currently can't use one packet for the whole array.
|
|
*/
|
|
count = stage_count * binding->array_size * 2;
|
|
break;
|
|
case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
|
|
case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
|
|
case VK_DESCRIPTOR_TYPE_MUTABLE_EXT:
|
|
break;
|
|
default:
|
|
UNREACHABLE("bad descriptor type");
|
|
}
|
|
size += count * load_state_size;
|
|
}
|
|
}
|
|
return size;
|
|
}
|
|
|
|
static void
|
|
tu6_emit_load_state(struct tu_device *device,
|
|
struct tu_pipeline *pipeline,
|
|
struct tu_pipeline_layout *layout)
|
|
{
|
|
unsigned size = tu6_load_state_size(pipeline, layout);
|
|
if (size == 0)
|
|
return;
|
|
|
|
struct tu_cs cs;
|
|
tu_cs_begin_sub_stream(&pipeline->cs, size, &cs);
|
|
|
|
for (unsigned i = 0; i < layout->num_sets; i++) {
|
|
/* From 13.2.7. Descriptor Set Binding:
|
|
*
|
|
* A compatible descriptor set must be bound for all set numbers that
|
|
* any shaders in a pipeline access, at the time that a draw or
|
|
* dispatch command is recorded to execute using that pipeline.
|
|
* However, if none of the shaders in a pipeline statically use any
|
|
* bindings with a particular set number, then no descriptor set need
|
|
* be bound for that set number, even if the pipeline layout includes
|
|
* a non-trivial descriptor set layout for that set number.
|
|
*
|
|
* This means that descriptor sets unused by the pipeline may have a
|
|
* garbage or 0 BINDLESS_BASE register, which will cause context faults
|
|
* when prefetching descriptors from these sets. Skip prefetching for
|
|
* descriptors from them to avoid this. This is also an optimization,
|
|
* since these prefetches would be useless.
|
|
*/
|
|
if (!(pipeline->active_desc_sets & (1u << i)))
|
|
continue;
|
|
|
|
struct tu_descriptor_set_layout *set_layout = layout->set[i].layout;
|
|
for (unsigned j = 0; j < set_layout->binding_count; j++) {
|
|
struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j];
|
|
unsigned base = i;
|
|
unsigned offset = binding->offset / 4;
|
|
/* Note: amber sets VK_SHADER_STAGE_ALL for its descriptor layout, and
|
|
* zink has descriptors for each stage in the push layout even if some
|
|
* stages aren't present in a used pipeline. We don't want to emit
|
|
* loads for unused descriptors.
|
|
*/
|
|
VkShaderStageFlags stages = pipeline->active_stages & binding->shader_stages;
|
|
unsigned count = binding->array_size;
|
|
|
|
/* If this is a variable-count descriptor, then the array_size is an
|
|
* upper bound on the size, but we don't know how many descriptors
|
|
* will actually be used. Therefore we can't pre-load them here.
|
|
*/
|
|
if (j == set_layout->binding_count - 1 &&
|
|
set_layout->has_variable_descriptors)
|
|
continue;
|
|
|
|
if (count == 0 || stages == 0)
|
|
continue;
|
|
switch (binding->type) {
|
|
case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
|
|
assert(device->physical_device->reserved_set_idx >= 0);
|
|
base = device->physical_device->reserved_set_idx;
|
|
offset = (pipeline->program.dynamic_descriptor_offsets[i] +
|
|
binding->dynamic_offset_offset) / 4;
|
|
FALLTHROUGH;
|
|
case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
|
|
case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
|
|
case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
|
|
case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR: {
|
|
unsigned mul = binding->size / (A6XX_TEX_CONST_DWORDS * 4);
|
|
/* UAV-backed resources only need one packet for all graphics stages */
|
|
if (stages & ~VK_SHADER_STAGE_COMPUTE_BIT) {
|
|
emit_load_state(&cs, CP_LOAD_STATE6, ST6_SHADER, SB6_UAV,
|
|
base, offset, count * mul);
|
|
}
|
|
if (stages & VK_SHADER_STAGE_COMPUTE_BIT) {
|
|
emit_load_state(&cs, CP_LOAD_STATE6_FRAG, ST6_UAV, SB6_CS_SHADER,
|
|
base, offset, count * mul);
|
|
}
|
|
break;
|
|
}
|
|
case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
|
|
case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
|
|
case VK_DESCRIPTOR_TYPE_MUTABLE_EXT:
|
|
/* nothing - input attachments and inline uniforms don't use bindless */
|
|
break;
|
|
case VK_DESCRIPTOR_TYPE_SAMPLER:
|
|
case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
|
|
case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: {
|
|
tu_foreach_stage(stage, stages) {
|
|
emit_load_state(&cs, tu6_stage2opcode(stage),
|
|
binding->type == VK_DESCRIPTOR_TYPE_SAMPLER ?
|
|
ST6_SHADER : ST6_CONSTANTS,
|
|
tu6_stage2texsb(stage), base, offset, count);
|
|
}
|
|
break;
|
|
}
|
|
case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
|
|
assert(device->physical_device->reserved_set_idx >= 0);
|
|
base = device->physical_device->reserved_set_idx;
|
|
offset = (pipeline->program.dynamic_descriptor_offsets[i] +
|
|
binding->dynamic_offset_offset) / 4;
|
|
FALLTHROUGH;
|
|
case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: {
|
|
tu_foreach_stage(stage, stages) {
|
|
emit_load_state(&cs, tu6_stage2opcode(stage), ST6_UBO,
|
|
tu6_stage2shadersb(stage), base, offset, count);
|
|
}
|
|
break;
|
|
}
|
|
case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: {
|
|
tu_foreach_stage(stage, stages) {
|
|
/* TODO: We could emit less CP_LOAD_STATE6 if we used
|
|
* struct-of-arrays instead of array-of-structs.
|
|
*/
|
|
for (unsigned i = 0; i < count; i++) {
|
|
unsigned tex_offset = offset + 2 * i * A6XX_TEX_CONST_DWORDS;
|
|
unsigned sam_offset = offset + (2 * i + 1) * A6XX_TEX_CONST_DWORDS;
|
|
emit_load_state(&cs, tu6_stage2opcode(stage),
|
|
ST6_CONSTANTS, tu6_stage2texsb(stage),
|
|
base, tex_offset, 1);
|
|
emit_load_state(&cs, tu6_stage2opcode(stage),
|
|
ST6_SHADER, tu6_stage2texsb(stage),
|
|
base, sam_offset, 1);
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
default:
|
|
UNREACHABLE("bad descriptor type");
|
|
}
|
|
}
|
|
}
|
|
|
|
pipeline->load_state = tu_cs_end_draw_state(&pipeline->cs, &cs);
|
|
}
|
|
|
|
struct tu_pipeline_builder
|
|
{
|
|
struct tu_device *device;
|
|
void *mem_ctx;
|
|
struct vk_pipeline_cache *cache;
|
|
const VkAllocationCallbacks *alloc;
|
|
const VkGraphicsPipelineCreateInfo *create_info;
|
|
VkPipelineCreateFlags2KHR create_flags;
|
|
|
|
struct tu_pipeline_layout layout;
|
|
|
|
struct tu_pvtmem_config pvtmem;
|
|
|
|
bool rasterizer_discard;
|
|
/* these states are affectd by rasterizer_discard */
|
|
uint8_t unscaled_input_fragcoord;
|
|
|
|
/* Each library defines at least one piece of state in
|
|
* VkGraphicsPipelineLibraryFlagsEXT, and libraries cannot overlap, so
|
|
* there can be at most as many libraries as pieces of state, of which
|
|
* there are currently 4.
|
|
*/
|
|
#define MAX_LIBRARIES 4
|
|
|
|
unsigned num_libraries;
|
|
struct tu_graphics_lib_pipeline *libraries[MAX_LIBRARIES];
|
|
|
|
/* This is just the state that we are compiling now, whereas the final
|
|
* pipeline will include the state from the libraries.
|
|
*/
|
|
VkGraphicsPipelineLibraryFlagsEXT state;
|
|
|
|
/* The stages we are compiling now. */
|
|
VkShaderStageFlags active_stages;
|
|
|
|
bool fragment_density_map;
|
|
bool fdm_per_layer;
|
|
uint8_t max_fdm_layers;
|
|
|
|
struct vk_graphics_pipeline_all_state all_state;
|
|
struct vk_graphics_pipeline_state graphics_state;
|
|
};
|
|
|
|
static bool
|
|
tu_logic_op_reads_dst(VkLogicOp op)
|
|
{
|
|
switch (op) {
|
|
case VK_LOGIC_OP_CLEAR:
|
|
case VK_LOGIC_OP_COPY:
|
|
case VK_LOGIC_OP_COPY_INVERTED:
|
|
case VK_LOGIC_OP_SET:
|
|
return false;
|
|
default:
|
|
return true;
|
|
}
|
|
}
|
|
|
|
static bool
|
|
tu_blend_state_is_dual_src(const struct vk_color_blend_state *cb)
|
|
{
|
|
for (unsigned i = 0; i < cb->attachment_count; i++) {
|
|
if (tu_blend_factor_is_dual_src((VkBlendFactor)cb->attachments[i].src_color_blend_factor) ||
|
|
tu_blend_factor_is_dual_src((VkBlendFactor)cb->attachments[i].dst_color_blend_factor) ||
|
|
tu_blend_factor_is_dual_src((VkBlendFactor)cb->attachments[i].src_alpha_blend_factor) ||
|
|
tu_blend_factor_is_dual_src((VkBlendFactor)cb->attachments[i].dst_alpha_blend_factor))
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
enum ir3_push_consts_type
|
|
tu_push_consts_type(const struct tu_pipeline_layout *layout,
|
|
const struct ir3_compiler *compiler)
|
|
{
|
|
if (!layout->push_constant_size)
|
|
return IR3_PUSH_CONSTS_NONE;
|
|
|
|
if (TU_DEBUG(PUSH_CONSTS_PER_STAGE))
|
|
return IR3_PUSH_CONSTS_PER_STAGE;
|
|
|
|
if (tu6_shared_constants_enable(layout, compiler)) {
|
|
return IR3_PUSH_CONSTS_SHARED;
|
|
} else {
|
|
if (compiler->gen >= 7) {
|
|
return IR3_PUSH_CONSTS_SHARED_PREAMBLE;
|
|
} else {
|
|
return IR3_PUSH_CONSTS_PER_STAGE;
|
|
}
|
|
}
|
|
}
|
|
|
|
static uint32_t
|
|
sp_xs_config(const struct ir3_shader_variant *v)
|
|
{
|
|
if (!v)
|
|
return 0;
|
|
|
|
return A6XX_SP_VS_CONFIG_ENABLED |
|
|
COND(v->bindless_tex, A6XX_SP_VS_CONFIG_BINDLESS_TEX) |
|
|
COND(v->bindless_samp, A6XX_SP_VS_CONFIG_BINDLESS_SAMP) |
|
|
COND(v->bindless_ibo, A6XX_SP_VS_CONFIG_BINDLESS_UAV) |
|
|
COND(v->bindless_ubo, A6XX_SP_VS_CONFIG_BINDLESS_UBO) |
|
|
A6XX_SP_VS_CONFIG_NUAV(ir3_shader_num_uavs(v)) |
|
|
A6XX_SP_VS_CONFIG_NTEX(v->num_samp) |
|
|
A6XX_SP_VS_CONFIG_NSAMP(v->num_samp);
|
|
}
|
|
|
|
static bool
|
|
push_shared_consts(const struct ir3_shader_variant *v)
|
|
{
|
|
return v && v->shader_options.push_consts_type == IR3_PUSH_CONSTS_SHARED_PREAMBLE;
|
|
}
|
|
|
|
template <chip CHIP>
|
|
void
|
|
tu6_emit_xs_config(struct tu_crb &crb, struct tu_shader_stages stages)
|
|
{
|
|
if (stages.cs) {
|
|
crb.add(SP_CS_CONST_CONFIG(CHIP,
|
|
.constlen = stages.cs->constlen,
|
|
.enabled = true,
|
|
.read_imm_shared_consts = push_shared_consts(stages.cs),
|
|
));
|
|
crb.add(A6XX_SP_CS_CONFIG(.dword = sp_xs_config(stages.cs)));
|
|
} else {
|
|
crb.add(SP_VS_CONST_CONFIG(CHIP,
|
|
.constlen = COND(stages.vs, stages.vs->constlen),
|
|
.enabled = stages.vs,
|
|
.read_imm_shared_consts = push_shared_consts(stages.vs),
|
|
));
|
|
crb.add(SP_HS_CONST_CONFIG(CHIP,
|
|
.constlen = COND(stages.hs, stages.hs->constlen),
|
|
.enabled = stages.hs,
|
|
.read_imm_shared_consts = push_shared_consts(stages.hs),
|
|
));
|
|
crb.add(SP_DS_CONST_CONFIG(CHIP,
|
|
.constlen = COND(stages.ds, stages.ds->constlen),
|
|
.enabled = stages.ds,
|
|
.read_imm_shared_consts = push_shared_consts(stages.ds),
|
|
));
|
|
crb.add(SP_GS_CONST_CONFIG(CHIP,
|
|
.constlen = COND(stages.gs, stages.gs->constlen),
|
|
.enabled = stages.gs,
|
|
.read_imm_shared_consts = push_shared_consts(stages.gs),
|
|
));
|
|
crb.add(SP_PS_CONST_CONFIG(CHIP,
|
|
.constlen = COND(stages.fs, stages.fs->constlen),
|
|
.enabled = stages.fs,
|
|
.read_imm_shared_consts = push_shared_consts(stages.fs),
|
|
));
|
|
|
|
crb.add(A6XX_SP_VS_CONFIG(.dword = sp_xs_config(stages.vs)));
|
|
crb.add(A6XX_SP_HS_CONFIG(.dword = sp_xs_config(stages.hs)));
|
|
crb.add(A6XX_SP_DS_CONFIG(.dword = sp_xs_config(stages.ds)));
|
|
crb.add(A6XX_SP_GS_CONFIG(.dword = sp_xs_config(stages.gs)));
|
|
crb.add(A6XX_SP_PS_CONFIG(.dword = sp_xs_config(stages.fs)));
|
|
}
|
|
}
|
|
TU_GENX(tu6_emit_xs_config);
|
|
|
|
static void
|
|
tu6_emit_dynamic_offset(struct tu_cs *cs,
|
|
const struct ir3_shader_variant *xs,
|
|
const struct tu_shader *shader,
|
|
const struct tu_program_state *program)
|
|
{
|
|
const struct tu_physical_device *phys_dev = cs->device->physical_device;
|
|
|
|
if (!xs)
|
|
return;
|
|
|
|
if (cs->device->physical_device->info->props.load_shader_consts_via_preamble) {
|
|
if (shader->const_state.dynamic_offsets_ubo.size == 0)
|
|
return;
|
|
|
|
uint32_t offsets[MAX_SETS];
|
|
for (unsigned i = 0; i < phys_dev->usable_sets; i++) {
|
|
unsigned dynamic_offset_start =
|
|
program->dynamic_descriptor_offsets[i] / (A6XX_TEX_CONST_DWORDS * 4);
|
|
offsets[i] = dynamic_offset_start;
|
|
}
|
|
|
|
/* A7XX TODO: Emit data via sub_cs instead of NOP */
|
|
uint64_t iova = tu_cs_emit_data_nop(cs, offsets, phys_dev->usable_sets, 4);
|
|
uint32_t offset = shader->const_state.dynamic_offsets_ubo.idx;
|
|
|
|
tu_cs_emit_pkt7(cs, tu6_stage2opcode(xs->type), 5);
|
|
tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
|
|
CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) |
|
|
CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
|
|
CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(xs->type)) |
|
|
CP_LOAD_STATE6_0_NUM_UNIT(1));
|
|
tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
|
|
tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
|
|
int size_vec4s = DIV_ROUND_UP(phys_dev->usable_sets, 4);
|
|
tu_cs_emit_qw(cs, iova | ((uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32));
|
|
} else {
|
|
if (shader->const_state.dynamic_offset_loc == UINT32_MAX)
|
|
return;
|
|
|
|
tu_cs_emit_pkt7(cs, tu6_stage2opcode(xs->type), 3 + phys_dev->usable_sets);
|
|
tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(shader->const_state.dynamic_offset_loc / 4) |
|
|
CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
|
|
CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
|
|
CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(xs->type)) |
|
|
CP_LOAD_STATE6_0_NUM_UNIT(DIV_ROUND_UP(phys_dev->usable_sets, 4)));
|
|
tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
|
|
tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
|
|
|
|
for (unsigned i = 0; i < phys_dev->usable_sets; i++) {
|
|
unsigned dynamic_offset_start =
|
|
program->dynamic_descriptor_offsets[i] / (A6XX_TEX_CONST_DWORDS * 4);
|
|
tu_cs_emit(cs, dynamic_offset_start);
|
|
}
|
|
}
|
|
}
|
|
|
|
template <chip CHIP>
|
|
void
|
|
tu6_emit_shared_consts_enable(struct tu_crb &crb, bool enable)
|
|
{
|
|
if (CHIP == A6XX) {
|
|
/* Enable/disable shared constants */
|
|
crb.add(HLSQ_SHARED_CONSTS(CHIP, .enable = enable));
|
|
} else {
|
|
assert(!enable);
|
|
}
|
|
|
|
crb.add(A6XX_SP_MODE_CNTL(.constant_demotion_enable = true,
|
|
.isammode = ISAMMODE_GL,
|
|
.shared_consts_enable = enable));
|
|
}
|
|
TU_GENX(tu6_emit_shared_consts_enable);
|
|
|
|
template <chip CHIP>
|
|
static void
|
|
tu6_setup_streamout(struct tu_cs *cs,
|
|
const struct ir3_shader_variant *v,
|
|
const struct ir3_shader_linkage *l)
|
|
{
|
|
const struct ir3_stream_output_info *info = &v->stream_output;
|
|
/* Note: 64 here comes from the HW layout of the program RAM. The program
|
|
* for stream N is at DWORD 64 * N.
|
|
*/
|
|
#define A6XX_SO_PROG_DWORDS 64
|
|
uint32_t prog[A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS] = {};
|
|
BITSET_DECLARE(valid_dwords, A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) = {0};
|
|
bool has_pc_dgen_so_cntl = cs->device->physical_device->info->props.has_pc_dgen_so_cntl;
|
|
|
|
/* TODO: streamout state should be in a non-GMEM draw state */
|
|
|
|
/* no streamout: */
|
|
if (info->num_outputs == 0) {
|
|
tu_crb crb = cs->crb(3);
|
|
crb.add(VPC_SO_MAPPING_WPTR(CHIP, 0));
|
|
crb.add(VPC_SO_CNTL(CHIP, 0));
|
|
if (has_pc_dgen_so_cntl)
|
|
crb.add(PC_DGEN_SO_CNTL(CHIP, 0));
|
|
return;
|
|
}
|
|
|
|
for (unsigned i = 0; i < info->num_outputs; i++) {
|
|
const struct ir3_stream_output *out = &info->output[i];
|
|
unsigned k = out->register_index;
|
|
unsigned idx;
|
|
|
|
/* Skip it, if it's an output that was never assigned a register. */
|
|
if (k >= v->outputs_count || v->outputs[k].regid == INVALID_REG)
|
|
continue;
|
|
|
|
/* linkage map sorted by order frag shader wants things, so
|
|
* a bit less ideal here..
|
|
*/
|
|
for (idx = 0; idx < l->cnt; idx++)
|
|
if (l->var[idx].slot == v->outputs[k].slot)
|
|
break;
|
|
|
|
assert(idx < l->cnt);
|
|
|
|
for (unsigned j = 0; j < out->num_components; j++) {
|
|
unsigned c = j + out->start_component;
|
|
unsigned loc = l->var[idx].loc + c;
|
|
unsigned off = j + out->dst_offset; /* in dwords */
|
|
|
|
assert(loc < A6XX_SO_PROG_DWORDS * 2);
|
|
unsigned dword = out->stream * A6XX_SO_PROG_DWORDS + loc/2;
|
|
if (loc & 1) {
|
|
prog[dword] |= A6XX_VPC_SO_MAPPING_PORT_B_EN |
|
|
A6XX_VPC_SO_MAPPING_PORT_B_BUF(out->output_buffer) |
|
|
A6XX_VPC_SO_MAPPING_PORT_B_OFF(off * 4);
|
|
} else {
|
|
prog[dword] |= A6XX_VPC_SO_MAPPING_PORT_A_EN |
|
|
A6XX_VPC_SO_MAPPING_PORT_A_BUF(out->output_buffer) |
|
|
A6XX_VPC_SO_MAPPING_PORT_A_OFF(off * 4);
|
|
}
|
|
BITSET_SET(valid_dwords, dword);
|
|
}
|
|
}
|
|
|
|
unsigned prog_count = 0;
|
|
unsigned start, end;
|
|
BITSET_FOREACH_RANGE(start, end, valid_dwords,
|
|
A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) {
|
|
prog_count += end - start + 1;
|
|
}
|
|
|
|
tu_crb crb = cs->crb(6 + prog_count);
|
|
|
|
crb.add(VPC_SO_CNTL(
|
|
CHIP,
|
|
.buf0_stream = info->stride[0] > 0 ? 1 + info->buffer_to_stream[0] : 0,
|
|
.buf1_stream = info->stride[1] > 0 ? 1 + info->buffer_to_stream[1] : 0,
|
|
.buf2_stream = info->stride[2] > 0 ? 1 + info->buffer_to_stream[2] : 0,
|
|
.buf3_stream = info->stride[3] > 0 ? 1 + info->buffer_to_stream[3] : 0,
|
|
.stream_enable = info->streams_written));
|
|
for (uint32_t i = 0; i < 4; i++) {
|
|
crb.add(VPC_SO_BUFFER_STRIDE(CHIP, i, info->stride[i]));
|
|
}
|
|
bool first = true;
|
|
BITSET_FOREACH_RANGE(start, end, valid_dwords,
|
|
A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) {
|
|
crb.add(VPC_SO_MAPPING_WPTR(CHIP, .addr = start, .reset = first));
|
|
for (unsigned i = start; i < end; i++) {
|
|
crb.add(VPC_SO_MAPPING_PORT(CHIP, .dword = prog[i]));
|
|
}
|
|
first = false;
|
|
}
|
|
|
|
if (has_pc_dgen_so_cntl) {
|
|
/* When present, setting this register makes sure that degenerate primitives
|
|
* are included in the stream output and not discarded.
|
|
*/
|
|
crb.add(PC_DGEN_SO_CNTL(CHIP, .stream_enable = info->streams_written));
|
|
}
|
|
}
|
|
|
|
enum tu_geom_consts_type
|
|
{
|
|
TU_CONSTS_PRIMITIVE_MAP,
|
|
TU_CONSTS_PRIMITIVE_PARAM,
|
|
};
|
|
|
|
static void
|
|
tu6_emit_const(struct tu_cs *cs, uint32_t opcode, enum tu_geom_consts_type type,
|
|
const struct ir3_const_state *const_state,
|
|
unsigned constlen, enum a6xx_state_block block,
|
|
uint32_t offset, uint32_t size, const uint32_t *dwords) {
|
|
assert(size % 4 == 0);
|
|
dwords = (uint32_t *)&((uint8_t *)dwords)[offset];
|
|
|
|
if (!cs->device->physical_device->info->props.load_shader_consts_via_preamble) {
|
|
uint32_t base;
|
|
switch (type) {
|
|
case TU_CONSTS_PRIMITIVE_MAP:
|
|
base = const_state->allocs.consts[IR3_CONST_ALLOC_PRIMITIVE_MAP].offset_vec4;
|
|
break;
|
|
case TU_CONSTS_PRIMITIVE_PARAM:
|
|
base = const_state->allocs.consts[IR3_CONST_ALLOC_PRIMITIVE_PARAM].offset_vec4;
|
|
break;
|
|
default:
|
|
UNREACHABLE("bad consts type");
|
|
}
|
|
|
|
int32_t adjusted_size = MIN2(base * 4 + size, constlen * 4) - base * 4;
|
|
if (adjusted_size <= 0)
|
|
return;
|
|
|
|
tu_cs_emit_pkt7(cs, opcode, 3 + adjusted_size);
|
|
tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
|
|
CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
|
|
CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
|
|
CP_LOAD_STATE6_0_STATE_BLOCK(block) |
|
|
CP_LOAD_STATE6_0_NUM_UNIT(adjusted_size / 4));
|
|
|
|
tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
|
|
tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
|
|
|
|
tu_cs_emit_array(cs, dwords, adjusted_size);
|
|
} else {
|
|
uint32_t base;
|
|
switch (type) {
|
|
case TU_CONSTS_PRIMITIVE_MAP:
|
|
base = const_state->primitive_map_ubo.idx;
|
|
break;
|
|
case TU_CONSTS_PRIMITIVE_PARAM:
|
|
base = const_state->primitive_param_ubo.idx;
|
|
break;
|
|
default:
|
|
UNREACHABLE("bad consts type");
|
|
}
|
|
if (base == -1)
|
|
return;
|
|
|
|
/* A7XX TODO: Emit data via sub_cs instead of NOP */
|
|
uint64_t iova = tu_cs_emit_data_nop(cs, dwords, size, 4);
|
|
|
|
tu_cs_emit_pkt7(cs, opcode, 5);
|
|
tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
|
|
CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) |
|
|
CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
|
|
CP_LOAD_STATE6_0_STATE_BLOCK(block) |
|
|
CP_LOAD_STATE6_0_NUM_UNIT(1));
|
|
tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
|
|
tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
|
|
int size_vec4s = DIV_ROUND_UP(size, 4);
|
|
tu_cs_emit_qw(cs, iova | ((uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32));
|
|
}
|
|
}
|
|
|
|
static void
|
|
tu6_emit_link_map(struct tu_cs *cs,
|
|
const struct ir3_shader_variant *producer,
|
|
const struct ir3_shader_variant *consumer,
|
|
enum a6xx_state_block sb)
|
|
{
|
|
const struct ir3_const_state *const_state = ir3_const_state(consumer);
|
|
uint32_t size = align(consumer->input_size, 4);
|
|
|
|
if (size == 0)
|
|
return;
|
|
|
|
tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_MAP,
|
|
const_state, consumer->constlen, sb, 0, size, producer->output_loc);
|
|
}
|
|
|
|
static int
|
|
tu6_vpc_varying_mode(const struct ir3_shader_variant *fs,
|
|
const struct ir3_shader_variant *last_shader,
|
|
uint32_t index,
|
|
uint8_t *interp_mode,
|
|
uint8_t *ps_repl_mode)
|
|
{
|
|
const uint32_t compmask = fs->inputs[index].compmask;
|
|
|
|
/* NOTE: varyings are packed, so if compmask is 0xb then first, second, and
|
|
* fourth component occupy three consecutive varying slots
|
|
*/
|
|
int shift = 0;
|
|
*interp_mode = 0;
|
|
*ps_repl_mode = 0;
|
|
if (fs->inputs[index].slot == VARYING_SLOT_PNTC) {
|
|
if (compmask & 0x1) {
|
|
*ps_repl_mode |= PS_REPL_S << shift;
|
|
shift += 2;
|
|
}
|
|
if (compmask & 0x2) {
|
|
*ps_repl_mode |= PS_REPL_T << shift;
|
|
shift += 2;
|
|
}
|
|
if (compmask & 0x4) {
|
|
*interp_mode |= INTERP_ZERO << shift;
|
|
shift += 2;
|
|
}
|
|
if (compmask & 0x8) {
|
|
*interp_mode |= INTERP_ONE << 6;
|
|
shift += 2;
|
|
}
|
|
} else if (fs->inputs[index].slot == VARYING_SLOT_LAYER ||
|
|
fs->inputs[index].slot == VARYING_SLOT_VIEWPORT) {
|
|
/* If the last geometry shader doesn't statically write these, they're
|
|
* implicitly zero and the FS is supposed to read zero.
|
|
*/
|
|
const gl_varying_slot slot = (gl_varying_slot) fs->inputs[index].slot;
|
|
if (ir3_find_output(last_shader, slot) < 0 &&
|
|
(compmask & 0x1)) {
|
|
*interp_mode |= INTERP_ZERO;
|
|
} else {
|
|
*interp_mode |= INTERP_FLAT;
|
|
}
|
|
} else if (fs->inputs[index].flat) {
|
|
for (int i = 0; i < 4; i++) {
|
|
if (compmask & (1 << i)) {
|
|
*interp_mode |= INTERP_FLAT << shift;
|
|
shift += 2;
|
|
}
|
|
}
|
|
}
|
|
|
|
return util_bitcount(compmask) * 2;
|
|
}
|
|
|
|
template <chip CHIP>
|
|
static void
|
|
tu6_emit_vpc_varying_modes(struct tu_cs *cs,
|
|
const struct ir3_shader_variant *fs,
|
|
const struct ir3_shader_variant *last_shader)
|
|
{
|
|
uint32_t interp_modes[8] = { 0 };
|
|
uint32_t ps_repl_modes[8] = { 0 };
|
|
uint32_t interp_regs = 0;
|
|
|
|
if (fs) {
|
|
for (int i = -1;
|
|
(i = ir3_next_varying(fs, i)) < (int) fs->inputs_count;) {
|
|
|
|
/* get the mode for input i */
|
|
uint8_t interp_mode;
|
|
uint8_t ps_repl_mode;
|
|
const int bits =
|
|
tu6_vpc_varying_mode(fs, last_shader, i, &interp_mode, &ps_repl_mode);
|
|
|
|
/* OR the mode into the array */
|
|
const uint32_t inloc = fs->inputs[i].inloc * 2;
|
|
uint32_t n = inloc / 32;
|
|
uint32_t shift = inloc % 32;
|
|
interp_modes[n] |= interp_mode << shift;
|
|
ps_repl_modes[n] |= ps_repl_mode << shift;
|
|
if (shift + bits > 32) {
|
|
n++;
|
|
shift = 32 - shift;
|
|
|
|
interp_modes[n] |= interp_mode >> shift;
|
|
ps_repl_modes[n] |= ps_repl_mode >> shift;
|
|
}
|
|
interp_regs = MAX2(interp_regs, n + 1);
|
|
}
|
|
}
|
|
|
|
if (interp_regs) {
|
|
tu_cs_emit_pkt4(cs, VPC_VARYING_INTERP_MODE_MODE(CHIP, 0).reg, interp_regs);
|
|
tu_cs_emit_array(cs, interp_modes, interp_regs);
|
|
|
|
tu_cs_emit_pkt4(cs, VPC_VARYING_REPLACE_MODE_MODE(CHIP, 0).reg, interp_regs);
|
|
tu_cs_emit_array(cs, ps_repl_modes, interp_regs);
|
|
}
|
|
}
|
|
|
|
template <chip CHIP>
|
|
void
|
|
tu6_emit_vpc(struct tu_cs *cs,
|
|
const struct ir3_shader_variant *vs,
|
|
const struct ir3_shader_variant *hs,
|
|
const struct ir3_shader_variant *ds,
|
|
const struct ir3_shader_variant *gs,
|
|
const struct ir3_shader_variant *fs)
|
|
{
|
|
/* note: doesn't compile as static because of the array regs.. */
|
|
const struct reg_config {
|
|
uint16_t reg_sp_xs_out_reg;
|
|
uint16_t reg_sp_xs_vpc_dst_reg;
|
|
uint16_t reg_vpc_xs_pack;
|
|
uint16_t reg_vpc_xs_clip_cntl;
|
|
uint16_t reg_vpc_xs_clip_cntl_v2;
|
|
uint16_t reg_gras_xs_cl_cntl;
|
|
uint16_t reg_pc_xs_out_cntl;
|
|
uint16_t reg_sp_xs_primitive_cntl;
|
|
uint16_t reg_vpc_xs_layer_cntl;
|
|
uint16_t reg_vpc_xs_layer_cntl_v2;
|
|
uint16_t reg_gras_xs_layer_cntl;
|
|
} reg_config[] = {
|
|
[MESA_SHADER_VERTEX] = {
|
|
REG_A6XX_SP_VS_OUTPUT_REG(0),
|
|
REG_A6XX_SP_VS_VPC_DEST_REG(0),
|
|
REG_A6XX_VPC_VS_CNTL,
|
|
REG_A6XX_VPC_VS_CLIP_CULL_CNTL,
|
|
REG_A6XX_VPC_VS_CLIP_CULL_CNTL_V2,
|
|
REG_A6XX_GRAS_CL_VS_CLIP_CULL_DISTANCE,
|
|
REG_A6XX_PC_VS_CNTL,
|
|
REG_A6XX_SP_VS_OUTPUT_CNTL,
|
|
REG_A6XX_VPC_VS_SIV_CNTL,
|
|
REG_A6XX_VPC_VS_SIV_CNTL_V2,
|
|
REG_A6XX_GRAS_SU_VS_SIV_CNTL,
|
|
},
|
|
[MESA_SHADER_TESS_CTRL] = {
|
|
0,
|
|
0,
|
|
0,
|
|
0,
|
|
0,
|
|
0,
|
|
REG_A6XX_PC_HS_CNTL,
|
|
0,
|
|
0,
|
|
0
|
|
},
|
|
[MESA_SHADER_TESS_EVAL] = {
|
|
REG_A6XX_SP_DS_OUTPUT_REG(0),
|
|
REG_A6XX_SP_DS_VPC_DEST_REG(0),
|
|
REG_A6XX_VPC_DS_CNTL,
|
|
REG_A6XX_VPC_DS_CLIP_CULL_CNTL,
|
|
REG_A6XX_VPC_DS_CLIP_CULL_CNTL_V2,
|
|
REG_A6XX_GRAS_CL_DS_CLIP_CULL_DISTANCE,
|
|
REG_A6XX_PC_DS_CNTL,
|
|
REG_A6XX_SP_DS_OUTPUT_CNTL,
|
|
REG_A6XX_VPC_DS_SIV_CNTL,
|
|
REG_A6XX_VPC_DS_SIV_CNTL_V2,
|
|
REG_A6XX_GRAS_SU_DS_SIV_CNTL,
|
|
},
|
|
[MESA_SHADER_GEOMETRY] = {
|
|
REG_A6XX_SP_GS_OUTPUT_REG(0),
|
|
REG_A6XX_SP_GS_VPC_DEST_REG(0),
|
|
REG_A6XX_VPC_GS_CNTL,
|
|
REG_A6XX_VPC_GS_CLIP_CULL_CNTL,
|
|
REG_A6XX_VPC_GS_CLIP_CULL_CNTL_V2,
|
|
REG_A6XX_GRAS_CL_GS_CLIP_CULL_DISTANCE,
|
|
REG_A6XX_PC_GS_CNTL,
|
|
REG_A6XX_SP_GS_OUTPUT_CNTL,
|
|
REG_A6XX_VPC_GS_SIV_CNTL,
|
|
REG_A6XX_VPC_GS_SIV_CNTL_V2,
|
|
REG_A6XX_GRAS_SU_GS_SIV_CNTL,
|
|
},
|
|
};
|
|
|
|
const struct ir3_shader_variant *last_shader;
|
|
if (gs) {
|
|
last_shader = gs;
|
|
} else if (hs) {
|
|
last_shader = ds;
|
|
} else {
|
|
last_shader = vs;
|
|
}
|
|
|
|
const struct reg_config *cfg = ®_config[last_shader->type];
|
|
|
|
struct ir3_shader_linkage linkage = {
|
|
.primid_loc = 0xff,
|
|
.clip0_loc = 0xff,
|
|
.clip1_loc = 0xff,
|
|
};
|
|
if (fs)
|
|
ir3_link_shaders(&linkage, last_shader, fs, true);
|
|
|
|
if (last_shader->stream_output.num_outputs)
|
|
ir3_link_stream_out(&linkage, last_shader);
|
|
|
|
/* a6xx finds position/pointsize at the end */
|
|
const uint32_t pointsize_regid =
|
|
ir3_find_output_regid(last_shader, VARYING_SLOT_PSIZ);
|
|
const uint32_t layer_regid =
|
|
ir3_find_output_regid(last_shader, VARYING_SLOT_LAYER);
|
|
const uint32_t view_regid =
|
|
ir3_find_output_regid(last_shader, VARYING_SLOT_VIEWPORT);
|
|
const uint32_t clip0_regid =
|
|
ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST0);
|
|
const uint32_t clip1_regid =
|
|
ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST1);
|
|
uint32_t flags_regid = gs ?
|
|
ir3_find_output_regid(gs, VARYING_SLOT_GS_VERTEX_FLAGS_IR3) : 0;
|
|
const uint32_t shading_rate_regid =
|
|
ir3_find_output_regid(last_shader, VARYING_SLOT_PRIMITIVE_SHADING_RATE);
|
|
|
|
uint32_t pointsize_loc = 0xff, position_loc = 0xff, layer_loc = 0xff, view_loc = 0xff;
|
|
uint32_t shading_rate_loc = 0xff;
|
|
|
|
if (layer_regid != regid(63, 0)) {
|
|
layer_loc = linkage.max_loc;
|
|
ir3_link_add(&linkage, VARYING_SLOT_LAYER, layer_regid, 0x1, linkage.max_loc);
|
|
}
|
|
|
|
if (view_regid != regid(63, 0)) {
|
|
view_loc = linkage.max_loc;
|
|
ir3_link_add(&linkage, VARYING_SLOT_VIEWPORT, view_regid, 0x1, linkage.max_loc);
|
|
}
|
|
|
|
if (shading_rate_regid != regid(63, 0)) {
|
|
shading_rate_loc = linkage.max_loc;
|
|
ir3_link_add(&linkage, VARYING_SLOT_PRIMITIVE_SHADING_RATE,
|
|
shading_rate_regid, 0x1, linkage.max_loc);
|
|
}
|
|
|
|
unsigned extra_pos = 0;
|
|
|
|
for (unsigned i = 0; i < last_shader->outputs_count; i++) {
|
|
if (last_shader->outputs[i].slot != VARYING_SLOT_POS)
|
|
continue;
|
|
|
|
if (position_loc == 0xff)
|
|
position_loc = linkage.max_loc;
|
|
|
|
ir3_link_add(&linkage, last_shader->outputs[i].slot,
|
|
last_shader->outputs[i].regid,
|
|
0xf, position_loc + 4 * last_shader->outputs[i].view);
|
|
extra_pos = MAX2(extra_pos, last_shader->outputs[i].view);
|
|
}
|
|
|
|
if (pointsize_regid != regid(63, 0)) {
|
|
pointsize_loc = linkage.max_loc;
|
|
ir3_link_add(&linkage, VARYING_SLOT_PSIZ, pointsize_regid, 0x1, linkage.max_loc);
|
|
}
|
|
|
|
uint8_t clip_cull_mask = last_shader->clip_mask | last_shader->cull_mask;
|
|
|
|
/* Handle the case where clip/cull distances aren't read by the FS */
|
|
uint32_t clip0_loc = linkage.clip0_loc, clip1_loc = linkage.clip1_loc;
|
|
if (clip0_loc == 0xff && clip0_regid != regid(63, 0)) {
|
|
clip0_loc = linkage.max_loc;
|
|
ir3_link_add(&linkage, VARYING_SLOT_CLIP_DIST0, clip0_regid,
|
|
clip_cull_mask & 0xf, linkage.max_loc);
|
|
}
|
|
if (clip1_loc == 0xff && clip1_regid != regid(63, 0)) {
|
|
clip1_loc = linkage.max_loc;
|
|
ir3_link_add(&linkage, VARYING_SLOT_CLIP_DIST1, clip1_regid,
|
|
clip_cull_mask >> 4, linkage.max_loc);
|
|
}
|
|
|
|
tu6_setup_streamout<CHIP>(cs, last_shader, &linkage);
|
|
|
|
/* There is a hardware bug on a750 where STRIDE_IN_VPC of 5 to 8 in GS with
|
|
* an input primitive type with adjacency, an output primitive type of
|
|
* points, and a high enough vertex count causes a hang.
|
|
*/
|
|
if (cs->device->physical_device->info->props.gs_vpc_adjacency_quirk &&
|
|
gs && gs->gs.output_primitive == MESA_PRIM_POINTS &&
|
|
linkage.max_loc > 4) {
|
|
linkage.max_loc = MAX2(linkage.max_loc, 9);
|
|
}
|
|
|
|
/* The GPU hangs on some models when there are no outputs (xs_pack::CNT),
|
|
* at least when a DS is the last stage, so add a dummy output to keep it
|
|
* happy if there aren't any. We do this late in order to avoid emitting
|
|
* any unused code and make sure that optimizations don't remove it.
|
|
*/
|
|
if (linkage.cnt == 0)
|
|
ir3_link_add(&linkage, 0, 0, 0x1, linkage.max_loc);
|
|
|
|
/* map outputs of the last shader to VPC */
|
|
assert(linkage.cnt <= 32);
|
|
const uint32_t sp_out_count = DIV_ROUND_UP(linkage.cnt, 2);
|
|
const uint32_t sp_vpc_dst_count = DIV_ROUND_UP(linkage.cnt, 4);
|
|
uint32_t sp_out[16] = {0};
|
|
uint32_t sp_vpc_dst[8] = {0};
|
|
for (uint32_t i = 0; i < linkage.cnt; i++) {
|
|
((uint16_t *) sp_out)[i] =
|
|
A6XX_SP_VS_OUTPUT_REG_A_REGID(linkage.var[i].regid) |
|
|
A6XX_SP_VS_OUTPUT_REG_A_COMPMASK(linkage.var[i].compmask);
|
|
((uint8_t *) sp_vpc_dst)[i] =
|
|
A6XX_SP_VS_VPC_DEST_REG_OUTLOC0(linkage.var[i].loc);
|
|
}
|
|
|
|
tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_out_reg, sp_out_count);
|
|
tu_cs_emit_array(cs, sp_out, sp_out_count);
|
|
|
|
tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_vpc_dst_reg, sp_vpc_dst_count);
|
|
tu_cs_emit_array(cs, sp_vpc_dst, sp_vpc_dst_count);
|
|
|
|
tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_pack, 1);
|
|
tu_cs_emit(cs, A6XX_VPC_VS_CNTL_POSITIONLOC(position_loc) |
|
|
A6XX_VPC_VS_CNTL_PSIZELOC(pointsize_loc) |
|
|
A6XX_VPC_VS_CNTL_STRIDE_IN_VPC(linkage.max_loc) |
|
|
A6XX_VPC_VS_CNTL_EXTRAPOS(extra_pos));
|
|
|
|
tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_clip_cntl, 1);
|
|
tu_cs_emit(cs, A6XX_VPC_VS_CLIP_CULL_CNTL_CLIP_MASK(clip_cull_mask) |
|
|
A6XX_VPC_VS_CLIP_CULL_CNTL_CLIP_DIST_03_LOC(clip0_loc) |
|
|
A6XX_VPC_VS_CLIP_CULL_CNTL_CLIP_DIST_47_LOC(clip1_loc));
|
|
tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_clip_cntl_v2, 1);
|
|
tu_cs_emit(cs, A6XX_VPC_VS_CLIP_CULL_CNTL_CLIP_MASK(clip_cull_mask) |
|
|
A6XX_VPC_VS_CLIP_CULL_CNTL_CLIP_DIST_03_LOC(clip0_loc) |
|
|
A6XX_VPC_VS_CLIP_CULL_CNTL_CLIP_DIST_47_LOC(clip1_loc));
|
|
|
|
tu_cs_emit_pkt4(cs, cfg->reg_gras_xs_cl_cntl, 1);
|
|
tu_cs_emit(cs, A6XX_GRAS_CL_VS_CLIP_CULL_DISTANCE_CLIP_MASK(last_shader->clip_mask) |
|
|
A6XX_GRAS_CL_VS_CLIP_CULL_DISTANCE_CULL_MASK(last_shader->cull_mask));
|
|
|
|
const struct ir3_shader_variant *geom_shaders[] = { vs, hs, ds, gs };
|
|
|
|
for (unsigned i = 0; i < ARRAY_SIZE(geom_shaders); i++) {
|
|
const struct ir3_shader_variant *shader = geom_shaders[i];
|
|
if (!shader)
|
|
continue;
|
|
|
|
bool primid = shader->type != MESA_SHADER_VERTEX &&
|
|
VALIDREG(ir3_find_sysval_regid(shader, SYSTEM_VALUE_PRIMITIVE_ID));
|
|
|
|
tu_cs_emit_pkt4(cs, reg_config[shader->type].reg_pc_xs_out_cntl, 1);
|
|
if (shader == last_shader) {
|
|
tu_cs_emit(cs, A6XX_PC_VS_CNTL_STRIDE_IN_VPC(linkage.max_loc) |
|
|
CONDREG(pointsize_regid, A6XX_PC_VS_CNTL_PSIZE) |
|
|
CONDREG(layer_regid, A6XX_PC_VS_CNTL_LAYER) |
|
|
CONDREG(view_regid, A6XX_PC_VS_CNTL_VIEW) |
|
|
COND(primid, A6XX_PC_VS_CNTL_PRIMITIVE_ID) |
|
|
A6XX_PC_VS_CNTL_CLIP_MASK(clip_cull_mask) |
|
|
CONDREG(shading_rate_regid, A6XX_PC_VS_CNTL_SHADINGRATE));
|
|
} else {
|
|
tu_cs_emit(cs, COND(primid, A6XX_PC_VS_CNTL_PRIMITIVE_ID));
|
|
}
|
|
}
|
|
|
|
/* if vertex_flags somehow gets optimized out, your gonna have a bad time: */
|
|
if (gs)
|
|
assert(flags_regid != INVALID_REG);
|
|
|
|
tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_primitive_cntl, 1);
|
|
tu_cs_emit(cs, A6XX_SP_VS_OUTPUT_CNTL_OUT(linkage.cnt) |
|
|
A6XX_SP_GS_OUTPUT_CNTL_FLAGS_REGID(flags_regid));
|
|
|
|
tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_layer_cntl, 1);
|
|
tu_cs_emit(cs, A6XX_VPC_VS_SIV_CNTL_LAYERLOC(layer_loc) |
|
|
A6XX_VPC_VS_SIV_CNTL_VIEWLOC(view_loc) |
|
|
A6XX_VPC_VS_SIV_CNTL_SHADINGRATELOC(shading_rate_loc));
|
|
tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_layer_cntl_v2, 1);
|
|
tu_cs_emit(cs, A6XX_VPC_VS_SIV_CNTL_LAYERLOC(layer_loc) |
|
|
A6XX_VPC_VS_SIV_CNTL_VIEWLOC(view_loc) |
|
|
A6XX_VPC_VS_SIV_CNTL_SHADINGRATELOC(shading_rate_loc));
|
|
|
|
tu_cs_emit_pkt4(cs, cfg->reg_gras_xs_layer_cntl, 1);
|
|
tu_cs_emit(cs, CONDREG(layer_regid, A6XX_GRAS_SU_VS_SIV_CNTL_WRITES_LAYER) |
|
|
CONDREG(view_regid, A6XX_GRAS_SU_VS_SIV_CNTL_WRITES_VIEW));
|
|
|
|
tu6_emit_vpc_varying_modes<CHIP>(cs, fs, last_shader);
|
|
}
|
|
TU_GENX(tu6_emit_vpc);
|
|
|
|
static void
|
|
tu6_emit_vs_params(struct tu_cs *cs,
|
|
const struct ir3_const_state *const_state,
|
|
unsigned constlen,
|
|
unsigned param_stride,
|
|
unsigned num_vertices)
|
|
{
|
|
uint32_t vs_params[4] = {
|
|
param_stride * num_vertices * 4, /* vs primitive stride */
|
|
param_stride * 4, /* vs vertex stride */
|
|
0,
|
|
0,
|
|
};
|
|
tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_PARAM,
|
|
const_state, constlen, SB6_VS_SHADER, 0,
|
|
ARRAY_SIZE(vs_params), vs_params);
|
|
}
|
|
|
|
static void
|
|
tu_get_tess_iova(struct tu_device *dev,
|
|
uint64_t *tess_factor_iova,
|
|
uint64_t *tess_param_iova)
|
|
{
|
|
/* Create the shared tess factor BO the first time tess is used on the device. */
|
|
if (!dev->tess_bo) {
|
|
mtx_lock(&dev->mutex);
|
|
if (!dev->tess_bo) {
|
|
tu_bo_init_new(dev, NULL, &dev->tess_bo, TU_TESS_BO_SIZE,
|
|
TU_BO_ALLOC_INTERNAL_RESOURCE, "tess");
|
|
}
|
|
mtx_unlock(&dev->mutex);
|
|
}
|
|
|
|
*tess_factor_iova = dev->tess_bo->iova;
|
|
*tess_param_iova = dev->tess_bo->iova + TU_TESS_FACTOR_SIZE;
|
|
}
|
|
|
|
static const enum mesa_vk_dynamic_graphics_state tu_patch_control_points_state[] = {
|
|
MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS,
|
|
};
|
|
|
|
#define HS_PARAMS_SIZE 8
|
|
|
|
template <chip CHIP>
|
|
static unsigned
|
|
tu6_patch_control_points_size(struct tu_device *dev,
|
|
const struct tu_shader *vs,
|
|
const struct tu_shader *tcs,
|
|
const struct tu_shader *tes,
|
|
const struct tu_program_state *program,
|
|
uint32_t patch_control_points)
|
|
{
|
|
if (dev->physical_device->info->props.load_shader_consts_via_preamble) {
|
|
#define EMIT_CONST_DWORDS(const_dwords) (6 + const_dwords + 4)
|
|
return EMIT_CONST_DWORDS(4) +
|
|
EMIT_CONST_DWORDS(HS_PARAMS_SIZE) + 2 + 2 + 2;
|
|
#undef EMIT_CONST_DWORDS
|
|
} else {
|
|
#define EMIT_CONST_DWORDS(const_dwords) (4 + const_dwords)
|
|
return EMIT_CONST_DWORDS(4) +
|
|
EMIT_CONST_DWORDS(HS_PARAMS_SIZE) + 2 + 2 + 2;
|
|
#undef EMIT_CONST_DWORDS
|
|
}
|
|
}
|
|
|
|
template <chip CHIP>
|
|
void
|
|
tu6_emit_patch_control_points(struct tu_cs *cs,
|
|
const struct tu_shader *vs,
|
|
const struct tu_shader *tcs,
|
|
const struct tu_shader *tes,
|
|
const struct tu_program_state *program,
|
|
uint32_t patch_control_points)
|
|
{
|
|
if (!tcs->variant)
|
|
return;
|
|
|
|
struct tu_device *dev = cs->device;
|
|
|
|
tu6_emit_vs_params(cs,
|
|
&program->link[MESA_SHADER_VERTEX].const_state,
|
|
program->link[MESA_SHADER_VERTEX].constlen,
|
|
vs->variant->output_size,
|
|
patch_control_points);
|
|
|
|
uint64_t tess_factor_iova, tess_param_iova;
|
|
tu_get_tess_iova(dev, &tess_factor_iova, &tess_param_iova);
|
|
|
|
uint32_t hs_params[HS_PARAMS_SIZE] = {
|
|
vs->variant->output_size * patch_control_points * 4, /* hs primitive stride */
|
|
vs->variant->output_size * 4, /* hs vertex stride */
|
|
tcs->variant->output_size,
|
|
patch_control_points,
|
|
tess_param_iova,
|
|
tess_param_iova >> 32,
|
|
tess_factor_iova,
|
|
tess_factor_iova >> 32,
|
|
};
|
|
|
|
const struct ir3_const_state *hs_const =
|
|
&program->link[MESA_SHADER_TESS_CTRL].const_state;
|
|
unsigned hs_constlen = program->link[MESA_SHADER_TESS_CTRL].constlen;
|
|
tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_PARAM,
|
|
hs_const, hs_constlen, SB6_HS_SHADER, 0,
|
|
ARRAY_SIZE(hs_params), hs_params);
|
|
|
|
uint32_t patch_local_mem_size_16b =
|
|
patch_control_points * vs->variant->output_size / 4;
|
|
|
|
/* Total attribute slots in HS incoming patch. */
|
|
tu_cs_emit_pkt4(cs, REG_A6XX_PC_HS_PARAM_1, 1);
|
|
tu_cs_emit(cs, patch_local_mem_size_16b);
|
|
|
|
const uint32_t wavesize = 64;
|
|
const uint32_t vs_hs_local_mem_size = 16384;
|
|
|
|
uint32_t max_patches_per_wave;
|
|
if (dev->physical_device->info->props.tess_use_shared) {
|
|
/* HS invocations for a patch are always within the same wave,
|
|
* making barriers less expensive. VS can't have barriers so we
|
|
* don't care about VS invocations being in the same wave.
|
|
*/
|
|
max_patches_per_wave = wavesize / tcs->variant->tess.tcs_vertices_out;
|
|
} else {
|
|
/* VS is also in the same wave */
|
|
max_patches_per_wave =
|
|
wavesize / MAX2(patch_control_points,
|
|
tcs->variant->tess.tcs_vertices_out);
|
|
}
|
|
|
|
uint32_t patches_per_wave =
|
|
MIN2(vs_hs_local_mem_size / (patch_local_mem_size_16b * 16),
|
|
max_patches_per_wave);
|
|
|
|
uint32_t wave_input_size = DIV_ROUND_UP(
|
|
patches_per_wave * patch_local_mem_size_16b * 16, 256);
|
|
|
|
tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_CNTL_1, 1);
|
|
tu_cs_emit(cs, wave_input_size);
|
|
|
|
/* maximum number of patches that can fit in tess factor/param buffers */
|
|
uint32_t subdraw_size = MIN2(TU_TESS_FACTOR_SIZE / ir3_tess_factor_stride(tes->variant->key.tessellation),
|
|
TU_TESS_PARAM_SIZE / (tcs->variant->output_size * 4));
|
|
/* convert from # of patches to draw count */
|
|
subdraw_size *= patch_control_points;
|
|
|
|
tu_cs_emit_pkt7(cs, CP_SET_SUBDRAW_SIZE, 1);
|
|
tu_cs_emit(cs, subdraw_size);
|
|
}
|
|
|
|
static void
|
|
tu6_emit_geom_tess_consts(struct tu_cs *cs,
|
|
const struct ir3_shader_variant *vs,
|
|
const struct ir3_shader_variant *hs,
|
|
const struct ir3_shader_variant *ds,
|
|
const struct ir3_shader_variant *gs)
|
|
{
|
|
struct tu_device *dev = cs->device;
|
|
|
|
if (gs && !hs) {
|
|
tu6_emit_vs_params(cs, ir3_const_state(vs), vs->constlen,
|
|
vs->output_size, gs->gs.vertices_in);
|
|
}
|
|
|
|
if (hs) {
|
|
uint64_t tess_factor_iova, tess_param_iova;
|
|
tu_get_tess_iova(dev, &tess_factor_iova, &tess_param_iova);
|
|
|
|
uint32_t ds_params[8] = {
|
|
gs ? ds->output_size * gs->gs.vertices_in * 4 : 0, /* ds primitive stride */
|
|
ds->output_size * 4, /* ds vertex stride */
|
|
hs->output_size, /* hs vertex stride (dwords) */
|
|
hs->tess.tcs_vertices_out,
|
|
tess_param_iova,
|
|
tess_param_iova >> 32,
|
|
tess_factor_iova,
|
|
tess_factor_iova >> 32,
|
|
};
|
|
|
|
tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_PARAM,
|
|
ds->const_state, ds->constlen, SB6_DS_SHADER, 0,
|
|
ARRAY_SIZE(ds_params), ds_params);
|
|
}
|
|
|
|
if (gs) {
|
|
const struct ir3_shader_variant *prev = ds ? ds : vs;
|
|
uint32_t gs_params[4] = {
|
|
prev->output_size * gs->gs.vertices_in * 4, /* gs primitive stride */
|
|
prev->output_size * 4, /* gs vertex stride */
|
|
0,
|
|
0,
|
|
};
|
|
tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_PARAM,
|
|
gs->const_state, gs->constlen, SB6_GS_SHADER, 0,
|
|
ARRAY_SIZE(gs_params), gs_params);
|
|
}
|
|
}
|
|
|
|
template <chip CHIP>
|
|
static void
|
|
tu6_emit_program_config(struct tu_cs *cs,
|
|
const struct tu_program_state *prog,
|
|
struct tu_shader **shaders,
|
|
const struct ir3_shader_variant **variants)
|
|
{
|
|
STATIC_ASSERT(MESA_SHADER_VERTEX == 0);
|
|
|
|
tu_crb crb = cs->crb(0);
|
|
|
|
bool shared_consts_enable =
|
|
prog->shared_consts.type == IR3_PUSH_CONSTS_SHARED;
|
|
tu6_emit_shared_consts_enable<CHIP>(crb, shared_consts_enable);
|
|
|
|
crb.add(SP_UPDATE_CNTL(CHIP, .vs_state = true, .hs_state = true,
|
|
.ds_state = true, .gs_state = true,
|
|
.fs_state = true, .gfx_uav = true,
|
|
.gfx_shared_const = shared_consts_enable));
|
|
|
|
const struct ir3_shader_variant *vs = variants[MESA_SHADER_VERTEX];
|
|
const struct ir3_shader_variant *hs = variants[MESA_SHADER_TESS_CTRL];
|
|
const struct ir3_shader_variant *ds = variants[MESA_SHADER_TESS_EVAL];
|
|
const struct ir3_shader_variant *gs = variants[MESA_SHADER_GEOMETRY];
|
|
const struct ir3_shader_variant *fs = variants[MESA_SHADER_FRAGMENT];
|
|
|
|
tu6_emit_xs_config<CHIP>(crb, { .vs = vs, .hs = hs, .ds = ds, .gs = gs, .fs = fs });
|
|
|
|
crb.flush();
|
|
|
|
for (size_t stage_idx = MESA_SHADER_VERTEX;
|
|
stage_idx <= MESA_SHADER_FRAGMENT; stage_idx++) {
|
|
mesa_shader_stage stage = (mesa_shader_stage) stage_idx;
|
|
tu6_emit_dynamic_offset(cs, variants[stage], shaders[stage], prog);
|
|
}
|
|
|
|
if (hs) {
|
|
tu6_emit_link_map(cs, vs, hs, SB6_HS_SHADER);
|
|
tu6_emit_link_map(cs, hs, ds, SB6_DS_SHADER);
|
|
}
|
|
|
|
if (gs) {
|
|
if (hs) {
|
|
tu6_emit_link_map(cs, ds, gs, SB6_GS_SHADER);
|
|
} else {
|
|
tu6_emit_link_map(cs, vs, gs, SB6_GS_SHADER);
|
|
}
|
|
|
|
uint32_t prev_stage_output_size = ds ? ds->output_size : vs->output_size;
|
|
|
|
if (CHIP == A6XX) {
|
|
/* Size of per-primitive alloction in ldlw memory in vec4s. */
|
|
uint32_t vec4_size = gs->gs.vertices_in *
|
|
DIV_ROUND_UP(prev_stage_output_size, 4);
|
|
|
|
tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1);
|
|
tu_cs_emit(cs, A6XX_PC_PRIMITIVE_CNTL_6_STRIDE_IN_VPC(vec4_size));
|
|
}
|
|
|
|
uint32_t prim_size = prev_stage_output_size;
|
|
if (prim_size > 64)
|
|
prim_size = 64;
|
|
else if (prim_size == 64)
|
|
prim_size = 63;
|
|
tu_cs_emit_pkt4(cs, REG_A6XX_SP_GS_CNTL_1, 1);
|
|
tu_cs_emit(cs, prim_size);
|
|
}
|
|
|
|
if (gs || hs) {
|
|
tu6_emit_geom_tess_consts(cs, vs, hs, ds, gs);
|
|
}
|
|
}
|
|
|
|
static bool
|
|
contains_all_shader_state(VkGraphicsPipelineLibraryFlagsEXT state)
|
|
{
|
|
return (state &
|
|
(VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
|
|
VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) ==
|
|
(VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
|
|
VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT);
|
|
}
|
|
|
|
static bool
|
|
pipeline_contains_all_shader_state(struct tu_pipeline *pipeline)
|
|
{
|
|
return pipeline->type == TU_PIPELINE_GRAPHICS ||
|
|
pipeline->type == TU_PIPELINE_COMPUTE ||
|
|
contains_all_shader_state(tu_pipeline_to_graphics_lib(pipeline)->state);
|
|
}
|
|
|
|
/* Return true if this pipeline contains all of the GPL stages listed but none
|
|
* of the libraries it uses do, so this is "the first time" that all of them
|
|
* are defined together. This is useful for state that needs to be combined
|
|
* from multiple GPL stages.
|
|
*/
|
|
|
|
static bool
|
|
set_combined_state(struct tu_pipeline_builder *builder,
|
|
struct tu_pipeline *pipeline,
|
|
VkGraphicsPipelineLibraryFlagsEXT state)
|
|
{
|
|
if (pipeline->type == TU_PIPELINE_GRAPHICS_LIB &&
|
|
(tu_pipeline_to_graphics_lib(pipeline)->state & state) != state)
|
|
return false;
|
|
|
|
for (unsigned i = 0; i < builder->num_libraries; i++) {
|
|
if ((builder->libraries[i]->state & state) == state)
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
#define TU6_EMIT_VERTEX_INPUT_MAX_DWORDS (MAX_VERTEX_ATTRIBS * 2 + 1)
|
|
|
|
static VkResult
|
|
tu_pipeline_allocate_cs(struct tu_device *dev,
|
|
struct tu_pipeline *pipeline,
|
|
struct tu_pipeline_layout *layout,
|
|
struct tu_pipeline_builder *builder,
|
|
const struct ir3_shader_variant *compute)
|
|
{
|
|
uint32_t size = 1024;
|
|
|
|
/* graphics case: */
|
|
if (builder) {
|
|
if (builder->state &
|
|
VK_GRAPHICS_PIPELINE_LIBRARY_VERTEX_INPUT_INTERFACE_BIT_EXT) {
|
|
size += TU6_EMIT_VERTEX_INPUT_MAX_DWORDS;
|
|
}
|
|
|
|
if (set_combined_state(builder, pipeline,
|
|
VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
|
|
VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) {
|
|
size += tu6_load_state_size(pipeline, layout);
|
|
}
|
|
} else {
|
|
size += tu6_load_state_size(pipeline, layout);
|
|
}
|
|
|
|
/* Allocate the space for the pipeline out of the device's RO suballocator.
|
|
*
|
|
* Sub-allocating BOs saves memory and also kernel overhead in refcounting of
|
|
* BOs at exec time.
|
|
*
|
|
* The pipeline cache would seem like a natural place to stick the
|
|
* suballocator, except that it is not guaranteed to outlive the pipelines
|
|
* created from it, so you can't store any long-lived state there, and you
|
|
* can't use its EXTERNALLY_SYNCHRONIZED flag to avoid atomics because
|
|
* pipeline destroy isn't synchronized by the cache.
|
|
*/
|
|
mtx_lock(&dev->pipeline_mutex);
|
|
VkResult result = tu_suballoc_bo_alloc(&pipeline->bo, &dev->pipeline_suballoc,
|
|
size * 4, 128);
|
|
mtx_unlock(&dev->pipeline_mutex);
|
|
if (result != VK_SUCCESS)
|
|
return result;
|
|
|
|
TU_RMV(cmd_buffer_suballoc_bo_create, dev, &pipeline->bo);
|
|
tu_cs_init_suballoc(&pipeline->cs, dev, &pipeline->bo);
|
|
|
|
return VK_SUCCESS;
|
|
}
|
|
|
|
static void
|
|
tu_append_executable(struct tu_pipeline *pipeline,
|
|
const struct ir3_shader_variant *variant,
|
|
char *nir_from_spirv)
|
|
{
|
|
struct tu_pipeline_executable exe = {
|
|
.stage = variant->type,
|
|
.stats = variant->info,
|
|
.is_binning = variant->binning_pass,
|
|
.nir_from_spirv = nir_from_spirv,
|
|
.nir_final = ralloc_strdup(pipeline->executables_mem_ctx, variant->disasm_info.nir),
|
|
.disasm = ralloc_strdup(pipeline->executables_mem_ctx, variant->disasm_info.disasm),
|
|
};
|
|
|
|
util_dynarray_append(&pipeline->executables, exe);
|
|
}
|
|
|
|
static void
|
|
tu_hash_stage(struct mesa_sha1 *ctx,
|
|
VkPipelineCreateFlags2KHR pipeline_flags,
|
|
const VkPipelineShaderStageCreateInfo *stage,
|
|
const nir_shader *nir,
|
|
const struct tu_shader_key *key)
|
|
{
|
|
|
|
if (nir) {
|
|
struct blob blob;
|
|
blob_init(&blob);
|
|
nir_serialize(&blob, nir, true);
|
|
_mesa_sha1_update(ctx, blob.data, blob.size);
|
|
blob_finish(&blob);
|
|
} else {
|
|
unsigned char stage_hash[SHA1_DIGEST_LENGTH];
|
|
vk_pipeline_hash_shader_stage(pipeline_flags, stage, NULL, stage_hash);
|
|
_mesa_sha1_update(ctx, stage_hash, sizeof(stage_hash));
|
|
}
|
|
_mesa_sha1_update(ctx, key, sizeof(*key));
|
|
}
|
|
|
|
static void
|
|
tu_hash_shaders(unsigned char *hash,
|
|
VkPipelineCreateFlags2KHR pipeline_flags,
|
|
const VkPipelineShaderStageCreateInfo **stages,
|
|
nir_shader *const *nir,
|
|
const struct tu_pipeline_layout *layout,
|
|
const struct tu_shader_key *keys,
|
|
VkGraphicsPipelineLibraryFlagsEXT state)
|
|
{
|
|
struct mesa_sha1 ctx;
|
|
|
|
_mesa_sha1_init(&ctx);
|
|
|
|
if (layout)
|
|
_mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1));
|
|
|
|
for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
|
|
if (stages[i] || nir[i]) {
|
|
tu_hash_stage(&ctx, pipeline_flags, stages[i], nir[i], &keys[i]);
|
|
}
|
|
}
|
|
_mesa_sha1_update(&ctx, &state, sizeof(state));
|
|
enum ir3_shader_debug ir3_debug_key = ir3_shader_debug_hash_key();
|
|
_mesa_sha1_update(&ctx, &ir3_debug_key, sizeof(ir3_debug_key));
|
|
_mesa_sha1_final(&ctx, hash);
|
|
}
|
|
|
|
static void
|
|
tu_hash_compute(unsigned char *hash,
|
|
VkPipelineCreateFlags2KHR pipeline_flags,
|
|
const VkPipelineShaderStageCreateInfo *stage,
|
|
const struct tu_pipeline_layout *layout,
|
|
const struct tu_shader_key *key)
|
|
{
|
|
struct mesa_sha1 ctx;
|
|
|
|
_mesa_sha1_init(&ctx);
|
|
|
|
if (layout)
|
|
_mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1));
|
|
|
|
tu_hash_stage(&ctx, pipeline_flags, stage, NULL, key);
|
|
enum ir3_shader_debug ir3_debug_key = ir3_shader_debug_hash_key();
|
|
_mesa_sha1_update(&ctx, &ir3_debug_key, sizeof(ir3_debug_key));
|
|
|
|
_mesa_sha1_final(&ctx, hash);
|
|
}
|
|
|
|
static struct tu_shader *
|
|
tu_pipeline_cache_lookup(struct vk_pipeline_cache *cache,
|
|
const void *key_data, size_t key_size,
|
|
bool *application_cache_hit)
|
|
{
|
|
struct vk_pipeline_cache_object *object =
|
|
vk_pipeline_cache_lookup_object(cache, key_data, key_size,
|
|
&tu_shader_ops, application_cache_hit);
|
|
if (object)
|
|
return container_of(object, struct tu_shader, base);
|
|
else
|
|
return NULL;
|
|
}
|
|
|
|
static struct tu_shader *
|
|
tu_pipeline_cache_insert(struct vk_pipeline_cache *cache,
|
|
struct tu_shader *shader)
|
|
{
|
|
struct vk_pipeline_cache_object *object =
|
|
vk_pipeline_cache_add_object(cache, &shader->base);
|
|
return container_of(object, struct tu_shader, base);
|
|
}
|
|
|
|
static bool
|
|
tu_nir_shaders_serialize(struct vk_pipeline_cache_object *object,
|
|
struct blob *blob);
|
|
|
|
static struct vk_pipeline_cache_object *
|
|
tu_nir_shaders_deserialize(struct vk_pipeline_cache *cache,
|
|
const void *key_data,
|
|
size_t key_size,
|
|
struct blob_reader *blob);
|
|
|
|
static void
|
|
tu_nir_shaders_destroy(struct vk_device *device,
|
|
struct vk_pipeline_cache_object *object)
|
|
{
|
|
struct tu_nir_shaders *shaders =
|
|
container_of(object, struct tu_nir_shaders, base);
|
|
|
|
for (unsigned i = 0; i < ARRAY_SIZE(shaders->nir); i++)
|
|
ralloc_free(shaders->nir[i]);
|
|
|
|
vk_pipeline_cache_object_finish(&shaders->base);
|
|
vk_free(&device->alloc, shaders);
|
|
}
|
|
|
|
const struct vk_pipeline_cache_object_ops tu_nir_shaders_ops = {
|
|
.serialize = tu_nir_shaders_serialize,
|
|
.deserialize = tu_nir_shaders_deserialize,
|
|
.destroy = tu_nir_shaders_destroy,
|
|
};
|
|
|
|
static struct tu_nir_shaders *
|
|
tu_nir_shaders_init(struct tu_device *dev, const void *key_data, size_t key_size)
|
|
{
|
|
VK_MULTIALLOC(ma);
|
|
VK_MULTIALLOC_DECL(&ma, struct tu_nir_shaders, shaders, 1);
|
|
VK_MULTIALLOC_DECL_SIZE(&ma, char, obj_key_data, key_size);
|
|
|
|
if (!vk_multialloc_zalloc(&ma, &dev->vk.alloc,
|
|
VK_SYSTEM_ALLOCATION_SCOPE_DEVICE))
|
|
return NULL;
|
|
|
|
memcpy(obj_key_data, key_data, key_size);
|
|
vk_pipeline_cache_object_init(&dev->vk, &shaders->base,
|
|
&tu_nir_shaders_ops, obj_key_data, key_size);
|
|
|
|
return shaders;
|
|
}
|
|
|
|
static bool
|
|
tu_nir_shaders_serialize(struct vk_pipeline_cache_object *object,
|
|
struct blob *blob)
|
|
{
|
|
struct tu_nir_shaders *shaders =
|
|
container_of(object, struct tu_nir_shaders, base);
|
|
|
|
for (unsigned i = 0; i < ARRAY_SIZE(shaders->nir); i++) {
|
|
if (shaders->nir[i]) {
|
|
blob_write_uint8(blob, 1);
|
|
nir_serialize(blob, shaders->nir[i], true);
|
|
} else {
|
|
blob_write_uint8(blob, 0);
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
static struct vk_pipeline_cache_object *
|
|
tu_nir_shaders_deserialize(struct vk_pipeline_cache *cache,
|
|
const void *key_data,
|
|
size_t key_size,
|
|
struct blob_reader *blob)
|
|
{
|
|
struct tu_device *dev =
|
|
container_of(cache->base.device, struct tu_device, vk);
|
|
struct tu_nir_shaders *shaders =
|
|
tu_nir_shaders_init(dev, key_data, key_size);
|
|
|
|
if (!shaders)
|
|
return NULL;
|
|
|
|
for (unsigned i = 0; i < ARRAY_SIZE(shaders->nir); i++) {
|
|
if (blob_read_uint8(blob)) {
|
|
shaders->nir[i] =
|
|
nir_deserialize(NULL, ir3_get_compiler_options(dev->compiler), blob);
|
|
}
|
|
}
|
|
|
|
return &shaders->base;
|
|
}
|
|
|
|
static struct tu_nir_shaders *
|
|
tu_nir_cache_lookup(struct vk_pipeline_cache *cache,
|
|
const void *key_data, size_t key_size,
|
|
bool *application_cache_hit)
|
|
{
|
|
struct vk_pipeline_cache_object *object =
|
|
vk_pipeline_cache_lookup_object(cache, key_data, key_size,
|
|
&tu_nir_shaders_ops, application_cache_hit);
|
|
if (object)
|
|
return container_of(object, struct tu_nir_shaders, base);
|
|
else
|
|
return NULL;
|
|
}
|
|
|
|
static struct tu_nir_shaders *
|
|
tu_nir_cache_insert(struct vk_pipeline_cache *cache,
|
|
struct tu_nir_shaders *shaders)
|
|
{
|
|
struct vk_pipeline_cache_object *object =
|
|
vk_pipeline_cache_add_object(cache, &shaders->base);
|
|
return container_of(object, struct tu_nir_shaders, base);
|
|
}
|
|
|
|
static VkResult
|
|
tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder,
|
|
struct tu_pipeline *pipeline)
|
|
{
|
|
VkResult result = VK_SUCCESS;
|
|
const VkPipelineShaderStageCreateInfo *stage_infos[MESA_SHADER_STAGES] = {
|
|
NULL
|
|
};
|
|
VkPipelineCreationFeedback pipeline_feedback = {
|
|
.flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
|
|
};
|
|
VkPipelineCreationFeedback stage_feedbacks[MESA_SHADER_STAGES] = { 0 };
|
|
|
|
const bool executable_info =
|
|
builder->create_flags &
|
|
VK_PIPELINE_CREATE_2_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR;
|
|
|
|
bool retain_nir =
|
|
builder->create_flags &
|
|
VK_PIPELINE_CREATE_2_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT;
|
|
|
|
int64_t pipeline_start = os_time_get_nano();
|
|
|
|
const VkPipelineCreationFeedbackCreateInfo *creation_feedback =
|
|
vk_find_struct_const(builder->create_info->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO);
|
|
|
|
bool must_compile = false;
|
|
for (uint32_t i = 0; i < builder->create_info->stageCount; i++) {
|
|
if (!(builder->active_stages & builder->create_info->pStages[i].stage))
|
|
continue;
|
|
|
|
mesa_shader_stage stage =
|
|
vk_to_mesa_shader_stage(builder->create_info->pStages[i].stage);
|
|
stage_infos[stage] = &builder->create_info->pStages[i];
|
|
must_compile = true;
|
|
}
|
|
|
|
/* Forward declare everything due to the goto usage */
|
|
nir_shader *nir[ARRAY_SIZE(stage_infos)] = { NULL };
|
|
struct tu_shader *shaders[ARRAY_SIZE(stage_infos)] = { NULL };
|
|
nir_shader *post_link_nir[ARRAY_SIZE(nir)] = { NULL };
|
|
char *nir_initial_disasm[ARRAY_SIZE(stage_infos)] = { NULL };
|
|
bool cache_hit = false;
|
|
|
|
struct tu_shader_key keys[ARRAY_SIZE(stage_infos)] = { };
|
|
for (mesa_shader_stage stage = MESA_SHADER_VERTEX;
|
|
stage < ARRAY_SIZE(keys); stage = (mesa_shader_stage) (stage+1)) {
|
|
const VkPipelineShaderStageRequiredSubgroupSizeCreateInfo *subgroup_info = NULL;
|
|
if (stage_infos[stage])
|
|
subgroup_info = vk_find_struct_const(stage_infos[stage],
|
|
PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO);
|
|
bool allow_varying_subgroup_size =
|
|
!stage_infos[stage] ||
|
|
(stage_infos[stage]->flags &
|
|
VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT_EXT);
|
|
bool require_full_subgroups =
|
|
stage_infos[stage] &&
|
|
(stage_infos[stage]->flags &
|
|
VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT);
|
|
tu_shader_key_subgroup_size(&keys[stage], allow_varying_subgroup_size,
|
|
require_full_subgroups, subgroup_info,
|
|
builder->device);
|
|
|
|
if (stage_infos[stage]) {
|
|
struct vk_pipeline_robustness_state rs;
|
|
vk_pipeline_robustness_state_fill(&builder->device->vk, &rs,
|
|
builder->create_info->pNext,
|
|
stage_infos[stage]->pNext);
|
|
tu_shader_key_robustness(&keys[stage], &rs);
|
|
if (builder->create_flags & VK_PIPELINE_CREATE_2_VIEW_INDEX_FROM_DEVICE_INDEX_BIT_KHR)
|
|
keys[stage].lower_view_index_to_device_index = true;
|
|
}
|
|
}
|
|
|
|
if ((builder->state &
|
|
VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) &&
|
|
builder->graphics_state.ial &&
|
|
builder->create_info->renderPass == VK_NULL_HANDLE) {
|
|
const struct vk_input_attachment_location_state *ial =
|
|
builder->graphics_state.ial;
|
|
|
|
keys[MESA_SHADER_FRAGMENT].dynamic_renderpass = true;
|
|
|
|
uint32_t attachments_referenced = 0;
|
|
|
|
if (ial->color_attachment_count == MESA_VK_COLOR_ATTACHMENT_COUNT_UNKNOWN) {
|
|
attachments_referenced |=
|
|
BITFIELD_MASK(MAX_RTS) << TU_DYN_INPUT_ATT_OFFSET;
|
|
} else {
|
|
for (unsigned i = 0; i < ial->color_attachment_count; i++) {
|
|
if (ial->color_map[i] != MESA_VK_ATTACHMENT_UNUSED) {
|
|
attachments_referenced |=
|
|
(1u << (ial->color_map[i] + TU_DYN_INPUT_ATT_OFFSET));
|
|
}
|
|
}
|
|
}
|
|
|
|
if (ial->depth_att != MESA_VK_ATTACHMENT_UNUSED) {
|
|
if (ial->depth_att == MESA_VK_ATTACHMENT_NO_INDEX)
|
|
attachments_referenced |= 1;
|
|
else
|
|
attachments_referenced |= 1u << (ial->depth_att + 1);
|
|
}
|
|
|
|
if (ial->stencil_att != MESA_VK_ATTACHMENT_UNUSED) {
|
|
if (ial->stencil_att == MESA_VK_ATTACHMENT_NO_INDEX)
|
|
attachments_referenced |= 1;
|
|
else
|
|
attachments_referenced |= 1u << (ial->stencil_att + 1);
|
|
}
|
|
|
|
keys[MESA_SHADER_FRAGMENT].read_only_input_attachments =
|
|
~attachments_referenced;
|
|
}
|
|
|
|
if (builder->state &
|
|
VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) {
|
|
keys[MESA_SHADER_FRAGMENT].custom_resolve =
|
|
builder->graphics_state.rp->custom_resolve;
|
|
}
|
|
|
|
if (builder->create_flags &
|
|
VK_PIPELINE_CREATE_2_LINK_TIME_OPTIMIZATION_BIT_EXT) {
|
|
for (unsigned i = 0; i < builder->num_libraries; i++) {
|
|
struct tu_graphics_lib_pipeline *library = builder->libraries[i];
|
|
|
|
for (unsigned j = 0; j < ARRAY_SIZE(library->shaders); j++) {
|
|
if (library->shaders[j].nir) {
|
|
assert(!nir[j]);
|
|
nir[j] = nir_shader_clone(builder->mem_ctx,
|
|
library->shaders[j].nir);
|
|
keys[j] = library->shaders[j].key;
|
|
must_compile = true;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
struct tu_nir_shaders *nir_shaders = NULL;
|
|
if (!must_compile)
|
|
goto done;
|
|
|
|
if (builder->state &
|
|
VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT) {
|
|
keys[MESA_SHADER_VERTEX].multiview_mask =
|
|
builder->graphics_state.rp->view_mask;
|
|
|
|
mesa_shader_stage last_pre_rast_stage = MESA_SHADER_VERTEX;
|
|
for (int i = MESA_SHADER_GEOMETRY; i >= MESA_SHADER_VERTEX; i--) {
|
|
if (nir[i]) {
|
|
last_pre_rast_stage = (mesa_shader_stage)i;
|
|
break;
|
|
}
|
|
}
|
|
|
|
keys[last_pre_rast_stage].fdm_per_layer = builder->fdm_per_layer;
|
|
}
|
|
|
|
if (builder->state & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) {
|
|
keys[MESA_SHADER_FRAGMENT].multiview_mask =
|
|
builder->graphics_state.rp->view_mask;
|
|
keys[MESA_SHADER_FRAGMENT].fragment_density_map =
|
|
builder->fragment_density_map;
|
|
keys[MESA_SHADER_FRAGMENT].fdm_per_layer =
|
|
builder->fdm_per_layer;
|
|
keys[MESA_SHADER_FRAGMENT].max_fdm_layers = builder->max_fdm_layers;
|
|
keys[MESA_SHADER_FRAGMENT].unscaled_input_fragcoord =
|
|
builder->unscaled_input_fragcoord;
|
|
|
|
const VkPipelineMultisampleStateCreateInfo *msaa_info =
|
|
builder->create_info->pMultisampleState;
|
|
|
|
/* The 1.3.215 spec says:
|
|
*
|
|
* Sample shading can be used to specify a minimum number of unique
|
|
* samples to process for each fragment. If sample shading is enabled,
|
|
* an implementation must provide a minimum of
|
|
*
|
|
* max(ceil(minSampleShadingFactor * totalSamples), 1)
|
|
*
|
|
* unique associated data for each fragment, where
|
|
* minSampleShadingFactor is the minimum fraction of sample shading.
|
|
*
|
|
* The definition is pretty much the same as OpenGL's GL_SAMPLE_SHADING.
|
|
* They both require unique associated data.
|
|
*
|
|
* There are discussions to change the definition, such that
|
|
* sampleShadingEnable does not imply unique associated data. Before the
|
|
* discussions are settled and before apps (i.e., ANGLE) are fixed to
|
|
* follow the new and incompatible definition, we should stick to the
|
|
* current definition.
|
|
*
|
|
* Note that ir3_shader_key::sample_shading is not actually used by ir3,
|
|
* just checked in tu6_emit_fs_inputs. We will also copy the value to
|
|
* tu_shader_key::force_sample_interp in a bit.
|
|
*/
|
|
keys[MESA_SHADER_FRAGMENT].force_sample_interp =
|
|
!builder->rasterizer_discard && msaa_info && msaa_info->sampleShadingEnable;
|
|
}
|
|
|
|
unsigned char pipeline_sha1[20];
|
|
tu_hash_shaders(pipeline_sha1, builder->create_flags, stage_infos, nir,
|
|
&builder->layout, keys, builder->state);
|
|
|
|
unsigned char nir_sha1[21];
|
|
memcpy(nir_sha1, pipeline_sha1, sizeof(pipeline_sha1));
|
|
nir_sha1[20] = 'N';
|
|
|
|
if (!executable_info) {
|
|
cache_hit = true;
|
|
bool application_cache_hit = false;
|
|
|
|
unsigned char shader_sha1[21];
|
|
memcpy(shader_sha1, pipeline_sha1, sizeof(pipeline_sha1));
|
|
|
|
for (mesa_shader_stage stage = MESA_SHADER_VERTEX; stage < ARRAY_SIZE(nir);
|
|
stage = (mesa_shader_stage) (stage + 1)) {
|
|
if (stage_infos[stage] || nir[stage]) {
|
|
bool shader_application_cache_hit;
|
|
shader_sha1[20] = (unsigned char) stage;
|
|
shaders[stage] =
|
|
tu_pipeline_cache_lookup(builder->cache, &shader_sha1,
|
|
sizeof(shader_sha1),
|
|
&shader_application_cache_hit);
|
|
if (!shaders[stage]) {
|
|
cache_hit = false;
|
|
break;
|
|
}
|
|
application_cache_hit &= shader_application_cache_hit;
|
|
}
|
|
}
|
|
|
|
/* If the user asks us to keep the NIR around, we need to have it for a
|
|
* successful cache hit. If we only have a "partial" cache hit, then we
|
|
* still need to recompile in order to get the NIR.
|
|
*/
|
|
if (cache_hit &&
|
|
(builder->create_flags &
|
|
VK_PIPELINE_CREATE_2_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT)) {
|
|
bool nir_application_cache_hit = false;
|
|
nir_shaders =
|
|
tu_nir_cache_lookup(builder->cache, &nir_sha1,
|
|
sizeof(nir_sha1),
|
|
&nir_application_cache_hit);
|
|
|
|
application_cache_hit &= nir_application_cache_hit;
|
|
cache_hit &= !!nir_shaders;
|
|
}
|
|
|
|
if (application_cache_hit && builder->cache != builder->device->mem_cache) {
|
|
pipeline_feedback.flags |=
|
|
VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
|
|
}
|
|
}
|
|
|
|
if (!cache_hit) {
|
|
if (builder->create_flags &
|
|
VK_PIPELINE_CREATE_2_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_KHR) {
|
|
return VK_PIPELINE_COMPILE_REQUIRED;
|
|
}
|
|
|
|
result = tu_compile_shaders(builder->device,
|
|
builder->create_flags,
|
|
stage_infos,
|
|
nir,
|
|
keys,
|
|
&builder->layout,
|
|
pipeline_sha1,
|
|
shaders,
|
|
executable_info ? nir_initial_disasm : NULL,
|
|
pipeline->executables_mem_ctx,
|
|
retain_nir ? post_link_nir : NULL,
|
|
stage_feedbacks);
|
|
|
|
if (result != VK_SUCCESS)
|
|
goto fail;
|
|
|
|
if (retain_nir) {
|
|
nir_shaders =
|
|
tu_nir_shaders_init(builder->device, &nir_sha1, sizeof(nir_sha1));
|
|
for (mesa_shader_stage stage = MESA_SHADER_VERTEX;
|
|
stage < ARRAY_SIZE(nir); stage = (mesa_shader_stage) (stage + 1)) {
|
|
if (!post_link_nir[stage])
|
|
continue;
|
|
|
|
nir_shaders->nir[stage] = post_link_nir[stage];
|
|
}
|
|
|
|
nir_shaders = tu_nir_cache_insert(builder->cache, nir_shaders);
|
|
}
|
|
|
|
for (mesa_shader_stage stage = MESA_SHADER_VERTEX; stage < ARRAY_SIZE(nir);
|
|
stage = (mesa_shader_stage) (stage + 1)) {
|
|
if (!nir[stage])
|
|
continue;
|
|
|
|
shaders[stage] = tu_pipeline_cache_insert(builder->cache, shaders[stage]);
|
|
}
|
|
}
|
|
|
|
done:
|
|
|
|
/* Create empty shaders which contain the draw states to initialize
|
|
* registers for unused shader stages.
|
|
*/
|
|
if (builder->state &
|
|
VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT) {
|
|
if (!shaders[MESA_SHADER_TESS_CTRL]) {
|
|
shaders[MESA_SHADER_TESS_CTRL] = builder->device->empty_tcs;
|
|
vk_pipeline_cache_object_ref(&shaders[MESA_SHADER_TESS_CTRL]->base);
|
|
}
|
|
if (!shaders[MESA_SHADER_TESS_EVAL]) {
|
|
shaders[MESA_SHADER_TESS_EVAL] = builder->device->empty_tes;
|
|
vk_pipeline_cache_object_ref(&shaders[MESA_SHADER_TESS_EVAL]->base);
|
|
}
|
|
if (!shaders[MESA_SHADER_GEOMETRY]) {
|
|
shaders[MESA_SHADER_GEOMETRY] = builder->device->empty_gs;
|
|
vk_pipeline_cache_object_ref(&shaders[MESA_SHADER_GEOMETRY]->base);
|
|
}
|
|
}
|
|
|
|
if (builder->state &
|
|
VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) {
|
|
if (!shaders[MESA_SHADER_FRAGMENT]) {
|
|
shaders[MESA_SHADER_FRAGMENT] =
|
|
builder->fragment_density_map ?
|
|
builder->device->empty_fs_fdm : builder->device->empty_fs;
|
|
vk_pipeline_cache_object_ref(&shaders[MESA_SHADER_FRAGMENT]->base);
|
|
}
|
|
}
|
|
|
|
for (mesa_shader_stage stage = MESA_SHADER_VERTEX;
|
|
stage < ARRAY_SIZE(nir); stage = (mesa_shader_stage) (stage + 1)) {
|
|
if (shaders[stage] && shaders[stage]->variant) {
|
|
tu_append_executable(pipeline, shaders[stage]->variant,
|
|
nir_initial_disasm[stage]);
|
|
}
|
|
}
|
|
|
|
/* We may have deduplicated a cache entry, in which case our original
|
|
* post_link_nir may be gone.
|
|
*/
|
|
if (nir_shaders) {
|
|
for (mesa_shader_stage stage = MESA_SHADER_VERTEX;
|
|
stage < ARRAY_SIZE(nir); stage = (mesa_shader_stage) (stage + 1)) {
|
|
if (nir_shaders->nir[stage]) {
|
|
post_link_nir[stage] = nir_shaders->nir[stage];
|
|
}
|
|
}
|
|
}
|
|
|
|
/* In the case where we're building a library without link-time
|
|
* optimization but with sub-libraries that retain LTO info, we should
|
|
* retain it ourselves in case another pipeline includes us with LTO.
|
|
*/
|
|
for (unsigned i = 0; i < builder->num_libraries; i++) {
|
|
struct tu_graphics_lib_pipeline *library = builder->libraries[i];
|
|
for (mesa_shader_stage stage = MESA_SHADER_VERTEX;
|
|
stage < ARRAY_SIZE(library->shaders);
|
|
stage = (mesa_shader_stage) (stage + 1)) {
|
|
if (!post_link_nir[stage] && library->shaders[stage].nir) {
|
|
post_link_nir[stage] = library->shaders[stage].nir;
|
|
keys[stage] = library->shaders[stage].key;
|
|
}
|
|
|
|
if (!shaders[stage] && library->base.shaders[stage]) {
|
|
shaders[stage] = library->base.shaders[stage];
|
|
vk_pipeline_cache_object_ref(&shaders[stage]->base);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (shaders[MESA_SHADER_VERTEX]) {
|
|
const struct ir3_shader_variant *vs =
|
|
shaders[MESA_SHADER_VERTEX]->variant;
|
|
|
|
if (!vs->stream_output.num_outputs && ir3_has_binning_vs(&vs->key)) {
|
|
tu_append_executable(pipeline, vs->binning, NULL);
|
|
}
|
|
}
|
|
|
|
if (pipeline_contains_all_shader_state(pipeline)) {
|
|
/* It doesn't make much sense to use RETAIN_LINK_TIME_OPTIMIZATION_INFO
|
|
* when compiling all stages, but make sure we don't leak.
|
|
*/
|
|
if (nir_shaders)
|
|
vk_pipeline_cache_object_unref(&builder->device->vk,
|
|
&nir_shaders->base);
|
|
} else {
|
|
struct tu_graphics_lib_pipeline *library =
|
|
tu_pipeline_to_graphics_lib(pipeline);
|
|
library->nir_shaders = nir_shaders;
|
|
for (mesa_shader_stage stage = MESA_SHADER_VERTEX;
|
|
stage < ARRAY_SIZE(library->shaders);
|
|
stage = (mesa_shader_stage) (stage + 1)) {
|
|
library->shaders[stage].nir = post_link_nir[stage];
|
|
library->shaders[stage].key = keys[stage];
|
|
}
|
|
}
|
|
|
|
for (mesa_shader_stage stage = MESA_SHADER_VERTEX;
|
|
stage < ARRAY_SIZE(shaders); stage = (mesa_shader_stage) (stage + 1)) {
|
|
pipeline->shaders[stage] = shaders[stage];
|
|
if (shaders[stage])
|
|
pipeline->active_desc_sets |= shaders[stage]->active_desc_sets;
|
|
}
|
|
|
|
pipeline_feedback.duration = os_time_get_nano() - pipeline_start;
|
|
if (creation_feedback) {
|
|
*creation_feedback->pPipelineCreationFeedback = pipeline_feedback;
|
|
|
|
for (uint32_t i = 0; i < creation_feedback->pipelineStageCreationFeedbackCount; i++) {
|
|
mesa_shader_stage s =
|
|
vk_to_mesa_shader_stage(builder->create_info->pStages[i].stage);
|
|
creation_feedback->pPipelineStageCreationFeedbacks[i] = stage_feedbacks[s];
|
|
}
|
|
}
|
|
|
|
return VK_SUCCESS;
|
|
|
|
fail:
|
|
if (nir_shaders)
|
|
vk_pipeline_cache_object_unref(&builder->device->vk,
|
|
&nir_shaders->base);
|
|
|
|
return result;
|
|
}
|
|
|
|
static void
|
|
tu_pipeline_builder_parse_libraries(struct tu_pipeline_builder *builder,
|
|
struct tu_pipeline *pipeline)
|
|
{
|
|
const VkPipelineLibraryCreateInfoKHR *library_info =
|
|
vk_find_struct_const(builder->create_info->pNext,
|
|
PIPELINE_LIBRARY_CREATE_INFO_KHR);
|
|
|
|
if (library_info) {
|
|
assert(library_info->libraryCount <= MAX_LIBRARIES);
|
|
builder->num_libraries = library_info->libraryCount;
|
|
for (unsigned i = 0; i < library_info->libraryCount; i++) {
|
|
VK_FROM_HANDLE(tu_pipeline, library, library_info->pLibraries[i]);
|
|
builder->libraries[i] = tu_pipeline_to_graphics_lib(library);
|
|
}
|
|
}
|
|
|
|
/* Merge in the state from libraries. The program state is a bit special
|
|
* and is handled separately.
|
|
*/
|
|
if (pipeline->type == TU_PIPELINE_GRAPHICS_LIB)
|
|
tu_pipeline_to_graphics_lib(pipeline)->state = builder->state;
|
|
for (unsigned i = 0; i < builder->num_libraries; i++) {
|
|
struct tu_graphics_lib_pipeline *library = builder->libraries[i];
|
|
if (pipeline->type == TU_PIPELINE_GRAPHICS_LIB)
|
|
tu_pipeline_to_graphics_lib(pipeline)->state |= library->state;
|
|
|
|
if (library->state &
|
|
VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT) {
|
|
pipeline->output = library->base.output;
|
|
pipeline->lrz_blend.lrz_blend_status =
|
|
library->base.lrz_blend.lrz_blend_status;
|
|
pipeline->lrz_blend.valid |= library->base.lrz_blend.valid;
|
|
}
|
|
|
|
if ((library->state &
|
|
VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) &&
|
|
(library->state &
|
|
VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT)) {
|
|
pipeline->prim_order = library->base.prim_order;
|
|
}
|
|
|
|
if (library->base.bandwidth.valid)
|
|
pipeline->bandwidth = library->base.bandwidth;
|
|
|
|
if (library->base.disable_fs.valid)
|
|
pipeline->disable_fs = library->base.disable_fs;
|
|
|
|
pipeline->set_state_mask |= library->base.set_state_mask;
|
|
|
|
u_foreach_bit (i, library->base.set_state_mask) {
|
|
pipeline->dynamic_state[i] = library->base.dynamic_state[i];
|
|
}
|
|
|
|
if (contains_all_shader_state(library->state)) {
|
|
pipeline->program = library->base.program;
|
|
pipeline->load_state = library->base.load_state;
|
|
for (unsigned i = 0; i < ARRAY_SIZE(pipeline->shaders); i++) {
|
|
if (library->base.shaders[i]) {
|
|
pipeline->shaders[i] = library->base.shaders[i];
|
|
vk_pipeline_cache_object_ref(&pipeline->shaders[i]->base);
|
|
}
|
|
}
|
|
}
|
|
|
|
BITSET_OR(pipeline->static_state_mask, pipeline->static_state_mask,
|
|
library->base.static_state_mask);
|
|
|
|
vk_graphics_pipeline_state_merge(&builder->graphics_state,
|
|
&library->graphics_state);
|
|
}
|
|
}
|
|
|
|
static void
|
|
tu_pipeline_builder_parse_layout(struct tu_pipeline_builder *builder,
|
|
struct tu_pipeline *pipeline)
|
|
{
|
|
VK_FROM_HANDLE(tu_pipeline_layout, layout, builder->create_info->layout);
|
|
|
|
if (layout) {
|
|
/* Note: it's still valid to have a layout even if there are libraries.
|
|
* This allows the app to e.g. overwrite an INDEPENDENT_SET layout with
|
|
* a non-INDEPENDENT_SET layout which may make us use a faster path,
|
|
* currently this just affects dynamic offset descriptors.
|
|
*/
|
|
builder->layout = *layout;
|
|
} else {
|
|
for (unsigned i = 0; i < builder->num_libraries; i++) {
|
|
struct tu_graphics_lib_pipeline *library = builder->libraries[i];
|
|
builder->layout.num_sets = MAX2(builder->layout.num_sets,
|
|
library->num_sets);
|
|
assert(builder->layout.num_sets <= builder->device->physical_device->usable_sets);
|
|
for (unsigned j = 0; j < library->num_sets; j++) {
|
|
builder->layout.set[i].layout = library->layouts[i];
|
|
}
|
|
|
|
builder->layout.push_constant_size = library->push_constant_size;
|
|
}
|
|
|
|
tu_pipeline_layout_init(&builder->layout);
|
|
}
|
|
|
|
if (pipeline->type == TU_PIPELINE_GRAPHICS_LIB) {
|
|
struct tu_graphics_lib_pipeline *library =
|
|
tu_pipeline_to_graphics_lib(pipeline);
|
|
library->num_sets = builder->layout.num_sets;
|
|
for (unsigned i = 0; i < library->num_sets; i++) {
|
|
library->layouts[i] = builder->layout.set[i].layout;
|
|
if (library->layouts[i])
|
|
vk_descriptor_set_layout_ref(&library->layouts[i]->vk);
|
|
}
|
|
library->push_constant_size = builder->layout.push_constant_size;
|
|
}
|
|
}
|
|
|
|
static void
|
|
tu_pipeline_set_linkage(struct tu_program_descriptor_linkage *link,
|
|
struct tu_const_state *const_state,
|
|
const struct ir3_shader_variant *v)
|
|
{
|
|
link->const_state = *ir3_const_state(v);
|
|
link->tu_const_state = *const_state;
|
|
link->constlen = v->constlen;
|
|
}
|
|
|
|
template <chip CHIP>
|
|
static void
|
|
tu_emit_program_state(struct tu_cs *sub_cs,
|
|
struct tu_program_state *prog,
|
|
struct tu_shader **shaders)
|
|
{
|
|
struct tu_device *dev = sub_cs->device;
|
|
struct tu_cs prog_cs;
|
|
|
|
const struct ir3_shader_variant *variants[MESA_SHADER_STAGES];
|
|
struct tu_draw_state draw_states[MESA_SHADER_STAGES];
|
|
|
|
for (mesa_shader_stage stage = MESA_SHADER_VERTEX;
|
|
stage < ARRAY_SIZE(variants); stage = (mesa_shader_stage) (stage+1)) {
|
|
variants[stage] = shaders[stage] ? shaders[stage]->variant : NULL;
|
|
}
|
|
|
|
uint32_t safe_variants =
|
|
ir3_trim_constlen(variants, dev->compiler);
|
|
|
|
unsigned dynamic_descriptor_sizes[MAX_SETS] = { };
|
|
|
|
for (mesa_shader_stage stage = MESA_SHADER_VERTEX;
|
|
stage < ARRAY_SIZE(variants); stage = (mesa_shader_stage) (stage+1)) {
|
|
if (shaders[stage]) {
|
|
if (safe_variants & (1u << stage)) {
|
|
variants[stage] = shaders[stage]->safe_const_variant;
|
|
draw_states[stage] = shaders[stage]->safe_const_state;
|
|
} else {
|
|
draw_states[stage] = shaders[stage]->state;
|
|
}
|
|
|
|
for (unsigned i = 0; i < MAX_SETS; i++) {
|
|
if (shaders[stage]->dynamic_descriptor_sizes[i] >= 0) {
|
|
dynamic_descriptor_sizes[i] =
|
|
shaders[stage]->dynamic_descriptor_sizes[i];
|
|
}
|
|
}
|
|
|
|
if (variants[stage]) {
|
|
memcpy(prog->stage_sha1[stage], variants[stage]->sha1_str,
|
|
sizeof(variants[stage]->sha1_str));
|
|
}
|
|
}
|
|
}
|
|
|
|
for (unsigned i = 0; i < ARRAY_SIZE(variants); i++) {
|
|
if (!variants[i])
|
|
continue;
|
|
|
|
tu_pipeline_set_linkage(&prog->link[i],
|
|
&shaders[i]->const_state,
|
|
variants[i]);
|
|
|
|
struct tu_push_constant_range *push_consts =
|
|
&shaders[i]->const_state.push_consts;
|
|
if (push_consts->type == IR3_PUSH_CONSTS_SHARED ||
|
|
push_consts->type == IR3_PUSH_CONSTS_SHARED_PREAMBLE) {
|
|
prog->shared_consts = *push_consts;
|
|
}
|
|
|
|
if (variants[i]->info.uses_ray_intersection)
|
|
prog->uses_ray_intersection = true;
|
|
}
|
|
|
|
unsigned dynamic_descriptor_offset = 0;
|
|
for (unsigned i = 0; i < MAX_SETS; i++) {
|
|
prog->dynamic_descriptor_offsets[i] = dynamic_descriptor_offset;
|
|
dynamic_descriptor_offset += dynamic_descriptor_sizes[i];
|
|
}
|
|
|
|
/* Emit HLSQ_xS_CNTL/HLSQ_SP_xS_CONFIG *first*, before emitting anything
|
|
* else that could depend on that state (like push constants)
|
|
*
|
|
* Note also that this always uses the full VS even in binning pass. The
|
|
* binning pass variant has the same const layout as the full VS, and
|
|
* the constlen for the VS will be the same or greater than the constlen
|
|
* for the binning pass variant. It is required that the constlen state
|
|
* matches between binning and draw passes, as some parts of the push
|
|
* consts are emitted in state groups that are shared between the binning
|
|
* and draw passes.
|
|
*/
|
|
tu_cs_begin_sub_stream(sub_cs, 512, &prog_cs);
|
|
tu6_emit_program_config<CHIP>(&prog_cs, prog, shaders, variants);
|
|
prog->config_state = tu_cs_end_draw_state(sub_cs, &prog_cs);
|
|
|
|
prog->vs_state = draw_states[MESA_SHADER_VERTEX];
|
|
|
|
/* Don't use the binning pass variant when GS is present because we don't
|
|
* support compiling correct binning pass variants with GS.
|
|
*/
|
|
if (variants[MESA_SHADER_GEOMETRY]) {
|
|
prog->vs_binning_state = prog->vs_state;
|
|
} else {
|
|
prog->vs_binning_state =
|
|
(safe_variants & (1u << MESA_SHADER_VERTEX))
|
|
? shaders[MESA_SHADER_VERTEX]->safe_const_binning_state
|
|
: shaders[MESA_SHADER_VERTEX]->binning_state;
|
|
}
|
|
|
|
prog->hs_state = draw_states[MESA_SHADER_TESS_CTRL];
|
|
prog->ds_state = draw_states[MESA_SHADER_TESS_EVAL];
|
|
prog->gs_state = draw_states[MESA_SHADER_GEOMETRY];
|
|
prog->gs_binning_state =
|
|
(safe_variants & (1u << MESA_SHADER_GEOMETRY)) ?
|
|
shaders[MESA_SHADER_GEOMETRY]->safe_const_binning_state :
|
|
shaders[MESA_SHADER_GEOMETRY]->binning_state;
|
|
prog->fs_state = draw_states[MESA_SHADER_FRAGMENT];
|
|
|
|
const struct ir3_shader_variant *vs = variants[MESA_SHADER_VERTEX];
|
|
const struct ir3_shader_variant *hs = variants[MESA_SHADER_TESS_CTRL];
|
|
const struct ir3_shader_variant *ds = variants[MESA_SHADER_TESS_EVAL];
|
|
const struct ir3_shader_variant *gs = variants[MESA_SHADER_GEOMETRY];
|
|
const struct ir3_shader_variant *fs = variants[MESA_SHADER_FRAGMENT];
|
|
|
|
tu_cs_begin_sub_stream(sub_cs, 512, &prog_cs);
|
|
tu6_emit_vpc<CHIP>(&prog_cs, vs, hs, ds, gs, fs);
|
|
prog->vpc_state = tu_cs_end_draw_state(sub_cs, &prog_cs);
|
|
|
|
const struct ir3_shader_variant *last_variant;
|
|
const struct tu_shader *last_shader;
|
|
if (gs) {
|
|
last_shader = shaders[MESA_SHADER_GEOMETRY];
|
|
last_variant = gs;
|
|
} else if (ds) {
|
|
last_shader = shaders[MESA_SHADER_TESS_EVAL];
|
|
last_variant = ds;
|
|
} else {
|
|
last_shader = shaders[MESA_SHADER_VERTEX];
|
|
last_variant = vs;
|
|
}
|
|
|
|
prog->per_view_viewport =
|
|
!last_variant->writes_viewport &&
|
|
shaders[MESA_SHADER_FRAGMENT]->fs.has_fdm &&
|
|
dev->physical_device->info->props.has_per_view_viewport;
|
|
prog->per_layer_viewport = last_shader->per_layer_viewport;
|
|
prog->fake_single_viewport = prog->per_view_viewport ||
|
|
prog->per_layer_viewport;
|
|
prog->writes_shading_rate = last_variant->writes_shading_rate;
|
|
prog->reads_shading_rate = fs->reads_shading_rate;
|
|
}
|
|
|
|
static const enum mesa_vk_dynamic_graphics_state tu_vertex_input_state[] = {
|
|
MESA_VK_DYNAMIC_VI,
|
|
};
|
|
|
|
template <chip CHIP>
|
|
static unsigned
|
|
tu6_vertex_input_size(struct tu_device *dev,
|
|
const struct vk_vertex_input_state *vi)
|
|
{
|
|
return 1 + 2 * util_last_bit(vi->attributes_valid);
|
|
}
|
|
|
|
template <chip CHIP>
|
|
static void
|
|
tu6_emit_vertex_input(struct tu_cs *cs,
|
|
const struct vk_vertex_input_state *vi)
|
|
{
|
|
unsigned attr_count = util_last_bit(vi->attributes_valid);
|
|
if (attr_count != 0)
|
|
tu_cs_emit_pkt4(cs, REG_A6XX_VFD_FETCH_INSTR_INSTR(0), attr_count * 2);
|
|
|
|
for (uint32_t loc = 0; loc < attr_count; loc++) {
|
|
const struct vk_vertex_attribute_state *attr = &vi->attributes[loc];
|
|
|
|
if (vi->attributes_valid & (1u << loc)) {
|
|
const struct vk_vertex_binding_state *binding =
|
|
&vi->bindings[attr->binding];
|
|
|
|
enum pipe_format pipe_format = vk_format_to_pipe_format(attr->format);
|
|
const struct tu_native_format format = tu6_format_vtx(pipe_format);
|
|
tu_cs_emit(cs, A6XX_VFD_FETCH_INSTR_INSTR(0,
|
|
.idx = attr->binding,
|
|
.offset = attr->offset,
|
|
.instanced = binding->input_rate == VK_VERTEX_INPUT_RATE_INSTANCE,
|
|
.format = format.fmt,
|
|
.swap = format.swap,
|
|
.unk30 = 1,
|
|
._float = !util_format_is_pure_integer(pipe_format)).value);
|
|
tu_cs_emit(cs, A6XX_VFD_FETCH_INSTR_STEP_RATE(0, binding->divisor).value);
|
|
} else {
|
|
tu_cs_emit(cs, 0);
|
|
tu_cs_emit(cs, 0);
|
|
}
|
|
}
|
|
}
|
|
|
|
static const enum mesa_vk_dynamic_graphics_state tu_vertex_stride_state[] = {
|
|
MESA_VK_DYNAMIC_VI_BINDINGS_VALID,
|
|
MESA_VK_DYNAMIC_VI_BINDING_STRIDES,
|
|
};
|
|
|
|
template <chip CHIP>
|
|
static unsigned
|
|
tu6_vertex_stride_size(struct tu_device *dev,
|
|
const struct vk_vertex_input_state *vi)
|
|
{
|
|
return 1 + 2 * util_last_bit(vi->bindings_valid);
|
|
}
|
|
|
|
template <chip CHIP>
|
|
static void
|
|
tu6_emit_vertex_stride(struct tu_cs *cs, const struct vk_vertex_input_state *vi)
|
|
{
|
|
if (vi->bindings_valid) {
|
|
unsigned bindings_count = util_last_bit(vi->bindings_valid);
|
|
tu_crb crb = cs->crb(bindings_count);
|
|
for (unsigned i = 0; i < bindings_count; i++) {
|
|
crb.add(A6XX_VFD_VERTEX_BUFFER_STRIDE(
|
|
i, .vfd_vertex_buffer_stride = vi->bindings[i].stride));
|
|
}
|
|
}
|
|
}
|
|
|
|
template <chip CHIP>
|
|
static unsigned
|
|
tu6_vertex_stride_size_dyn(struct tu_device *dev,
|
|
const uint16_t *vi_binding_stride,
|
|
uint32_t bindings_valid)
|
|
{
|
|
return 1 + 2 * util_last_bit(bindings_valid);
|
|
}
|
|
|
|
template <chip CHIP>
|
|
static void
|
|
tu6_emit_vertex_stride_dyn(struct tu_cs *cs, const uint16_t *vi_binding_stride,
|
|
uint32_t bindings_valid)
|
|
{
|
|
if (bindings_valid) {
|
|
unsigned bindings_count = util_last_bit(bindings_valid);
|
|
tu_crb crb = cs->crb(bindings_count);
|
|
for (unsigned i = 0; i < bindings_count; i++) {
|
|
crb.add(A6XX_VFD_VERTEX_BUFFER_STRIDE(
|
|
i, .vfd_vertex_buffer_stride = vi_binding_stride[i]));
|
|
}
|
|
}
|
|
}
|
|
|
|
static const enum mesa_vk_dynamic_graphics_state tu_viewport_state[] = {
|
|
MESA_VK_DYNAMIC_VP_VIEWPORTS,
|
|
MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT,
|
|
MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE,
|
|
MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE,
|
|
};
|
|
|
|
template <chip CHIP>
|
|
static unsigned
|
|
tu6_viewport_size(struct tu_device *dev,
|
|
const struct vk_viewport_state *vp,
|
|
const struct vk_rasterization_state *rs)
|
|
{
|
|
return 1 + vp->viewport_count * 6 + 1 + vp->viewport_count * 2 +
|
|
1 + vp->viewport_count * 2 + 5;
|
|
}
|
|
|
|
template <chip CHIP>
|
|
static void
|
|
tu6_emit_viewport(struct tu_cs *cs,
|
|
const struct vk_viewport_state *vp,
|
|
const struct vk_rasterization_state *rs)
|
|
{
|
|
VkExtent2D guardband = {511, 511};
|
|
|
|
tu_cs_emit_pkt4(cs, GRAS_CL_VIEWPORT_XOFFSET(CHIP, 0).reg, vp->viewport_count * 6);
|
|
for (uint32_t i = 0; i < vp->viewport_count; i++) {
|
|
const VkViewport *viewport = &vp->viewports[i];
|
|
float offsets[3];
|
|
float scales[3];
|
|
scales[0] = viewport->width / 2.0f;
|
|
scales[1] = viewport->height / 2.0f;
|
|
if (vp->depth_clip_negative_one_to_one) {
|
|
scales[2] = 0.5 * (viewport->maxDepth - viewport->minDepth);
|
|
} else {
|
|
scales[2] = viewport->maxDepth - viewport->minDepth;
|
|
}
|
|
|
|
offsets[0] = viewport->x + scales[0];
|
|
offsets[1] = viewport->y + scales[1];
|
|
if (vp->depth_clip_negative_one_to_one) {
|
|
offsets[2] = 0.5 * (viewport->minDepth + viewport->maxDepth);
|
|
} else {
|
|
offsets[2] = viewport->minDepth;
|
|
}
|
|
|
|
for (uint32_t j = 0; j < 3; j++) {
|
|
tu_cs_emit(cs, fui(offsets[j]));
|
|
tu_cs_emit(cs, fui(scales[j]));
|
|
}
|
|
|
|
guardband.width =
|
|
MIN2(guardband.width, fd_calc_guardband(offsets[0], scales[0], false));
|
|
guardband.height =
|
|
MIN2(guardband.height, fd_calc_guardband(offsets[1], scales[1], false));
|
|
}
|
|
|
|
tu_cs_emit_pkt4(cs, GRAS_SC_VIEWPORT_SCISSOR_TL(CHIP, 0).reg, vp->viewport_count * 2);
|
|
for (uint32_t i = 0; i < vp->viewport_count; i++) {
|
|
const VkViewport *viewport = &vp->viewports[i];
|
|
VkOffset2D min;
|
|
VkOffset2D max;
|
|
min.x = (int32_t) viewport->x;
|
|
max.x = (int32_t) ceilf(viewport->x + viewport->width);
|
|
if (viewport->height >= 0.0f) {
|
|
min.y = (int32_t) viewport->y;
|
|
max.y = (int32_t) ceilf(viewport->y + viewport->height);
|
|
} else {
|
|
min.y = (int32_t)(viewport->y + viewport->height);
|
|
max.y = (int32_t) ceilf(viewport->y);
|
|
}
|
|
/* the spec allows viewport->height to be 0.0f */
|
|
if (min.y == max.y)
|
|
max.y++;
|
|
/* allow viewport->width = 0.0f for un-initialized viewports: */
|
|
if (min.x == max.x)
|
|
max.x++;
|
|
|
|
min.x = MAX2(min.x, 0);
|
|
min.y = MAX2(min.y, 0);
|
|
max.x = MAX2(max.x, 1);
|
|
max.y = MAX2(max.y, 1);
|
|
|
|
assert(min.x < max.x);
|
|
assert(min.y < max.y);
|
|
|
|
tu_cs_emit(
|
|
cs, GRAS_SC_VIEWPORT_SCISSOR_TL(CHIP, 0, .x = min.x, .y = min.y).value);
|
|
tu_cs_emit(
|
|
cs, GRAS_SC_VIEWPORT_SCISSOR_BR(CHIP, 0, .x = max.x - 1, .y = max.y - 1)
|
|
.value);
|
|
}
|
|
|
|
/* A7XX+ doesn't clamp to [0,1] with disabled depth clamp, to support
|
|
* VK_EXT_depth_clamp_zero_one we have to always enable clamp and manually
|
|
* set range to [0,1] when rs->depth_clamp_enable is false.
|
|
*/
|
|
bool zero_one_depth_clamp = CHIP >= A7XX && !rs->depth_clamp_enable;
|
|
|
|
tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_VIEWPORT_ZCLAMP(0), vp->viewport_count * 2);
|
|
for (uint32_t i = 0; i < vp->viewport_count; i++) {
|
|
const VkViewport *viewport = &vp->viewports[i];
|
|
if (zero_one_depth_clamp) {
|
|
tu_cs_emit(cs, fui(0.0f));
|
|
tu_cs_emit(cs, fui(1.0f));
|
|
} else {
|
|
tu_cs_emit(cs, fui(MIN2(viewport->minDepth, viewport->maxDepth)));
|
|
tu_cs_emit(cs, fui(MAX2(viewport->minDepth, viewport->maxDepth)));
|
|
}
|
|
}
|
|
tu_cs_emit_regs(cs,
|
|
GRAS_CL_GUARDBAND_CLIP_ADJ(CHIP, .horz = guardband.width, .vert = guardband.height));
|
|
|
|
/* TODO: what to do about this and multi viewport ? */
|
|
float z_clamp_min = vp->viewport_count ? MIN2(vp->viewports[0].minDepth, vp->viewports[0].maxDepth) : 0;
|
|
float z_clamp_max = vp->viewport_count ? MAX2(vp->viewports[0].minDepth, vp->viewports[0].maxDepth) : 0;
|
|
if (zero_one_depth_clamp) {
|
|
z_clamp_min = 0.0f;
|
|
z_clamp_max = 1.0f;
|
|
}
|
|
|
|
tu_cs_emit_regs(cs,
|
|
RB_VIEWPORT_ZCLAMP_MIN(CHIP, z_clamp_min),
|
|
RB_VIEWPORT_ZCLAMP_MAX(CHIP, z_clamp_max));
|
|
}
|
|
|
|
struct apply_viewport_state {
|
|
struct vk_viewport_state vp;
|
|
struct vk_rasterization_state rs;
|
|
/* See tu_render_pass_state::shared_viewport */
|
|
bool share_scale;
|
|
/* See tu_pipeline::fake_single_viewport */
|
|
bool fake_single_viewport;
|
|
bool custom_resolve;
|
|
};
|
|
|
|
/* It's a hardware restriction that the window offset (i.e. common_bin_offset)
|
|
* must be the same for all views. This means that rendering coordinates
|
|
* cannot be a simple scaling of framebuffer coordinates, because this would
|
|
* require us to scale the window offset and the scale may be different per
|
|
* view. Instead we have to apply a per-bin offset to the rendering coordinate
|
|
* transform to make sure that the window offset maps to the per-view bin
|
|
* coordinate, which will be the same if there is no offset. Specifically we
|
|
* need an offset o to the transform:
|
|
*
|
|
* x' = s * x + o
|
|
*
|
|
* so that when we plug in the per-view bin start b_s and the common window
|
|
* offset b_cs:
|
|
*
|
|
* b_cs = s * b_s + o
|
|
*
|
|
* and we get:
|
|
*
|
|
* o = b_cs - s * b_s
|
|
*
|
|
* We use this form exactly, because we know the bin start is a multiple of
|
|
* the frag area so s * b_s is an integer and we can compute an exact result
|
|
* easily. We also have to make sure that the bin offset is a multiple of the
|
|
* frag area by restricting the frag area.
|
|
*/
|
|
|
|
VkOffset2D
|
|
tu_fdm_per_bin_offset(VkExtent2D frag_area, VkRect2D bin,
|
|
VkOffset2D common_bin_offset)
|
|
{
|
|
assert(bin.offset.x % frag_area.width == 0);
|
|
assert(bin.offset.y % frag_area.height == 0);
|
|
|
|
return (VkOffset2D) {
|
|
common_bin_offset.x - bin.offset.x / frag_area.width,
|
|
common_bin_offset.y - bin.offset.y / frag_area.height
|
|
};
|
|
}
|
|
|
|
static void
|
|
fdm_apply_viewports(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
|
|
VkOffset2D common_bin_offset,
|
|
const VkOffset2D *hw_viewport_offsets,
|
|
unsigned views,
|
|
const VkExtent2D *frag_areas, const VkRect2D *bins,
|
|
bool binning)
|
|
{
|
|
const struct apply_viewport_state *state =
|
|
(const struct apply_viewport_state *)data;
|
|
|
|
struct vk_viewport_state vp = state->vp;
|
|
|
|
for (unsigned i = 0; i < state->vp.viewport_count; i++) {
|
|
/* Note: If we're using shared scaling, the scale should already be the
|
|
* same across all views, we can pick any view. However the number
|
|
* of viewports and number of views is not guaranteed the same, so we
|
|
* need to pick the 0'th view which always exists to be safe.
|
|
*
|
|
* If FDM per layer is enabled in the shader but disabled by the
|
|
* renderpass, views will be 1 and we also have to replicate the 0'th
|
|
* view to every view.
|
|
*/
|
|
VkExtent2D frag_area =
|
|
(state->share_scale || views == 1) ? frag_areas[0] : frag_areas[i];
|
|
VkRect2D bin =
|
|
(state->share_scale || views == 1) ? bins[0] : bins[i];
|
|
VkOffset2D hw_viewport_offset =
|
|
(state->share_scale || views == 1) ? hw_viewport_offsets[0] :
|
|
hw_viewport_offsets[i];
|
|
/* Implement fake_single_viewport by replicating viewport 0 across all
|
|
* views.
|
|
*/
|
|
VkViewport viewport =
|
|
state->fake_single_viewport ? state->vp.viewports[0] : state->vp.viewports[i];
|
|
if ((frag_area.width == 1 && frag_area.height == 1 &&
|
|
common_bin_offset.x == bin.offset.x &&
|
|
common_bin_offset.y == bin.offset.y) ||
|
|
/* When in a custom resolve operation (TODO: and using
|
|
* non-subsampled images) we switch to framebuffer coordinates so we
|
|
* shouldn't apply the transform. However the binning pass isn't
|
|
* aware of this, so we have to keep applying the transform for
|
|
* binning.
|
|
*/
|
|
(state->custom_resolve && !binning)) {
|
|
vp.viewports[i] = viewport;
|
|
continue;
|
|
}
|
|
|
|
float scale_x = (float) 1.0f / frag_area.width;
|
|
float scale_y = (float) 1.0f / frag_area.height;
|
|
|
|
vp.viewports[i].minDepth = viewport.minDepth;
|
|
vp.viewports[i].maxDepth = viewport.maxDepth;
|
|
vp.viewports[i].width = viewport.width * scale_x;
|
|
vp.viewports[i].height = viewport.height * scale_y;
|
|
|
|
VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin,
|
|
common_bin_offset);
|
|
offset.x -= hw_viewport_offset.x;
|
|
offset.y -= hw_viewport_offset.y;
|
|
|
|
vp.viewports[i].x = scale_x * viewport.x + offset.x;
|
|
vp.viewports[i].y = scale_y * viewport.y + offset.y;
|
|
}
|
|
|
|
TU_CALLX(cs->device, tu6_emit_viewport)(cs, &vp, &state->rs);
|
|
}
|
|
|
|
static void
|
|
tu6_emit_viewport_fdm(struct tu_cs *cs, struct tu_cmd_buffer *cmd,
|
|
const struct vk_viewport_state *vp,
|
|
const struct vk_rasterization_state *rs)
|
|
{
|
|
unsigned num_views = MAX2(cmd->state.pass->num_views, 1);
|
|
struct apply_viewport_state state = {
|
|
.vp = *vp,
|
|
.rs = *rs,
|
|
.share_scale = !cmd->state.per_view_viewport &&
|
|
!cmd->state.per_layer_viewport,
|
|
.fake_single_viewport = cmd->state.fake_single_viewport,
|
|
.custom_resolve = cmd->state.subpass->custom_resolve,
|
|
};
|
|
if (cmd->state.per_view_viewport)
|
|
state.vp.viewport_count = num_views;
|
|
else if (cmd->state.per_layer_viewport)
|
|
state.vp.viewport_count = cmd->state.max_fdm_layers;
|
|
unsigned size = TU_CALLX(cmd->device, tu6_viewport_size)(cmd->device, &state.vp, &state.rs);
|
|
tu_cs_begin_sub_stream(&cmd->sub_cs, size, cs);
|
|
tu_create_fdm_bin_patchpoint(cmd, cs, size, TU_FDM_NONE,
|
|
fdm_apply_viewports, state);
|
|
cmd->state.rp.shared_viewport |= !cmd->state.per_view_viewport &&
|
|
!cmd->state.program.per_layer_viewport;
|
|
}
|
|
|
|
static const enum mesa_vk_dynamic_graphics_state tu_scissor_state[] = {
|
|
MESA_VK_DYNAMIC_VP_SCISSORS,
|
|
MESA_VK_DYNAMIC_VP_SCISSOR_COUNT,
|
|
};
|
|
|
|
template <chip CHIP>
|
|
static unsigned
|
|
tu6_scissor_size(struct tu_device *dev, const struct vk_viewport_state *vp)
|
|
{
|
|
return 1 + vp->scissor_count * 2;
|
|
}
|
|
|
|
template <chip CHIP>
|
|
void
|
|
tu6_emit_scissor(struct tu_cs *cs, const struct vk_viewport_state *vp)
|
|
{
|
|
tu_cs_emit_pkt4(cs, GRAS_SC_SCREEN_SCISSOR_TL(CHIP, 0).reg, vp->scissor_count * 2);
|
|
|
|
for (uint32_t i = 0; i < vp->scissor_count; i++) {
|
|
const VkRect2D *scissor = &vp->scissors[i];
|
|
|
|
uint32_t min_x = scissor->offset.x;
|
|
uint32_t min_y = scissor->offset.y;
|
|
uint32_t max_x = min_x + scissor->extent.width - 1;
|
|
uint32_t max_y = min_y + scissor->extent.height - 1;
|
|
|
|
if (!scissor->extent.width || !scissor->extent.height) {
|
|
min_x = min_y = 1;
|
|
max_x = max_y = 0;
|
|
} else {
|
|
/* avoid overflow */
|
|
uint32_t scissor_max = BITFIELD_MASK(15);
|
|
min_x = MIN2(scissor_max, min_x);
|
|
min_y = MIN2(scissor_max, min_y);
|
|
max_x = MIN2(scissor_max, max_x);
|
|
max_y = MIN2(scissor_max, max_y);
|
|
}
|
|
|
|
tu_cs_emit(cs, GRAS_SC_SCREEN_SCISSOR_TL(CHIP, i, .x = min_x, .y = min_y).value);
|
|
tu_cs_emit(cs, GRAS_SC_SCREEN_SCISSOR_BR(CHIP, i, .x = max_x, .y = max_y).value);
|
|
}
|
|
}
|
|
|
|
static void
|
|
fdm_apply_scissors(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
|
|
VkOffset2D common_bin_offset,
|
|
const VkOffset2D *hw_viewport_offsets,
|
|
unsigned views,
|
|
const VkExtent2D *frag_areas, const VkRect2D *bins,
|
|
bool binning)
|
|
{
|
|
const struct apply_viewport_state *state =
|
|
(const struct apply_viewport_state *)data;
|
|
|
|
struct vk_viewport_state vp = state->vp;
|
|
|
|
for (unsigned i = 0; i < vp.scissor_count; i++) {
|
|
VkExtent2D frag_area =
|
|
(state->share_scale || views == 1) ? frag_areas[0] : frag_areas[i];
|
|
VkRect2D bin =
|
|
(state->share_scale || views == 1) ? bins[0] : bins[i];
|
|
VkRect2D scissor =
|
|
state->fake_single_viewport ? state->vp.scissors[0] : state->vp.scissors[i];
|
|
VkOffset2D hw_viewport_offset =
|
|
(state->share_scale || views == 1) ? hw_viewport_offsets[0] :
|
|
hw_viewport_offsets[i];
|
|
|
|
/* Transform the scissor following the viewport. It's unclear how this
|
|
* is supposed to handle cases where the scissor isn't aligned to the
|
|
* fragment area, but we round outwards to always render partial
|
|
* fragments if the scissor size equals the framebuffer size and it
|
|
* isn't aligned to the fragment area.
|
|
*/
|
|
VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin,
|
|
common_bin_offset);
|
|
offset.x -= hw_viewport_offset.x;
|
|
offset.y -= hw_viewport_offset.y;
|
|
|
|
/* Disable scaling and offset when doing a custom resolve to a
|
|
* non-subsampled image and not in the binning pass, because we
|
|
* use framebuffer coordinates.
|
|
*
|
|
* TODO: When we support subsampled images, only do this for
|
|
* non-subsampled images.
|
|
*/
|
|
if (state->custom_resolve && !binning) {
|
|
offset = (VkOffset2D) {};
|
|
frag_area = (VkExtent2D) {1, 1};
|
|
}
|
|
|
|
VkOffset2D min = {
|
|
scissor.offset.x / frag_area.width + offset.x,
|
|
scissor.offset.y / frag_area.width + offset.y,
|
|
};
|
|
VkOffset2D max = {
|
|
DIV_ROUND_UP(scissor.offset.x + scissor.extent.width, frag_area.width) + offset.x,
|
|
DIV_ROUND_UP(scissor.offset.y + scissor.extent.height, frag_area.height) + offset.y,
|
|
};
|
|
|
|
/* Intersect scissor with the scaled bin, this essentially replaces the
|
|
* window scissor. With custom resolve (TODO: and non-subsampled images)
|
|
* we have to use the unscaled bin instead.
|
|
*/
|
|
uint32_t scaled_width = bin.extent.width / frag_area.width;
|
|
uint32_t scaled_height = bin.extent.height / frag_area.height;
|
|
int32_t bin_x;
|
|
int32_t bin_y;
|
|
if (state->custom_resolve && !binning) {
|
|
bin_x = bin.offset.x;
|
|
bin_y = bin.offset.y;
|
|
} else {
|
|
bin_x = common_bin_offset.x - hw_viewport_offset.x;
|
|
bin_y = common_bin_offset.y - hw_viewport_offset.y;
|
|
}
|
|
vp.scissors[i].offset.x = MAX2(min.x, bin_x);
|
|
vp.scissors[i].offset.y = MAX2(min.y, bin_y);
|
|
vp.scissors[i].extent.width =
|
|
MIN2(max.x, bin_x + scaled_width) - vp.scissors[i].offset.x;
|
|
vp.scissors[i].extent.height =
|
|
MIN2(max.y, bin_y + scaled_height) - vp.scissors[i].offset.y;
|
|
}
|
|
|
|
TU_CALLX(cs->device, tu6_emit_scissor)(cs, &vp);
|
|
}
|
|
|
|
static void
|
|
tu6_emit_scissor_fdm(struct tu_cs *cs, struct tu_cmd_buffer *cmd,
|
|
const struct vk_viewport_state *vp)
|
|
{
|
|
unsigned num_views = MAX2(cmd->state.pass->num_views, 1);
|
|
struct apply_viewport_state state = {
|
|
.vp = *vp,
|
|
.share_scale = !cmd->state.per_view_viewport &&
|
|
!cmd->state.per_layer_viewport,
|
|
.fake_single_viewport = cmd->state.fake_single_viewport,
|
|
.custom_resolve = cmd->state.subpass->custom_resolve,
|
|
};
|
|
if (cmd->state.per_view_viewport)
|
|
state.vp.scissor_count = num_views;
|
|
else if (cmd->state.per_layer_viewport)
|
|
state.vp.scissor_count = cmd->state.max_fdm_layers;
|
|
unsigned size = TU_CALLX(cmd->device, tu6_scissor_size)(cmd->device, &state.vp);
|
|
tu_cs_begin_sub_stream(&cmd->sub_cs, size, cs);
|
|
tu_create_fdm_bin_patchpoint(cmd, cs, size, TU_FDM_NONE, fdm_apply_scissors,
|
|
state);
|
|
}
|
|
|
|
static const enum mesa_vk_dynamic_graphics_state tu_sample_locations_state[] = {
|
|
MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS_ENABLE,
|
|
MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS,
|
|
};
|
|
|
|
template <chip CHIP>
|
|
static unsigned
|
|
tu6_sample_locations_size(struct tu_device *dev, bool enable,
|
|
const struct vk_sample_locations_state *samp_loc)
|
|
{
|
|
return 6 + (enable ? 9 : 0);
|
|
}
|
|
|
|
template <chip CHIP>
|
|
void
|
|
tu6_emit_sample_locations(struct tu_cs *cs, bool enable,
|
|
const struct vk_sample_locations_state *samp_loc)
|
|
{
|
|
uint32_t sample_config =
|
|
COND(enable, A6XX_RB_MSAA_SAMPLE_POS_CNTL_LOCATION_ENABLE);
|
|
|
|
tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_MSAA_SAMPLE_POS_CNTL, 1);
|
|
tu_cs_emit(cs, sample_config);
|
|
|
|
tu_cs_emit_pkt4(cs, REG_A6XX_RB_MSAA_SAMPLE_POS_CNTL, 1);
|
|
tu_cs_emit(cs, sample_config);
|
|
|
|
tu_cs_emit_pkt4(cs, REG_A6XX_TPL1_MSAA_SAMPLE_POS_CNTL, 1);
|
|
tu_cs_emit(cs, sample_config);
|
|
|
|
if (!enable)
|
|
return;
|
|
|
|
assert(samp_loc->grid_size.width == 1);
|
|
assert(samp_loc->grid_size.height == 1);
|
|
|
|
uint64_t sample_locations = 0;
|
|
for (uint32_t i = 0; i < samp_loc->per_pixel; i++) {
|
|
/* From VkSampleLocationEXT:
|
|
*
|
|
* The values specified in a VkSampleLocationEXT structure are always
|
|
* clamped to the implementation-dependent sample location coordinate
|
|
* range
|
|
* [sampleLocationCoordinateRange[0],sampleLocationCoordinateRange[1]]
|
|
*/
|
|
float x = CLAMP(samp_loc->locations[i].x, SAMPLE_LOCATION_MIN,
|
|
SAMPLE_LOCATION_MAX);
|
|
float y = CLAMP(samp_loc->locations[i].y, SAMPLE_LOCATION_MIN,
|
|
SAMPLE_LOCATION_MAX);
|
|
|
|
sample_locations |=
|
|
((uint64_t)(A6XX_RB_PROGRAMMABLE_MSAA_POS_0_SAMPLE_0_X(x) |
|
|
A6XX_RB_PROGRAMMABLE_MSAA_POS_0_SAMPLE_0_Y(y))) << i*8;
|
|
}
|
|
|
|
tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_PROGRAMMABLE_MSAA_POS_0, 2);
|
|
tu_cs_emit_qw(cs, sample_locations);
|
|
|
|
tu_cs_emit_pkt4(cs, REG_A6XX_RB_PROGRAMMABLE_MSAA_POS_0, 2);
|
|
tu_cs_emit_qw(cs, sample_locations);
|
|
|
|
tu_cs_emit_pkt4(cs, REG_A6XX_TPL1_PROGRAMMABLE_MSAA_POS_0, 2);
|
|
tu_cs_emit_qw(cs, sample_locations);
|
|
}
|
|
|
|
static const enum mesa_vk_dynamic_graphics_state tu_depth_bias_state[] = {
|
|
MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS,
|
|
};
|
|
|
|
template <chip CHIP>
|
|
static unsigned
|
|
tu6_depth_bias_size(struct tu_device *dev,
|
|
const struct vk_rasterization_state *rs)
|
|
{
|
|
return 4;
|
|
}
|
|
|
|
template <chip CHIP>
|
|
void
|
|
tu6_emit_depth_bias(struct tu_cs *cs, const struct vk_rasterization_state *rs)
|
|
{
|
|
tu_cs_emit_regs(cs,
|
|
GRAS_SU_POLY_OFFSET_SCALE(CHIP, rs->depth_bias.slope_factor),
|
|
GRAS_SU_POLY_OFFSET_OFFSET(CHIP, rs->depth_bias.constant_factor),
|
|
GRAS_SU_POLY_OFFSET_OFFSET_CLAMP(CHIP, rs->depth_bias.clamp));
|
|
}
|
|
|
|
static const enum mesa_vk_dynamic_graphics_state tu_bandwidth_state[] = {
|
|
MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE,
|
|
MESA_VK_DYNAMIC_CB_LOGIC_OP,
|
|
MESA_VK_DYNAMIC_CB_ATTACHMENT_COUNT,
|
|
MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES,
|
|
MESA_VK_DYNAMIC_CB_BLEND_ENABLES,
|
|
MESA_VK_DYNAMIC_CB_WRITE_MASKS,
|
|
};
|
|
|
|
static void
|
|
tu_calc_bandwidth(struct tu_bandwidth *bandwidth,
|
|
const struct vk_color_blend_state *cb,
|
|
const struct vk_render_pass_state *rp)
|
|
{
|
|
bool rop_reads_dst = cb->logic_op_enable && tu_logic_op_reads_dst((VkLogicOp)cb->logic_op);
|
|
|
|
uint32_t total_bpp = 0;
|
|
for (unsigned i = 0; i < cb->attachment_count; i++) {
|
|
const struct vk_color_blend_attachment_state *att = &cb->attachments[i];
|
|
if (!(cb->color_write_enables & (1u << i)))
|
|
continue;
|
|
|
|
const VkFormat format = rp->color_attachment_formats[i];
|
|
|
|
uint32_t write_bpp = 0;
|
|
if (format == VK_FORMAT_UNDEFINED) {
|
|
/* do nothing */
|
|
} else if (att->write_mask == 0xf) {
|
|
write_bpp = vk_format_get_blocksizebits(format);
|
|
} else {
|
|
const enum pipe_format pipe_format = vk_format_to_pipe_format(format);
|
|
for (uint32_t i = 0; i < 4; i++) {
|
|
if (att->write_mask & (1 << i)) {
|
|
write_bpp += util_format_get_component_bits(pipe_format,
|
|
UTIL_FORMAT_COLORSPACE_RGB, i);
|
|
}
|
|
}
|
|
}
|
|
total_bpp += write_bpp;
|
|
|
|
if (rop_reads_dst || att->blend_enable) {
|
|
total_bpp += write_bpp;
|
|
}
|
|
}
|
|
|
|
bandwidth->color_bandwidth_per_sample = total_bpp / 8;
|
|
|
|
if (rp->attachments & MESA_VK_RP_ATTACHMENT_DEPTH_BIT) {
|
|
bandwidth->depth_cpp_per_sample = util_format_get_component_bits(
|
|
vk_format_to_pipe_format(rp->depth_attachment_format),
|
|
UTIL_FORMAT_COLORSPACE_ZS, 0) / 8;
|
|
}
|
|
|
|
if (rp->attachments & MESA_VK_RP_ATTACHMENT_STENCIL_BIT) {
|
|
bandwidth->stencil_cpp_per_sample = util_format_get_component_bits(
|
|
vk_format_to_pipe_format(rp->stencil_attachment_format),
|
|
UTIL_FORMAT_COLORSPACE_ZS, 1) / 8;
|
|
}
|
|
|
|
bandwidth->valid = true;
|
|
}
|
|
|
|
static const enum mesa_vk_dynamic_graphics_state tu_disable_fs_state[] = {
|
|
MESA_VK_DYNAMIC_CB_ATTACHMENT_COUNT,
|
|
MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES,
|
|
MESA_VK_DYNAMIC_CB_WRITE_MASKS,
|
|
MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE,
|
|
};
|
|
|
|
static bool
|
|
tu_calc_disable_fs(const struct vk_color_blend_state *cb,
|
|
const struct vk_render_pass_state *rp,
|
|
bool alpha_to_coverage_enable,
|
|
const struct tu_shader *fs)
|
|
{
|
|
if (alpha_to_coverage_enable)
|
|
return false;
|
|
if (fs && !fs->variant->writes_only_color)
|
|
return false;
|
|
|
|
bool has_enabled_attachments = false;
|
|
for (unsigned i = 0; i < cb->attachment_count; i++) {
|
|
if (rp->color_attachment_formats[i] == VK_FORMAT_UNDEFINED)
|
|
continue;
|
|
|
|
const struct vk_color_blend_attachment_state *att = &cb->attachments[i];
|
|
if ((cb->color_write_enables & (1u << i)) && att->write_mask != 0) {
|
|
has_enabled_attachments = true;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return !fs || fs->variant->empty ||
|
|
(fs->variant->writes_only_color && !has_enabled_attachments);
|
|
}
|
|
|
|
static void
|
|
tu_emit_disable_fs(struct tu_disable_fs *disable_fs,
|
|
const struct vk_color_blend_state *cb,
|
|
const struct vk_render_pass_state *rp,
|
|
bool alpha_to_coverage_enable,
|
|
const struct tu_shader *fs)
|
|
{
|
|
disable_fs->disable_fs =
|
|
tu_calc_disable_fs(cb, rp, alpha_to_coverage_enable, fs);
|
|
disable_fs->valid = true;
|
|
}
|
|
|
|
/* Return true if the blend state reads the color attachments. */
|
|
static tu_lrz_blend_status
|
|
tu6_calc_blend_lrz(const struct vk_color_blend_state *cb,
|
|
const struct vk_render_pass_state *rp)
|
|
{
|
|
if (cb->logic_op_enable && tu_logic_op_reads_dst((VkLogicOp)cb->logic_op))
|
|
return TU_LRZ_BLEND_READS_DEST_OR_PARTIAL_WRITE;
|
|
|
|
uint32_t written_color_attachments = 0;
|
|
uint32_t total_color_attachments = 0;
|
|
for (unsigned i = 0; i < cb->attachment_count; i++) {
|
|
if (rp->color_attachment_formats[i] == VK_FORMAT_UNDEFINED)
|
|
continue;
|
|
|
|
total_color_attachments++;
|
|
const struct vk_color_blend_attachment_state *att = &cb->attachments[i];
|
|
if ((cb->color_write_enables & (1u << i)) && att->write_mask != 0) {
|
|
written_color_attachments++;
|
|
}
|
|
}
|
|
|
|
if (total_color_attachments == 0)
|
|
return TU_LRZ_BLEND_SAFE_FOR_LRZ;
|
|
|
|
if (written_color_attachments == 0)
|
|
return TU_LRZ_BLEND_ALL_COLOR_WRITES_SKIPPED;
|
|
|
|
if (written_color_attachments < cb->attachment_count)
|
|
return TU_LRZ_BLEND_READS_DEST_OR_PARTIAL_WRITE;
|
|
|
|
for (unsigned i = 0; i < cb->attachment_count; i++) {
|
|
if (rp->color_attachment_formats[i] == VK_FORMAT_UNDEFINED)
|
|
continue;
|
|
|
|
const struct vk_color_blend_attachment_state *att = &cb->attachments[i];
|
|
if (att->blend_enable)
|
|
return TU_LRZ_BLEND_READS_DEST_OR_PARTIAL_WRITE;
|
|
if (!(cb->color_write_enables & (1u << i)))
|
|
return TU_LRZ_BLEND_READS_DEST_OR_PARTIAL_WRITE;
|
|
unsigned mask =
|
|
MASK(vk_format_get_nr_components(rp->color_attachment_formats[i]));
|
|
if ((att->write_mask & mask) != mask)
|
|
return TU_LRZ_BLEND_READS_DEST_OR_PARTIAL_WRITE;
|
|
}
|
|
|
|
return TU_LRZ_BLEND_SAFE_FOR_LRZ;
|
|
}
|
|
|
|
static const enum mesa_vk_dynamic_graphics_state tu_blend_lrz_state[] = {
|
|
MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE,
|
|
MESA_VK_DYNAMIC_CB_LOGIC_OP,
|
|
MESA_VK_DYNAMIC_CB_ATTACHMENT_COUNT,
|
|
MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES,
|
|
MESA_VK_DYNAMIC_CB_BLEND_ENABLES,
|
|
MESA_VK_DYNAMIC_CB_WRITE_MASKS,
|
|
};
|
|
|
|
static void
|
|
tu_emit_blend_lrz(struct tu_lrz_blend *lrz,
|
|
const struct vk_color_blend_state *cb,
|
|
const struct vk_render_pass_state *rp)
|
|
{
|
|
lrz->lrz_blend_status = tu6_calc_blend_lrz(cb, rp);
|
|
lrz->valid = true;
|
|
}
|
|
|
|
static const enum mesa_vk_dynamic_graphics_state tu_blend_state[] = {
|
|
MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE,
|
|
MESA_VK_DYNAMIC_CB_LOGIC_OP,
|
|
MESA_VK_DYNAMIC_CB_ATTACHMENT_COUNT,
|
|
MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES,
|
|
MESA_VK_DYNAMIC_CB_BLEND_ENABLES,
|
|
MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS,
|
|
MESA_VK_DYNAMIC_CB_WRITE_MASKS,
|
|
MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE,
|
|
MESA_VK_DYNAMIC_MS_ALPHA_TO_ONE_ENABLE,
|
|
MESA_VK_DYNAMIC_MS_SAMPLE_MASK,
|
|
MESA_VK_DYNAMIC_COLOR_ATTACHMENT_MAP,
|
|
};
|
|
|
|
template <chip CHIP>
|
|
static unsigned
|
|
tu6_blend_size(struct tu_device *dev,
|
|
const struct vk_color_blend_state *cb,
|
|
const struct vk_color_attachment_location_state *cal,
|
|
const struct vk_render_pass_state *rp,
|
|
bool alpha_to_coverage_enable,
|
|
bool alpha_to_one_enable,
|
|
uint32_t sample_mask)
|
|
{
|
|
unsigned num_rts = alpha_to_coverage_enable ?
|
|
MAX2(cb->attachment_count, 1) : cb->attachment_count;
|
|
return 8 + 3 * num_rts;
|
|
}
|
|
|
|
template <chip CHIP>
|
|
static void
|
|
tu6_emit_blend(struct tu_cs *cs,
|
|
const struct vk_color_blend_state *cb,
|
|
const struct vk_color_attachment_location_state *cal,
|
|
const struct vk_render_pass_state *rp,
|
|
bool alpha_to_coverage_enable,
|
|
bool alpha_to_one_enable,
|
|
uint32_t sample_mask)
|
|
{
|
|
bool rop_reads_dst = cb->logic_op_enable && tu_logic_op_reads_dst((VkLogicOp)cb->logic_op);
|
|
enum a3xx_rop_code rop = tu6_rop((VkLogicOp)cb->logic_op);
|
|
|
|
uint32_t blend_enable_mask = 0;
|
|
for (unsigned i = 0; i < cb->attachment_count; i++) {
|
|
if (!(cb->color_write_enables & (1u << i)) ||
|
|
cal->color_map[i] == MESA_VK_ATTACHMENT_UNUSED)
|
|
continue;
|
|
|
|
const struct vk_color_blend_attachment_state *att = &cb->attachments[i];
|
|
VkFormat att_format = rp->color_attachment_formats[i];
|
|
bool is_float_or_srgb = vk_format_is_float(att_format) || vk_format_is_srgb(att_format);
|
|
|
|
/* Logic op overrides any blending. Even when logic op is present, blending
|
|
* should be kept disabled for any ops that don't read dst values or for
|
|
* attachments of float or sRGB formats.
|
|
*/
|
|
if ((att->blend_enable && !cb->logic_op_enable) || (rop_reads_dst && !is_float_or_srgb)) {
|
|
blend_enable_mask |= 1u << cal->color_map[i];
|
|
}
|
|
}
|
|
|
|
/* This will emit a dummy RB_MRT_*_CONTROL below if alpha-to-coverage is
|
|
* enabled but there are no color attachments, in addition to changing
|
|
* *_FS_OUTPUT_CNTL1.
|
|
*/
|
|
unsigned num_rts = alpha_to_coverage_enable ?
|
|
MAX2(cb->attachment_count, 1) : cb->attachment_count;
|
|
|
|
bool dual_src_blend = tu_blend_state_is_dual_src(cb);
|
|
|
|
tu_cs_emit_regs(cs, SP_BLEND_CNTL(CHIP, .enable_blend = blend_enable_mask,
|
|
.independent_blend_en = true,
|
|
.dual_color_in_enable =
|
|
dual_src_blend,
|
|
.alpha_to_coverage =
|
|
alpha_to_coverage_enable));
|
|
/* TODO: set A6XX_RB_BLEND_CNTL_INDEPENDENT_BLEND only when enabled?
|
|
*
|
|
* We could also set blend_reads_dest more conservatively, but it didn't show
|
|
* performance wins in anholt's testing:
|
|
* https://gitlab.freedesktop.org/anholt/mesa/-/commits/tu-color-reads
|
|
*/
|
|
tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.blend_reads_dest = blend_enable_mask,
|
|
.independent_blend = true,
|
|
.dual_color_in_enable =
|
|
dual_src_blend,
|
|
.alpha_to_coverage =
|
|
alpha_to_coverage_enable,
|
|
.alpha_to_one = alpha_to_one_enable,
|
|
.sample_mask = sample_mask));
|
|
|
|
unsigned num_remapped_rts = 0;
|
|
for (unsigned i = 0; i < num_rts; i++) {
|
|
if (cal->color_map[i] == MESA_VK_ATTACHMENT_UNUSED)
|
|
continue;
|
|
unsigned remapped_idx = cal->color_map[i];
|
|
num_remapped_rts = MAX2(num_remapped_rts, remapped_idx + 1);
|
|
const struct vk_color_blend_attachment_state *att = &cb->attachments[i];
|
|
if ((cb->color_write_enables & (1u << i)) && i < cb->attachment_count) {
|
|
const enum a3xx_rb_blend_opcode color_op = tu6_blend_op(att->color_blend_op);
|
|
const enum adreno_rb_blend_factor src_color_factor =
|
|
tu6_blend_factor((VkBlendFactor)att->src_color_blend_factor);
|
|
const enum adreno_rb_blend_factor dst_color_factor =
|
|
tu6_blend_factor((VkBlendFactor)att->dst_color_blend_factor);
|
|
const enum a3xx_rb_blend_opcode alpha_op =
|
|
tu6_blend_op(att->alpha_blend_op);
|
|
const enum adreno_rb_blend_factor src_alpha_factor =
|
|
tu6_blend_factor((VkBlendFactor)att->src_alpha_blend_factor);
|
|
const enum adreno_rb_blend_factor dst_alpha_factor =
|
|
tu6_blend_factor((VkBlendFactor)att->dst_alpha_blend_factor);
|
|
VkFormat att_format = rp->color_attachment_formats[i];
|
|
bool is_float_or_srgb = vk_format_is_float(att_format) || vk_format_is_srgb(att_format);
|
|
|
|
/* Keep blend and logic op flags tidy. These conditions match the blend-enable
|
|
* mask construction above, except for the dst-reading rop condition that doesn't
|
|
* apply here.
|
|
*/
|
|
bool blend_enable = att->blend_enable && !cb->logic_op_enable;
|
|
bool logic_op_enable = cb->logic_op_enable && !is_float_or_srgb;
|
|
|
|
tu_cs_emit_regs(cs,
|
|
A6XX_RB_MRT_CONTROL(remapped_idx,
|
|
.color_blend_en = blend_enable,
|
|
.alpha_blend_en = blend_enable,
|
|
.rop_enable = logic_op_enable,
|
|
.rop_code = rop,
|
|
.component_enable = att->write_mask),
|
|
A6XX_RB_MRT_BLEND_CONTROL(remapped_idx,
|
|
.rgb_src_factor = src_color_factor,
|
|
.rgb_blend_opcode = color_op,
|
|
.rgb_dest_factor = dst_color_factor,
|
|
.alpha_src_factor = src_alpha_factor,
|
|
.alpha_blend_opcode = alpha_op,
|
|
.alpha_dest_factor = dst_alpha_factor));
|
|
} else {
|
|
tu_cs_emit_regs(cs,
|
|
A6XX_RB_MRT_CONTROL(remapped_idx,),
|
|
A6XX_RB_MRT_BLEND_CONTROL(remapped_idx,));
|
|
}
|
|
}
|
|
tu_cs_emit_regs(cs, A6XX_SP_PS_MRT_CNTL(.mrt = num_remapped_rts));
|
|
tu_cs_emit_regs(cs, A6XX_RB_PS_MRT_CNTL(.mrt = num_remapped_rts));
|
|
}
|
|
|
|
static const enum mesa_vk_dynamic_graphics_state tu_blend_constants_state[] = {
|
|
MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS,
|
|
};
|
|
|
|
template <chip CHIP>
|
|
static unsigned
|
|
tu6_blend_constants_size(struct tu_device *dev,
|
|
const struct vk_color_blend_state *cb)
|
|
{
|
|
return 5;
|
|
}
|
|
|
|
template <chip CHIP>
|
|
static void
|
|
tu6_emit_blend_constants(struct tu_cs *cs, const struct vk_color_blend_state *cb)
|
|
{
|
|
tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLEND_CONSTANT_RED_FP32, 4);
|
|
tu_cs_emit_array(cs, (const uint32_t *) cb->blend_constants, 4);
|
|
}
|
|
|
|
static const enum mesa_vk_dynamic_graphics_state tu_rast_state[] = {
|
|
MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE,
|
|
MESA_VK_DYNAMIC_RS_DEPTH_CLIP_ENABLE,
|
|
MESA_VK_DYNAMIC_RS_POLYGON_MODE,
|
|
MESA_VK_DYNAMIC_RS_CULL_MODE,
|
|
MESA_VK_DYNAMIC_RS_FRONT_FACE,
|
|
MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE,
|
|
MESA_VK_DYNAMIC_RS_LINE_MODE,
|
|
MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE,
|
|
MESA_VK_DYNAMIC_RS_RASTERIZATION_STREAM,
|
|
MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE,
|
|
MESA_VK_DYNAMIC_RS_LINE_WIDTH,
|
|
MESA_VK_DYNAMIC_RS_CONSERVATIVE_MODE,
|
|
MESA_VK_DYNAMIC_RS_EXTRA_PRIMITIVE_OVERESTIMATION_SIZE,
|
|
};
|
|
|
|
template <chip CHIP>
|
|
uint32_t
|
|
tu6_rast_size(struct tu_device *dev,
|
|
const struct vk_rasterization_state *rs,
|
|
const struct vk_viewport_state *vp,
|
|
bool multiview,
|
|
bool per_view_viewport,
|
|
bool disable_fs)
|
|
{
|
|
if (CHIP == A6XX && dev->physical_device->info->props.is_a702) {
|
|
return 17;
|
|
} else if (CHIP == A6XX) {
|
|
return 15 + (dev->physical_device->info->props.has_legacy_pipeline_shading_rate ? 8 : 0);
|
|
} else {
|
|
return 27;
|
|
}
|
|
}
|
|
|
|
template <chip CHIP>
|
|
void
|
|
tu6_emit_rast(struct tu_cs *cs,
|
|
const struct vk_rasterization_state *rs,
|
|
const struct vk_viewport_state *vp,
|
|
bool multiview,
|
|
bool per_view_viewport,
|
|
bool disable_fs)
|
|
{
|
|
enum a5xx_line_mode line_mode =
|
|
rs->line.mode == VK_LINE_RASTERIZATION_MODE_BRESENHAM_KHR ?
|
|
BRESENHAM : RECTANGULAR;
|
|
tu_cs_emit_regs(cs,
|
|
GRAS_SU_CNTL(CHIP,
|
|
.cull_front = rs->cull_mode & VK_CULL_MODE_FRONT_BIT,
|
|
.cull_back = rs->cull_mode & VK_CULL_MODE_BACK_BIT,
|
|
.front_cw = rs->front_face == VK_FRONT_FACE_CLOCKWISE,
|
|
.linehalfwidth = rs->line.width / 2.0f,
|
|
.poly_offset = rs->depth_bias.enable,
|
|
.line_mode = line_mode,
|
|
.multiview_enable = multiview,
|
|
.rendertargetindexincr = multiview,
|
|
.viewportindexincr = multiview && per_view_viewport));
|
|
|
|
bool depth_clip_enable = vk_rasterization_state_depth_clip_enable(rs);
|
|
|
|
tu_cs_emit_regs(cs,
|
|
GRAS_CL_CNTL(CHIP,
|
|
.znear_clip_disable = !depth_clip_enable,
|
|
.zfar_clip_disable = !depth_clip_enable,
|
|
/* To support VK_EXT_depth_clamp_zero_one on a7xx+ */
|
|
.z_clamp_enable = rs->depth_clamp_enable || CHIP >= A7XX,
|
|
.zero_gb_scale_z = vp->depth_clip_negative_one_to_one ? 0 : 1,
|
|
.vp_clip_code_ignore = 1));;
|
|
|
|
enum a6xx_polygon_mode polygon_mode = tu6_polygon_mode(rs->polygon_mode);
|
|
|
|
tu_cs_emit_regs(cs, VPC_RAST_CNTL(CHIP, polygon_mode));
|
|
|
|
tu_cs_emit_regs(cs,
|
|
PC_DGEN_RAST_CNTL(CHIP, polygon_mode));
|
|
|
|
if (CHIP == A7XX || cs->device->physical_device->info->props.is_a702) {
|
|
tu_cs_emit_regs(cs, VPC_PS_RAST_CNTL(CHIP, polygon_mode));
|
|
}
|
|
|
|
tu_cs_emit_regs(cs, VPC_RAST_STREAM_CNTL(CHIP,
|
|
.stream = rs->rasterization_stream,
|
|
.discard = rs->rasterizer_discard_enable));
|
|
if (CHIP == A6XX) {
|
|
tu_cs_emit_regs(cs, VPC_UNKNOWN_9107(CHIP,
|
|
.raster_discard = rs->rasterizer_discard_enable));
|
|
} else {
|
|
tu_cs_emit_regs(cs, VPC_RAST_STREAM_CNTL_V2(CHIP,
|
|
.stream = rs->rasterization_stream,
|
|
.discard = rs->rasterizer_discard_enable));
|
|
|
|
bool conservative_ras_en =
|
|
rs->conservative_mode ==
|
|
VK_CONSERVATIVE_RASTERIZATION_MODE_OVERESTIMATE_EXT;
|
|
/* This is important to get D/S only draw calls to bypass invoking
|
|
* the fragment shader. The public documentation for Adreno states:
|
|
* "Hint the driver to engage Fast-Z by using an empty fragment
|
|
* shader and disabling frame buffer write masks for renderpasses
|
|
* that modify Z values only."
|
|
* "The GPU has a special mode that writes Z-only pixels at twice
|
|
* the normal rate."
|
|
*/
|
|
tu_cs_emit_regs(cs, RB_RENDER_CNTL(CHIP,
|
|
.fs_disable = disable_fs,
|
|
.raster_mode = TYPE_TILED,
|
|
.raster_direction = LR_TB,
|
|
.conservativerasen = conservative_ras_en));
|
|
|
|
if (CHIP >= A7XX) {
|
|
tu_cs_emit_regs(cs, GRAS_SU_RENDER_CNTL(CHIP, .fs_disable = disable_fs));
|
|
tu_cs_emit_regs(cs, SP_RENDER_CNTL(CHIP, .fs_disable = disable_fs));
|
|
}
|
|
|
|
tu_cs_emit_regs(
|
|
cs, PC_DGEN_SU_CONSERVATIVE_RAS_CNTL(CHIP, conservative_ras_en));
|
|
|
|
/* There are only two conservative rasterization modes:
|
|
* - shift_amount = 0 (NO_SHIFT) - normal rasterization
|
|
* - shift_amount = 1 (HALF_PIXEL_SHIFT) - overestimate by half a pixel
|
|
* plus the rasterization grid size (1/256)
|
|
* - shift_amount = 2 (FULL_PIXEL_SHIFT) - overestimate by another half
|
|
* a pixel
|
|
*
|
|
* We expose a max of 0.5 and a granularity of 0.5, so the app should
|
|
* only give us 0 or 0.5 which correspond to HALF_PIXEL_SHIFT and
|
|
* FULL_PIXEL_SHIFT respectively. If they give us anything else just
|
|
* assume they meant 0.5 as the most conservative choice.
|
|
*/
|
|
enum a6xx_shift_amount shift_amount = conservative_ras_en ?
|
|
(rs->extra_primitive_overestimation_size != 0. ?
|
|
FULL_PIXEL_SHIFT : HALF_PIXEL_SHIFT) : NO_SHIFT;
|
|
tu_cs_emit_regs(cs, GRAS_SU_CONSERVATIVE_RAS_CNTL(CHIP,
|
|
.conservativerasen = conservative_ras_en,
|
|
.shiftamount = shift_amount));
|
|
}
|
|
|
|
/* move to hw ctx init? */
|
|
tu_cs_emit_regs(cs,
|
|
GRAS_SU_POINT_MINMAX(CHIP, .min = 1.0f / 16.0f, .max = 4092.0f),
|
|
GRAS_SU_POINT_SIZE(CHIP, 1.0f));
|
|
|
|
if (CHIP == A6XX && cs->device->physical_device->info->props.has_legacy_pipeline_shading_rate) {
|
|
tu_cs_emit_regs(cs, RB_UNKNOWN_8A00(CHIP));
|
|
tu_cs_emit_regs(cs, RB_UNKNOWN_8A10(CHIP));
|
|
tu_cs_emit_regs(cs, RB_UNKNOWN_8A20(CHIP));
|
|
tu_cs_emit_regs(cs, RB_UNKNOWN_8A30(CHIP));
|
|
}
|
|
}
|
|
|
|
static const enum mesa_vk_dynamic_graphics_state tu_ds_state[] = {
|
|
MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE,
|
|
MESA_VK_DYNAMIC_DS_STENCIL_OP,
|
|
MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK,
|
|
MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK,
|
|
MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE,
|
|
MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_BOUNDS,
|
|
};
|
|
|
|
template <chip CHIP>
|
|
static unsigned
|
|
tu6_ds_size(struct tu_device *dev,
|
|
const struct vk_depth_stencil_state *ds,
|
|
const struct vk_render_pass_state *rp)
|
|
{
|
|
return 13;
|
|
}
|
|
|
|
template <chip CHIP>
|
|
static void
|
|
tu6_emit_ds(struct tu_cs *cs,
|
|
const struct vk_depth_stencil_state *ds,
|
|
const struct vk_render_pass_state *rp)
|
|
{
|
|
bool stencil_test_enable =
|
|
ds->stencil.test_enable && rp->attachments & MESA_VK_RP_ATTACHMENT_STENCIL_BIT;
|
|
|
|
/* While the .stencil_read field can be used to avoid having to read stencil
|
|
* when the func/ops cause it to be unused, there was no change in perf on
|
|
* the 1/42 games tested that was affected (Transport Fever, 0.0 +/- 0.0%
|
|
* change). Besides, in some cases where we could clear stencil_read here,
|
|
* the packed z/s is going to be read anyway due to depth testing, though
|
|
* that doesn't apply to this game.
|
|
*
|
|
* Given that the condition for avoiding stencil_read is fairly complicated,
|
|
* we won't bother with the CPU overhead until we can see some win from it.
|
|
*
|
|
* https://gitlab.freedesktop.org/anholt/mesa/-/commits/tu-s-reads
|
|
*/
|
|
|
|
tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CNTL(
|
|
.stencil_enable = stencil_test_enable,
|
|
.stencil_enable_bf = stencil_test_enable,
|
|
.stencil_read = stencil_test_enable,
|
|
.func = tu6_compare_func((VkCompareOp)ds->stencil.front.op.compare),
|
|
.fail = tu6_stencil_op((VkStencilOp)ds->stencil.front.op.fail),
|
|
.zpass = tu6_stencil_op((VkStencilOp)ds->stencil.front.op.pass),
|
|
.zfail = tu6_stencil_op((VkStencilOp)ds->stencil.front.op.depth_fail),
|
|
.func_bf = tu6_compare_func((VkCompareOp)ds->stencil.back.op.compare),
|
|
.fail_bf = tu6_stencil_op((VkStencilOp)ds->stencil.back.op.fail),
|
|
.zpass_bf = tu6_stencil_op((VkStencilOp)ds->stencil.back.op.pass),
|
|
.zfail_bf = tu6_stencil_op((VkStencilOp)ds->stencil.back.op.depth_fail)));
|
|
tu_cs_emit_regs(cs, GRAS_SU_STENCIL_CNTL(CHIP, stencil_test_enable));
|
|
|
|
tu_cs_emit_regs(cs, A6XX_RB_STENCIL_MASK(
|
|
.mask = ds->stencil.front.compare_mask,
|
|
.bfmask = ds->stencil.back.compare_mask));
|
|
|
|
tu_cs_emit_regs(cs, A6XX_RB_STENCIL_WRITE_MASK(
|
|
.wrmask = ds->stencil.front.write_mask,
|
|
.bfwrmask = ds->stencil.back.write_mask));
|
|
|
|
tu_cs_emit_regs(cs, A6XX_RB_STENCIL_REF_CNTL(
|
|
.ref = ds->stencil.front.reference,
|
|
.bfref = ds->stencil.back.reference));
|
|
|
|
tu_cs_emit_regs(cs,
|
|
A6XX_RB_DEPTH_BOUND_MIN(ds->depth.bounds_test.min),
|
|
A6XX_RB_DEPTH_BOUND_MAX(ds->depth.bounds_test.max));
|
|
}
|
|
|
|
static const enum mesa_vk_dynamic_graphics_state tu_rb_depth_cntl_state[] = {
|
|
MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE,
|
|
MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE,
|
|
MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP,
|
|
MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE,
|
|
MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE,
|
|
};
|
|
|
|
template <chip CHIP>
|
|
static unsigned
|
|
tu6_rb_depth_cntl_size(struct tu_device *dev,
|
|
const struct vk_depth_stencil_state *ds,
|
|
const struct vk_render_pass_state *rp,
|
|
const struct vk_rasterization_state *rs)
|
|
{
|
|
return 4;
|
|
}
|
|
|
|
template <chip CHIP>
|
|
static void
|
|
tu6_emit_rb_depth_cntl(struct tu_cs *cs,
|
|
const struct vk_depth_stencil_state *ds,
|
|
const struct vk_render_pass_state *rp,
|
|
const struct vk_rasterization_state *rs)
|
|
{
|
|
if (rp->attachments & MESA_VK_RP_ATTACHMENT_DEPTH_BIT) {
|
|
bool depth_test = ds->depth.test_enable;
|
|
enum adreno_compare_func zfunc = tu6_compare_func(ds->depth.compare_op);
|
|
|
|
/* On some GPUs it is necessary to enable z test for depth bounds test
|
|
* when UBWC is enabled. Otherwise, the GPU would hang. FUNC_ALWAYS is
|
|
* required to pass z test. Relevant tests:
|
|
* dEQP-VK.pipeline.extended_dynamic_state.two_draws_dynamic.depth_bounds_test_disable
|
|
* dEQP-VK.dynamic_state.ds_state.depth_bounds_1
|
|
*/
|
|
if (ds->depth.bounds_test.enable &&
|
|
!ds->depth.test_enable &&
|
|
cs->device->physical_device->info->props.depth_bounds_require_depth_test_quirk) {
|
|
depth_test = true;
|
|
zfunc = FUNC_ALWAYS;
|
|
}
|
|
|
|
tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
|
|
.z_test_enable = depth_test,
|
|
.z_write_enable = ds->depth.test_enable && ds->depth.write_enable,
|
|
.zfunc = zfunc,
|
|
/* To support VK_EXT_depth_clamp_zero_one on a7xx+ */
|
|
.z_clamp_enable = rs->depth_clamp_enable || CHIP >= A7XX,
|
|
.z_read_enable =
|
|
(ds->depth.test_enable && (zfunc != FUNC_NEVER && zfunc != FUNC_ALWAYS)) ||
|
|
ds->depth.bounds_test.enable,
|
|
.z_bounds_enable = ds->depth.bounds_test.enable));
|
|
tu_cs_emit_regs(cs, GRAS_SU_DEPTH_CNTL(CHIP, depth_test));
|
|
} else {
|
|
tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
|
|
tu_cs_emit_regs(cs, GRAS_SU_DEPTH_CNTL(CHIP));
|
|
}
|
|
}
|
|
|
|
static const enum mesa_vk_dynamic_graphics_state tu_prim_mode_sysmem_state[] = {
|
|
MESA_VK_DYNAMIC_ATTACHMENT_FEEDBACK_LOOP_ENABLE,
|
|
};
|
|
|
|
template <chip CHIP>
|
|
static unsigned
|
|
tu6_prim_mode_sysmem_size(struct tu_device *dev,
|
|
struct tu_shader *fs,
|
|
bool raster_order_attachment_access,
|
|
VkImageAspectFlags feedback_loops,
|
|
bool *sysmem_single_prim_mode)
|
|
{
|
|
return 2;
|
|
}
|
|
|
|
template <chip CHIP>
|
|
static void
|
|
tu6_emit_prim_mode_sysmem(struct tu_cs *cs,
|
|
struct tu_shader *fs,
|
|
bool raster_order_attachment_access,
|
|
VkImageAspectFlags feedback_loops,
|
|
bool *sysmem_single_prim_mode)
|
|
{
|
|
/* VK_EXT_rasterization_order_attachment_access:
|
|
*
|
|
* This extension allow access to framebuffer attachments when used as both
|
|
* input and color attachments from one fragment to the next, in
|
|
* rasterization order, without explicit synchronization.
|
|
*/
|
|
raster_order_attachment_access |= TU_DEBUG(RAST_ORDER);
|
|
|
|
/* If there is a feedback loop, then the shader can read the previous value
|
|
* of a pixel being written out. It can also write some components and then
|
|
* read different components without a barrier in between. This is a
|
|
* problem in sysmem mode with UBWC, because the main buffer and flags
|
|
* buffer can get out-of-sync if only one is flushed. We fix this by
|
|
* setting the SINGLE_PRIM_MODE field to the same value that the blob does
|
|
* for advanced_blend in sysmem mode if a feedback loop is detected.
|
|
*/
|
|
enum a6xx_single_prim_mode sysmem_prim_mode =
|
|
(raster_order_attachment_access || feedback_loops ||
|
|
fs->fs.dynamic_input_attachments_used) ?
|
|
FLUSH_PER_OVERLAP_AND_OVERWRITE : NO_FLUSH;
|
|
|
|
if (sysmem_prim_mode == FLUSH_PER_OVERLAP_AND_OVERWRITE)
|
|
*sysmem_single_prim_mode = true;
|
|
|
|
tu_cs_emit_regs(cs, GRAS_SC_CNTL(CHIP,
|
|
.single_prim_mode = sysmem_prim_mode,
|
|
.ccusinglecachelinesize = 2,
|
|
));
|
|
}
|
|
|
|
static const enum mesa_vk_dynamic_graphics_state tu_fragment_shading_rate_state[] = {
|
|
MESA_VK_DYNAMIC_FSR,
|
|
};
|
|
|
|
template <chip CHIP>
|
|
static unsigned
|
|
tu6_fragment_shading_rate_size(struct tu_device *dev,
|
|
const vk_fragment_shading_rate_state *fsr,
|
|
bool enable_att_fsr,
|
|
bool enable_prim_fsr,
|
|
bool fs_reads_fsr)
|
|
{
|
|
return 6;
|
|
}
|
|
|
|
template <chip CHIP>
|
|
static void
|
|
tu6_emit_fragment_shading_rate(struct tu_cs *cs,
|
|
const vk_fragment_shading_rate_state *fsr,
|
|
bool enable_att_fsr,
|
|
bool enable_prim_fsr,
|
|
bool fs_reads_fsr)
|
|
{
|
|
/* gl_ShadingRateEXT don't read 1x1 value with null config, so
|
|
* if it is read - we have to emit the config.
|
|
*/
|
|
if (!fsr || (!fs_reads_fsr && vk_fragment_shading_rate_is_disabled(fsr))) {
|
|
tu_cs_emit_regs(cs, A6XX_RB_VRS_CONFIG());
|
|
tu_cs_emit_regs(cs, SP_VRS_CONFIG(CHIP));
|
|
tu_cs_emit_regs(cs, GRAS_VRS_CONFIG(CHIP));
|
|
return;
|
|
}
|
|
|
|
uint32_t frag_width = fsr->fragment_size.width;
|
|
uint32_t frag_height = fsr->fragment_size.height;
|
|
|
|
bool enable_draw_fsr = true;
|
|
if (enable_att_fsr) {
|
|
if (fsr->combiner_ops[1] ==
|
|
VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR) {
|
|
enable_draw_fsr = false;
|
|
enable_prim_fsr = false;
|
|
} else if (fsr->combiner_ops[1] ==
|
|
VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR) {
|
|
enable_att_fsr = false;
|
|
}
|
|
}
|
|
if (enable_prim_fsr) {
|
|
if (fsr->combiner_ops[0] ==
|
|
VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR) {
|
|
enable_draw_fsr = false;
|
|
} else if (fsr->combiner_ops[0] ==
|
|
VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR) {
|
|
enable_prim_fsr = false;
|
|
}
|
|
}
|
|
|
|
tu_cs_emit_regs(
|
|
cs,
|
|
A6XX_RB_VRS_CONFIG(.unk2 = true, .pipeline_fsr_enable = enable_draw_fsr,
|
|
.attachment_fsr_enable = enable_att_fsr,
|
|
.primitive_fsr_enable = enable_prim_fsr));
|
|
tu_cs_emit_regs(cs,
|
|
SP_VRS_CONFIG(CHIP, .pipeline_fsr_enable = enable_draw_fsr,
|
|
.attachment_fsr_enable = enable_att_fsr,
|
|
.primitive_fsr_enable = enable_prim_fsr));
|
|
tu_cs_emit_regs(
|
|
cs, GRAS_VRS_CONFIG(CHIP,
|
|
.pipeline_fsr_enable = enable_draw_fsr,
|
|
.frag_size_x = util_logbase2(frag_width),
|
|
.frag_size_y = util_logbase2(frag_height),
|
|
.combiner_op_1 = (a6xx_fsr_combiner) fsr->combiner_ops[0],
|
|
.combiner_op_2 = (a6xx_fsr_combiner) fsr->combiner_ops[1],
|
|
.attachment_fsr_enable = enable_att_fsr,
|
|
.primitive_fsr_enable = enable_prim_fsr));
|
|
}
|
|
|
|
|
|
static inline bool
|
|
emit_pipeline_state(BITSET_WORD *keep, BITSET_WORD *remove,
|
|
BITSET_WORD *pipeline_set,
|
|
const enum mesa_vk_dynamic_graphics_state *state_array,
|
|
unsigned num_states, bool extra_cond,
|
|
struct tu_pipeline_builder *builder)
|
|
{
|
|
BITSET_DECLARE(state, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX) = {};
|
|
|
|
/* Unrolling this loop should produce a constant value once the function is
|
|
* inlined, because state_array and num_states are a per-draw-state
|
|
* constant, but GCC seems to need a little encouragement. clang does a
|
|
* little better but still needs a pragma when there are a large number of
|
|
* states.
|
|
*/
|
|
#if defined(__clang__)
|
|
#pragma clang loop unroll(full)
|
|
#elif defined(__GNUC__) && __GNUC__ >= 8
|
|
#pragma GCC unroll MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX
|
|
#endif
|
|
for (unsigned i = 0; i < num_states; i++) {
|
|
BITSET_SET(state, state_array[i]);
|
|
}
|
|
|
|
/* If all of the state is set, then after we emit it we can tentatively
|
|
* remove it from the states to set for the pipeline by making it dynamic.
|
|
* If we can't emit it, though, we need to keep around the partial state so
|
|
* that we can emit it later, even if another draw state consumes it. That
|
|
* is, we have to cancel any tentative removal.
|
|
*/
|
|
BITSET_DECLARE(temp, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX);
|
|
memcpy(temp, pipeline_set, sizeof(temp));
|
|
BITSET_AND(temp, temp, state);
|
|
if (!BITSET_EQUAL(temp, state) || !extra_cond) {
|
|
__bitset_or(keep, keep, temp, ARRAY_SIZE(temp));
|
|
return false;
|
|
}
|
|
__bitset_or(remove, remove, state, ARRAY_SIZE(state));
|
|
return true;
|
|
}
|
|
|
|
template <chip CHIP>
|
|
static void
|
|
tu_pipeline_builder_emit_state(struct tu_pipeline_builder *builder,
|
|
struct tu_pipeline *pipeline)
|
|
{
|
|
struct tu_cs cs;
|
|
BITSET_DECLARE(keep, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX) = {};
|
|
BITSET_DECLARE(remove, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX) = {};
|
|
BITSET_DECLARE(pipeline_set, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX) = {};
|
|
|
|
vk_graphics_pipeline_get_state(&builder->graphics_state, pipeline_set);
|
|
|
|
#define EMIT_STATE(name, extra_cond) \
|
|
emit_pipeline_state(keep, remove, pipeline_set, tu_##name##_state, \
|
|
ARRAY_SIZE(tu_##name##_state), extra_cond, builder)
|
|
|
|
#define DRAW_STATE_COND(name, id, extra_cond, ...) \
|
|
if (EMIT_STATE(name, extra_cond)) { \
|
|
unsigned size = tu6_##name##_size<CHIP>(builder->device, __VA_ARGS__); \
|
|
if (size > 0) { \
|
|
tu_cs_begin_sub_stream(&pipeline->cs, size, &cs); \
|
|
tu6_emit_##name<CHIP>(&cs, __VA_ARGS__); \
|
|
pipeline->dynamic_state[id] = \
|
|
tu_cs_end_draw_state(&pipeline->cs, &cs); \
|
|
} \
|
|
pipeline->set_state_mask |= (1u << id); \
|
|
}
|
|
#define DRAW_STATE(name, id, ...) DRAW_STATE_COND(name, id, true, __VA_ARGS__)
|
|
|
|
DRAW_STATE(vertex_input, TU_DYNAMIC_STATE_VERTEX_INPUT,
|
|
builder->graphics_state.vi);
|
|
DRAW_STATE(vertex_stride, TU_DYNAMIC_STATE_VB_STRIDE,
|
|
builder->graphics_state.vi);
|
|
/* If (a) per-view viewport is used or (b) we don't know yet, then we need
|
|
* to set viewport and stencil state dynamically.
|
|
*/
|
|
bool no_per_view_viewport = pipeline_contains_all_shader_state(pipeline) &&
|
|
!pipeline->program.per_view_viewport &&
|
|
!pipeline->program.per_layer_viewport;
|
|
DRAW_STATE_COND(viewport, TU_DYNAMIC_STATE_VIEWPORT, no_per_view_viewport,
|
|
builder->graphics_state.vp,
|
|
builder->graphics_state.rs);
|
|
DRAW_STATE_COND(scissor, TU_DYNAMIC_STATE_SCISSOR, no_per_view_viewport,
|
|
builder->graphics_state.vp);
|
|
DRAW_STATE(sample_locations,
|
|
TU_DYNAMIC_STATE_SAMPLE_LOCATIONS,
|
|
builder->graphics_state.ms->sample_locations_enable,
|
|
builder->graphics_state.ms->sample_locations);
|
|
DRAW_STATE(depth_bias, TU_DYNAMIC_STATE_DEPTH_BIAS,
|
|
builder->graphics_state.rs);
|
|
bool attachments_valid =
|
|
builder->graphics_state.rp &&
|
|
vk_render_pass_state_has_attachment_info(builder->graphics_state.rp);
|
|
struct vk_color_blend_state dummy_cb = {};
|
|
const struct vk_color_blend_state *cb = builder->graphics_state.cb;
|
|
if (attachments_valid &&
|
|
!(builder->graphics_state.rp->attachments &
|
|
MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS)) {
|
|
/* If there are no color attachments, then the original blend state may
|
|
* be NULL and the common code sanitizes it to always be NULL. In this
|
|
* case we want to emit an empty blend/bandwidth/etc. rather than
|
|
* letting it be dynamic (and potentially garbage).
|
|
*/
|
|
cb = &dummy_cb;
|
|
BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE);
|
|
BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_LOGIC_OP);
|
|
BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_ATTACHMENT_COUNT);
|
|
BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES);
|
|
BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_BLEND_ENABLES);
|
|
BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS);
|
|
BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_WRITE_MASKS);
|
|
BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS);
|
|
}
|
|
DRAW_STATE_COND(blend, TU_DYNAMIC_STATE_BLEND, attachments_valid, cb,
|
|
builder->graphics_state.cal,
|
|
builder->graphics_state.rp,
|
|
builder->graphics_state.ms->alpha_to_coverage_enable,
|
|
builder->graphics_state.ms->alpha_to_one_enable,
|
|
builder->graphics_state.ms->sample_mask);
|
|
if (EMIT_STATE(blend_lrz, attachments_valid))
|
|
tu_emit_blend_lrz(&pipeline->lrz_blend, cb,
|
|
builder->graphics_state.rp);
|
|
if (EMIT_STATE(bandwidth, attachments_valid))
|
|
tu_calc_bandwidth(&pipeline->bandwidth, cb,
|
|
builder->graphics_state.rp);
|
|
if (EMIT_STATE(
|
|
disable_fs,
|
|
attachments_valid && pipeline_contains_all_shader_state(pipeline)))
|
|
tu_emit_disable_fs(&pipeline->disable_fs, cb,
|
|
builder->graphics_state.rp,
|
|
builder->graphics_state.ms->alpha_to_coverage_enable,
|
|
pipeline->shaders[MESA_SHADER_FRAGMENT]);
|
|
DRAW_STATE(blend_constants, TU_DYNAMIC_STATE_BLEND_CONSTANTS, cb);
|
|
|
|
if (attachments_valid &&
|
|
!(builder->graphics_state.rp->attachments &
|
|
MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS)) {
|
|
/* Don't actually make anything dynamic as that may mean a partially-set
|
|
* state group where the group is NULL which angers common code.
|
|
*/
|
|
BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE);
|
|
BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_LOGIC_OP);
|
|
BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_ATTACHMENT_COUNT);
|
|
BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES);
|
|
BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_BLEND_ENABLES);
|
|
BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS);
|
|
BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_WRITE_MASKS);
|
|
BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS);
|
|
}
|
|
DRAW_STATE_COND(rast, TU_DYNAMIC_STATE_RAST,
|
|
pipeline_contains_all_shader_state(pipeline) &&
|
|
pipeline->disable_fs.valid,
|
|
builder->graphics_state.rs, builder->graphics_state.vp,
|
|
builder->graphics_state.rp->view_mask != 0,
|
|
pipeline->program.per_view_viewport,
|
|
pipeline->disable_fs.disable_fs);
|
|
DRAW_STATE_COND(ds, TU_DYNAMIC_STATE_DS,
|
|
attachments_valid,
|
|
builder->graphics_state.ds,
|
|
builder->graphics_state.rp);
|
|
DRAW_STATE_COND(rb_depth_cntl, TU_DYNAMIC_STATE_RB_DEPTH_CNTL,
|
|
attachments_valid,
|
|
builder->graphics_state.ds,
|
|
builder->graphics_state.rp,
|
|
builder->graphics_state.rs);
|
|
DRAW_STATE_COND(patch_control_points,
|
|
TU_DYNAMIC_STATE_PATCH_CONTROL_POINTS,
|
|
pipeline_contains_all_shader_state(pipeline),
|
|
pipeline->shaders[MESA_SHADER_VERTEX],
|
|
pipeline->shaders[MESA_SHADER_TESS_CTRL],
|
|
pipeline->shaders[MESA_SHADER_TESS_EVAL],
|
|
&pipeline->program,
|
|
builder->graphics_state.ts->patch_control_points);
|
|
bool has_raster_order_state = false;
|
|
if (pipeline->type == TU_PIPELINE_GRAPHICS) {
|
|
has_raster_order_state = true;
|
|
} else {
|
|
struct tu_graphics_lib_pipeline *lib =
|
|
tu_pipeline_to_graphics_lib(pipeline);
|
|
has_raster_order_state =
|
|
(lib->state & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) &&
|
|
(lib->state &
|
|
VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT);
|
|
}
|
|
if (!builder->device->physical_device->info->props.has_coherent_ubwc_flag_caches) {
|
|
DRAW_STATE_COND(prim_mode_sysmem,
|
|
TU_DYNAMIC_STATE_PRIM_MODE_SYSMEM,
|
|
has_raster_order_state,
|
|
pipeline->shaders[MESA_SHADER_FRAGMENT],
|
|
pipeline->output.raster_order_attachment_access ||
|
|
pipeline->ds.raster_order_attachment_access,
|
|
vk_pipeline_flags_feedback_loops(builder->graphics_state.pipeline_flags),
|
|
&pipeline->prim_order.sysmem_single_prim_mode);
|
|
}
|
|
|
|
if (builder->device->physical_device->info->props.has_attachment_shading_rate) {
|
|
bool has_fsr_att =
|
|
builder->graphics_state.pipeline_flags &
|
|
VK_PIPELINE_CREATE_2_RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR;
|
|
DRAW_STATE_COND(fragment_shading_rate,
|
|
TU_DYNAMIC_STATE_A7XX_FRAGMENT_SHADING_RATE,
|
|
attachments_valid && pipeline_contains_all_shader_state(pipeline),
|
|
builder->graphics_state.fsr,
|
|
has_fsr_att,
|
|
pipeline->program.writes_shading_rate,
|
|
pipeline->program.reads_shading_rate);
|
|
}
|
|
#undef DRAW_STATE
|
|
#undef DRAW_STATE_COND
|
|
#undef EMIT_STATE
|
|
|
|
/* LRZ always needs depth/stencil state at draw time */
|
|
BITSET_SET(keep, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE);
|
|
BITSET_SET(keep, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE);
|
|
BITSET_SET(keep, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE);
|
|
BITSET_SET(keep, MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP);
|
|
BITSET_SET(keep, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE);
|
|
BITSET_SET(keep, MESA_VK_DYNAMIC_DS_STENCIL_OP);
|
|
BITSET_SET(keep, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK);
|
|
BITSET_SET(keep, MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE);
|
|
|
|
/* MSAA needs line mode */
|
|
BITSET_SET(keep, MESA_VK_DYNAMIC_RS_LINE_MODE);
|
|
|
|
/* The patch control points is part of the draw */
|
|
BITSET_SET(keep, MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS);
|
|
|
|
/* Vertex buffer state needs to know the max valid binding */
|
|
BITSET_SET(keep, MESA_VK_DYNAMIC_VI_BINDINGS_VALID);
|
|
|
|
/* Remove state which has been emitted and we no longer need to set when
|
|
* binding the pipeline by making it "dynamic".
|
|
*/
|
|
BITSET_ANDNOT(remove, remove, keep);
|
|
|
|
BITSET_OR(pipeline->static_state_mask, pipeline->static_state_mask, remove);
|
|
|
|
BITSET_OR(builder->graphics_state.dynamic, builder->graphics_state.dynamic,
|
|
remove);
|
|
}
|
|
|
|
static inline bool
|
|
emit_draw_state(const struct vk_dynamic_graphics_state *dynamic_state,
|
|
const enum mesa_vk_dynamic_graphics_state *state_array,
|
|
unsigned num_states)
|
|
{
|
|
BITSET_DECLARE(state, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX) = {};
|
|
|
|
/* Unrolling this loop should produce a constant value once the function is
|
|
* inlined, because state_array and num_states are a per-draw-state
|
|
* constant, but GCC seems to need a little encouragement. clang does a
|
|
* little better but still needs a pragma when there are a large number of
|
|
* states.
|
|
*/
|
|
#if defined(__clang__)
|
|
#pragma clang loop unroll(full)
|
|
#elif defined(__GNUC__) && __GNUC__ >= 8
|
|
#pragma GCC unroll MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX
|
|
#endif
|
|
for (unsigned i = 0; i < num_states; i++) {
|
|
BITSET_SET(state, state_array[i]);
|
|
}
|
|
|
|
BITSET_DECLARE(temp, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX);
|
|
BITSET_AND(temp, state, dynamic_state->dirty);
|
|
return !BITSET_IS_EMPTY(temp);
|
|
}
|
|
|
|
template <chip CHIP>
|
|
uint32_t
|
|
tu_emit_draw_state(struct tu_cmd_buffer *cmd)
|
|
{
|
|
struct tu_cs cs;
|
|
uint32_t dirty_draw_states = 0;
|
|
|
|
#define EMIT_STATE(name) \
|
|
emit_draw_state(&cmd->vk.dynamic_graphics_state, tu_##name##_state, \
|
|
ARRAY_SIZE(tu_##name##_state))
|
|
#define DRAW_STATE_COND(name, id, extra_cond, ...) \
|
|
if ((EMIT_STATE(name) || (extra_cond)) && \
|
|
!(cmd->state.pipeline_draw_states & (1u << id))) { \
|
|
unsigned size = tu6_##name##_size<CHIP>(cmd->device, __VA_ARGS__); \
|
|
if (size > 0) { \
|
|
tu_cs_begin_sub_stream(&cmd->sub_cs, size, &cs); \
|
|
tu6_emit_##name<CHIP>(&cs, __VA_ARGS__); \
|
|
cmd->state.dynamic_state[id] = \
|
|
tu_cs_end_draw_state(&cmd->sub_cs, &cs); \
|
|
} else { \
|
|
cmd->state.dynamic_state[id] = {}; \
|
|
} \
|
|
dirty_draw_states |= (1u << id); \
|
|
}
|
|
#define DRAW_STATE_FDM(name, id, ...) \
|
|
if ((EMIT_STATE(name) || (cmd->state.dirty & \
|
|
(TU_CMD_DIRTY_FDM | \
|
|
TU_CMD_DIRTY_PER_VIEW_VIEWPORT))) && \
|
|
!(cmd->state.pipeline_draw_states & (1u << id))) { \
|
|
if (cmd->state.has_fdm || cmd->state.per_layer_viewport) { \
|
|
tu_cs_set_writeable(&cmd->sub_cs, true); \
|
|
tu6_emit_##name##_fdm(&cs, cmd, __VA_ARGS__); \
|
|
cmd->state.dynamic_state[id] = \
|
|
tu_cs_end_draw_state(&cmd->sub_cs, &cs); \
|
|
tu_cs_set_writeable(&cmd->sub_cs, false); \
|
|
} else { \
|
|
unsigned size = tu6_##name##_size<CHIP>(cmd->device, __VA_ARGS__); \
|
|
if (size > 0) { \
|
|
tu_cs_begin_sub_stream(&cmd->sub_cs, size, &cs); \
|
|
tu6_emit_##name<CHIP>(&cs, __VA_ARGS__); \
|
|
cmd->state.dynamic_state[id] = \
|
|
tu_cs_end_draw_state(&cmd->sub_cs, &cs); \
|
|
} else { \
|
|
cmd->state.dynamic_state[id] = {}; \
|
|
} \
|
|
tu_cs_begin_sub_stream(&cmd->sub_cs, \
|
|
tu6_##name##_size<CHIP>(cmd->device, __VA_ARGS__), \
|
|
&cs); \
|
|
tu6_emit_##name<CHIP>(&cs, __VA_ARGS__); \
|
|
cmd->state.dynamic_state[id] = \
|
|
tu_cs_end_draw_state(&cmd->sub_cs, &cs); \
|
|
} \
|
|
dirty_draw_states |= (1u << id); \
|
|
}
|
|
#define DRAW_STATE(name, id, ...) DRAW_STATE_COND(name, id, false, __VA_ARGS__)
|
|
|
|
DRAW_STATE(vertex_input, TU_DYNAMIC_STATE_VERTEX_INPUT,
|
|
cmd->vk.dynamic_graphics_state.vi);
|
|
|
|
/* Vertex input stride is special because it's part of the vertex input in
|
|
* the pipeline but a separate array when it's dynamic state so we have to
|
|
* use two separate functions.
|
|
*/
|
|
#define tu6_emit_vertex_stride tu6_emit_vertex_stride_dyn
|
|
#define tu6_vertex_stride_size tu6_vertex_stride_size_dyn
|
|
|
|
DRAW_STATE(vertex_stride, TU_DYNAMIC_STATE_VB_STRIDE,
|
|
cmd->vk.dynamic_graphics_state.vi_binding_strides,
|
|
cmd->vk.dynamic_graphics_state.vi_bindings_valid);
|
|
|
|
#undef tu6_emit_vertex_stride
|
|
#undef tu6_vertex_stride_size
|
|
|
|
DRAW_STATE_FDM(viewport, TU_DYNAMIC_STATE_VIEWPORT,
|
|
&cmd->vk.dynamic_graphics_state.vp,
|
|
&cmd->vk.dynamic_graphics_state.rs);
|
|
DRAW_STATE_FDM(scissor, TU_DYNAMIC_STATE_SCISSOR,
|
|
&cmd->vk.dynamic_graphics_state.vp);
|
|
DRAW_STATE(sample_locations,
|
|
TU_DYNAMIC_STATE_SAMPLE_LOCATIONS,
|
|
cmd->vk.dynamic_graphics_state.ms.sample_locations_enable,
|
|
cmd->vk.dynamic_graphics_state.ms.sample_locations);
|
|
DRAW_STATE(depth_bias, TU_DYNAMIC_STATE_DEPTH_BIAS,
|
|
&cmd->vk.dynamic_graphics_state.rs);
|
|
DRAW_STATE_COND(blend, TU_DYNAMIC_STATE_BLEND,
|
|
cmd->state.dirty & TU_CMD_DIRTY_SUBPASS,
|
|
&cmd->vk.dynamic_graphics_state.cb,
|
|
&cmd->vk.dynamic_graphics_state.cal,
|
|
&cmd->state.vk_rp,
|
|
cmd->vk.dynamic_graphics_state.ms.alpha_to_coverage_enable,
|
|
cmd->vk.dynamic_graphics_state.ms.alpha_to_one_enable,
|
|
cmd->vk.dynamic_graphics_state.ms.sample_mask);
|
|
if (!cmd->state.pipeline_blend_lrz &&
|
|
(EMIT_STATE(blend_lrz) || (cmd->state.dirty & TU_CMD_DIRTY_SUBPASS))) {
|
|
tu_lrz_blend_status blend_status = tu6_calc_blend_lrz(
|
|
&cmd->vk.dynamic_graphics_state.cb, &cmd->state.vk_rp);
|
|
if (blend_status != cmd->state.lrz_blend_status) {
|
|
cmd->state.lrz_blend_status = blend_status;
|
|
cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
|
|
}
|
|
}
|
|
if (!cmd->state.pipeline_bandwidth &&
|
|
(EMIT_STATE(bandwidth) || (cmd->state.dirty & TU_CMD_DIRTY_SUBPASS)))
|
|
tu_calc_bandwidth(&cmd->state.bandwidth, &cmd->vk.dynamic_graphics_state.cb,
|
|
&cmd->state.vk_rp);
|
|
|
|
if (!cmd->state.pipeline_disable_fs &&
|
|
(EMIT_STATE(disable_fs) ||
|
|
(cmd->state.dirty & (TU_CMD_DIRTY_SUBPASS | TU_CMD_DIRTY_FS)))) {
|
|
bool disable_fs = tu_calc_disable_fs(
|
|
&cmd->vk.dynamic_graphics_state.cb, &cmd->state.vk_rp,
|
|
cmd->vk.dynamic_graphics_state.ms.alpha_to_coverage_enable,
|
|
cmd->state.shaders[MESA_SHADER_FRAGMENT]);
|
|
|
|
if (disable_fs != cmd->state.disable_fs) {
|
|
cmd->state.disable_fs = disable_fs;
|
|
cmd->state.dirty |= TU_CMD_DIRTY_DISABLE_FS;
|
|
}
|
|
}
|
|
|
|
DRAW_STATE(blend_constants, VK_DYNAMIC_STATE_BLEND_CONSTANTS,
|
|
&cmd->vk.dynamic_graphics_state.cb);
|
|
|
|
if (cmd->device->physical_device->info->props.has_attachment_shading_rate) {
|
|
DRAW_STATE_COND(fragment_shading_rate,
|
|
TU_DYNAMIC_STATE_A7XX_FRAGMENT_SHADING_RATE,
|
|
cmd->state.dirty & (TU_CMD_DIRTY_SUBPASS | TU_CMD_DIRTY_SHADING_RATE),
|
|
&cmd->vk.dynamic_graphics_state.fsr,
|
|
cmd->state.subpass->fsr_attachment != VK_ATTACHMENT_UNUSED,
|
|
cmd->state.program.writes_shading_rate,
|
|
cmd->state.program.reads_shading_rate);
|
|
}
|
|
DRAW_STATE_COND(rast, TU_DYNAMIC_STATE_RAST,
|
|
cmd->state.dirty & (TU_CMD_DIRTY_SUBPASS |
|
|
TU_CMD_DIRTY_PER_VIEW_VIEWPORT |
|
|
TU_CMD_DIRTY_DISABLE_FS),
|
|
&cmd->vk.dynamic_graphics_state.rs,
|
|
&cmd->vk.dynamic_graphics_state.vp,
|
|
cmd->state.vk_rp.view_mask != 0,
|
|
cmd->state.per_view_viewport,
|
|
cmd->state.disable_fs);
|
|
DRAW_STATE_COND(ds, TU_DYNAMIC_STATE_DS,
|
|
cmd->state.dirty & TU_CMD_DIRTY_SUBPASS,
|
|
&cmd->vk.dynamic_graphics_state.ds,
|
|
&cmd->state.vk_rp);
|
|
DRAW_STATE_COND(rb_depth_cntl, TU_DYNAMIC_STATE_RB_DEPTH_CNTL,
|
|
cmd->state.dirty & TU_CMD_DIRTY_SUBPASS,
|
|
&cmd->vk.dynamic_graphics_state.ds,
|
|
&cmd->state.vk_rp,
|
|
&cmd->vk.dynamic_graphics_state.rs);
|
|
DRAW_STATE_COND(patch_control_points,
|
|
TU_DYNAMIC_STATE_PATCH_CONTROL_POINTS,
|
|
cmd->state.dirty & TU_CMD_DIRTY_PROGRAM,
|
|
cmd->state.shaders[MESA_SHADER_VERTEX],
|
|
cmd->state.shaders[MESA_SHADER_TESS_CTRL],
|
|
cmd->state.shaders[MESA_SHADER_TESS_EVAL],
|
|
&cmd->state.program,
|
|
cmd->vk.dynamic_graphics_state.ts.patch_control_points);
|
|
if (!cmd->device->physical_device->info->props.has_coherent_ubwc_flag_caches) {
|
|
DRAW_STATE_COND(prim_mode_sysmem,
|
|
TU_DYNAMIC_STATE_PRIM_MODE_SYSMEM,
|
|
cmd->state.dirty & (TU_CMD_DIRTY_RAST_ORDER |
|
|
TU_CMD_DIRTY_FEEDBACK_LOOPS |
|
|
TU_CMD_DIRTY_FS),
|
|
cmd->state.shaders[MESA_SHADER_FRAGMENT],
|
|
cmd->state.raster_order_attachment_access,
|
|
cmd->vk.dynamic_graphics_state.feedback_loops |
|
|
cmd->state.pipeline_feedback_loops,
|
|
&cmd->state.rp.sysmem_single_prim_mode);
|
|
}
|
|
#undef DRAW_STATE
|
|
#undef DRAW_STATE_COND
|
|
#undef EMIT_STATE
|
|
|
|
return dirty_draw_states;
|
|
}
|
|
TU_GENX(tu_emit_draw_state);
|
|
|
|
static void
|
|
tu_pipeline_builder_parse_depth_stencil(
|
|
struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline)
|
|
{
|
|
const VkPipelineDepthStencilStateCreateInfo *ds_info =
|
|
builder->create_info->pDepthStencilState;
|
|
|
|
if ((builder->graphics_state.rp->attachments ==
|
|
MESA_VK_RP_ATTACHMENT_INFO_INVALID) ||
|
|
(builder->graphics_state.rp->attachments &
|
|
MESA_VK_RP_ATTACHMENT_DEPTH_BIT)) {
|
|
pipeline->ds.raster_order_attachment_access =
|
|
ds_info && (ds_info->flags &
|
|
(VK_PIPELINE_DEPTH_STENCIL_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_DEPTH_ACCESS_BIT_EXT |
|
|
VK_PIPELINE_DEPTH_STENCIL_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_STENCIL_ACCESS_BIT_EXT));
|
|
}
|
|
}
|
|
|
|
static void
|
|
tu_pipeline_builder_parse_multisample_and_color_blend(
|
|
struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline)
|
|
{
|
|
/* The spec says:
|
|
*
|
|
* pMultisampleState is a pointer to an instance of the
|
|
* VkPipelineMultisampleStateCreateInfo, and is ignored if the pipeline
|
|
* has rasterization disabled.
|
|
*
|
|
* Also,
|
|
*
|
|
* pColorBlendState is a pointer to an instance of the
|
|
* VkPipelineColorBlendStateCreateInfo structure, and is ignored if the
|
|
* pipeline has rasterization disabled or if the subpass of the render
|
|
* pass the pipeline is created against does not use any color
|
|
* attachments.
|
|
*
|
|
* We leave the relevant registers stale when rasterization is disabled.
|
|
*/
|
|
if (builder->rasterizer_discard) {
|
|
return;
|
|
}
|
|
|
|
static const VkPipelineColorBlendStateCreateInfo dummy_blend_info = {};
|
|
|
|
const VkPipelineColorBlendStateCreateInfo *blend_info =
|
|
(builder->graphics_state.rp->attachments &
|
|
MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS)
|
|
? builder->create_info->pColorBlendState
|
|
: &dummy_blend_info;
|
|
|
|
if (builder->graphics_state.rp->attachments &
|
|
MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS) {
|
|
pipeline->output.raster_order_attachment_access =
|
|
blend_info && (blend_info->flags &
|
|
VK_PIPELINE_COLOR_BLEND_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_ACCESS_BIT_EXT);
|
|
}
|
|
}
|
|
|
|
template <chip CHIP>
|
|
static void
|
|
tu_pipeline_builder_parse_rasterization_order(
|
|
struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline)
|
|
{
|
|
if (builder->rasterizer_discard)
|
|
return;
|
|
|
|
bool raster_order_attachment_access =
|
|
pipeline->output.raster_order_attachment_access ||
|
|
pipeline->ds.raster_order_attachment_access ||
|
|
TU_DEBUG(RAST_ORDER);
|
|
|
|
/* VK_EXT_blend_operation_advanced would also require ordered access
|
|
* when implemented in the future.
|
|
*/
|
|
|
|
enum a6xx_single_prim_mode gmem_prim_mode = NO_FLUSH;
|
|
|
|
if (raster_order_attachment_access) {
|
|
/* VK_EXT_rasterization_order_attachment_access:
|
|
*
|
|
* This extension allow access to framebuffer attachments when used as
|
|
* both input and color attachments from one fragment to the next,
|
|
* in rasterization order, without explicit synchronization.
|
|
*/
|
|
gmem_prim_mode = FLUSH_PER_OVERLAP;
|
|
}
|
|
|
|
struct tu_cs cs;
|
|
|
|
pipeline->prim_order.state_gmem = tu_cs_draw_state(&pipeline->cs, &cs, 2);
|
|
tu_cs_emit_regs(&cs, GRAS_SC_CNTL(CHIP,
|
|
.single_prim_mode = gmem_prim_mode,
|
|
.ccusinglecachelinesize = 2,
|
|
));
|
|
}
|
|
|
|
static void
|
|
tu_pipeline_finish(struct tu_pipeline *pipeline,
|
|
struct tu_device *dev,
|
|
const VkAllocationCallbacks *alloc)
|
|
{
|
|
tu_cs_finish(&pipeline->cs);
|
|
TU_RMV(resource_destroy, dev, &pipeline->bo);
|
|
|
|
mtx_lock(&dev->pipeline_mutex);
|
|
tu_suballoc_bo_free(&dev->pipeline_suballoc, &pipeline->bo);
|
|
mtx_unlock(&dev->pipeline_mutex);
|
|
|
|
if (pipeline->type == TU_PIPELINE_GRAPHICS_LIB) {
|
|
struct tu_graphics_lib_pipeline *library =
|
|
tu_pipeline_to_graphics_lib(pipeline);
|
|
|
|
if (library->nir_shaders)
|
|
vk_pipeline_cache_object_unref(&dev->vk,
|
|
&library->nir_shaders->base);
|
|
|
|
for (unsigned i = 0; i < library->num_sets; i++) {
|
|
if (library->layouts[i])
|
|
vk_descriptor_set_layout_unref(&dev->vk, &library->layouts[i]->vk);
|
|
}
|
|
|
|
vk_free2(&dev->vk.alloc, alloc, library->state_data);
|
|
}
|
|
|
|
for (unsigned i = 0; i < ARRAY_SIZE(pipeline->shaders); i++) {
|
|
if (pipeline->shaders[i])
|
|
vk_pipeline_cache_object_unref(&dev->vk,
|
|
&pipeline->shaders[i]->base);
|
|
}
|
|
|
|
ralloc_free(pipeline->executables_mem_ctx);
|
|
}
|
|
|
|
static VkGraphicsPipelineLibraryFlagBitsEXT
|
|
vk_shader_stage_to_pipeline_library_flags(VkShaderStageFlagBits stage)
|
|
{
|
|
assert(util_bitcount(stage) == 1);
|
|
switch (stage) {
|
|
case VK_SHADER_STAGE_VERTEX_BIT:
|
|
case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
|
|
case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
|
|
case VK_SHADER_STAGE_GEOMETRY_BIT:
|
|
case VK_SHADER_STAGE_TASK_BIT_EXT:
|
|
case VK_SHADER_STAGE_MESH_BIT_EXT:
|
|
return VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT;
|
|
case VK_SHADER_STAGE_FRAGMENT_BIT:
|
|
return VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT;
|
|
default:
|
|
UNREACHABLE("Invalid shader stage");
|
|
}
|
|
}
|
|
|
|
template <chip CHIP>
|
|
static VkResult
|
|
tu_pipeline_builder_build(struct tu_pipeline_builder *builder,
|
|
struct tu_pipeline **pipeline)
|
|
{
|
|
VkResult result;
|
|
|
|
if (builder->create_flags & VK_PIPELINE_CREATE_2_LIBRARY_BIT_KHR) {
|
|
*pipeline = (struct tu_pipeline *) vk_object_zalloc(
|
|
&builder->device->vk, builder->alloc,
|
|
sizeof(struct tu_graphics_lib_pipeline),
|
|
VK_OBJECT_TYPE_PIPELINE);
|
|
if (!*pipeline)
|
|
return VK_ERROR_OUT_OF_HOST_MEMORY;
|
|
(*pipeline)->type = TU_PIPELINE_GRAPHICS_LIB;
|
|
} else {
|
|
*pipeline = (struct tu_pipeline *) vk_object_zalloc(
|
|
&builder->device->vk, builder->alloc,
|
|
sizeof(struct tu_graphics_pipeline),
|
|
VK_OBJECT_TYPE_PIPELINE);
|
|
if (!*pipeline)
|
|
return VK_ERROR_OUT_OF_HOST_MEMORY;
|
|
(*pipeline)->type = TU_PIPELINE_GRAPHICS;
|
|
}
|
|
|
|
(*pipeline)->executables_mem_ctx = ralloc_context(NULL);
|
|
util_dynarray_init(&(*pipeline)->executables, (*pipeline)->executables_mem_ctx);
|
|
|
|
tu_pipeline_builder_parse_libraries(builder, *pipeline);
|
|
|
|
VkShaderStageFlags stages = 0;
|
|
for (unsigned i = 0; i < builder->create_info->stageCount; i++) {
|
|
VkShaderStageFlagBits stage = builder->create_info->pStages[i].stage;
|
|
|
|
/* Ignore shader stages that don't need to be imported. */
|
|
if (!(vk_shader_stage_to_pipeline_library_flags(stage) & builder->state))
|
|
continue;
|
|
|
|
stages |= stage;
|
|
}
|
|
builder->active_stages = stages;
|
|
|
|
(*pipeline)->active_stages = stages;
|
|
for (unsigned i = 0; i < builder->num_libraries; i++)
|
|
(*pipeline)->active_stages |= builder->libraries[i]->base.active_stages;
|
|
|
|
/* Compile and upload shaders unless a library has already done that. */
|
|
if ((*pipeline)->program.vs_state.size == 0) {
|
|
tu_pipeline_builder_parse_layout(builder, *pipeline);
|
|
|
|
result = tu_pipeline_builder_compile_shaders(builder, *pipeline);
|
|
if (result != VK_SUCCESS) {
|
|
tu_pipeline_finish(*pipeline, builder->device, builder->alloc);
|
|
vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
|
|
return result;
|
|
}
|
|
}
|
|
|
|
result = tu_pipeline_allocate_cs(builder->device, *pipeline,
|
|
&builder->layout, builder, NULL);
|
|
|
|
|
|
if (set_combined_state(builder, *pipeline,
|
|
VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
|
|
VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) {
|
|
if (result != VK_SUCCESS) {
|
|
vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
|
|
return result;
|
|
}
|
|
|
|
tu_emit_program_state<CHIP>(&(*pipeline)->cs, &(*pipeline)->program,
|
|
(*pipeline)->shaders);
|
|
|
|
if (CHIP == A6XX) {
|
|
/* Blob doesn't preload state on A7XX, likely preloading either
|
|
* doesn't work or doesn't provide benefits.
|
|
*/
|
|
tu6_emit_load_state(builder->device, *pipeline, &builder->layout);
|
|
}
|
|
}
|
|
|
|
if (builder->state & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) {
|
|
tu_pipeline_builder_parse_depth_stencil(builder, *pipeline);
|
|
}
|
|
|
|
if (builder->state &
|
|
VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT) {
|
|
tu_pipeline_builder_parse_multisample_and_color_blend(builder, *pipeline);
|
|
}
|
|
|
|
if (set_combined_state(builder, *pipeline,
|
|
VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT |
|
|
VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT)) {
|
|
tu_pipeline_builder_parse_rasterization_order<CHIP>(builder, *pipeline);
|
|
}
|
|
|
|
tu_pipeline_builder_emit_state<CHIP>(builder, *pipeline);
|
|
|
|
if ((*pipeline)->type == TU_PIPELINE_GRAPHICS_LIB) {
|
|
struct tu_graphics_lib_pipeline *library =
|
|
tu_pipeline_to_graphics_lib(*pipeline);
|
|
result = vk_graphics_pipeline_state_copy(&builder->device->vk,
|
|
&library->graphics_state,
|
|
&builder->graphics_state,
|
|
builder->alloc,
|
|
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT,
|
|
&library->state_data);
|
|
if (result != VK_SUCCESS) {
|
|
tu_pipeline_finish(*pipeline, builder->device, builder->alloc);
|
|
return result;
|
|
}
|
|
} else {
|
|
struct tu_graphics_pipeline *gfx_pipeline =
|
|
tu_pipeline_to_graphics(*pipeline);
|
|
gfx_pipeline->dynamic_state.ms.sample_locations =
|
|
&gfx_pipeline->sample_locations;
|
|
vk_dynamic_graphics_state_fill(&gfx_pipeline->dynamic_state,
|
|
&builder->graphics_state);
|
|
gfx_pipeline->feedback_loops =
|
|
vk_pipeline_flags_feedback_loops(builder->graphics_state.pipeline_flags);
|
|
gfx_pipeline->feedback_loop_may_involve_textures =
|
|
builder->graphics_state.feedback_loop_not_input_only;
|
|
}
|
|
|
|
return VK_SUCCESS;
|
|
}
|
|
|
|
static void
|
|
tu_pipeline_builder_finish(struct tu_pipeline_builder *builder)
|
|
{
|
|
ralloc_free(builder->mem_ctx);
|
|
}
|
|
|
|
void
|
|
tu_fill_render_pass_state(struct vk_render_pass_state *rp,
|
|
const struct tu_render_pass *pass,
|
|
const struct tu_subpass *subpass)
|
|
{
|
|
rp->view_mask = subpass->multiview_mask;
|
|
rp->color_attachment_count = subpass->color_count;
|
|
|
|
const uint32_t a = subpass->depth_stencil_attachment.attachment;
|
|
rp->depth_attachment_format = VK_FORMAT_UNDEFINED;
|
|
rp->stencil_attachment_format = VK_FORMAT_UNDEFINED;
|
|
rp->attachments = MESA_VK_RP_ATTACHMENT_NONE;
|
|
if (a != VK_ATTACHMENT_UNUSED) {
|
|
VkFormat ds_format = pass->attachments[a].format;
|
|
if (vk_format_has_depth(ds_format) && subpass->depth_used) {
|
|
rp->depth_attachment_format = ds_format;
|
|
rp->attachments |= MESA_VK_RP_ATTACHMENT_DEPTH_BIT;
|
|
}
|
|
if (vk_format_has_stencil(ds_format) && subpass->stencil_used) {
|
|
rp->stencil_attachment_format = ds_format;
|
|
rp->attachments |= MESA_VK_RP_ATTACHMENT_STENCIL_BIT;
|
|
}
|
|
}
|
|
|
|
for (uint32_t i = 0; i < subpass->color_count; i++) {
|
|
const uint32_t a = subpass->color_attachments[i].attachment;
|
|
if (a == VK_ATTACHMENT_UNUSED) {
|
|
rp->color_attachment_formats[i] = VK_FORMAT_UNDEFINED;
|
|
continue;
|
|
}
|
|
|
|
rp->color_attachment_formats[i] = pass->attachments[a].format;
|
|
rp->attachments |= MESA_VK_RP_ATTACHMENT_COLOR_BIT(i);
|
|
}
|
|
|
|
rp->custom_resolve = subpass->custom_resolve;
|
|
}
|
|
|
|
static void
|
|
tu_pipeline_builder_init_graphics(
|
|
struct tu_pipeline_builder *builder,
|
|
struct tu_device *dev,
|
|
struct vk_pipeline_cache *cache,
|
|
const VkGraphicsPipelineCreateInfo *create_info,
|
|
VkPipelineCreateFlags2KHR flags,
|
|
const VkAllocationCallbacks *alloc)
|
|
{
|
|
*builder = (struct tu_pipeline_builder) {
|
|
.device = dev,
|
|
.mem_ctx = ralloc_context(NULL),
|
|
.cache = cache,
|
|
.alloc = alloc,
|
|
.create_info = create_info,
|
|
.create_flags = flags,
|
|
};
|
|
|
|
const VkGraphicsPipelineLibraryCreateInfoEXT *gpl_info =
|
|
vk_find_struct_const(builder->create_info->pNext,
|
|
GRAPHICS_PIPELINE_LIBRARY_CREATE_INFO_EXT);
|
|
|
|
const VkPipelineLibraryCreateInfoKHR *library_info =
|
|
vk_find_struct_const(builder->create_info->pNext,
|
|
PIPELINE_LIBRARY_CREATE_INFO_KHR);
|
|
|
|
if (gpl_info) {
|
|
builder->state = gpl_info->flags;
|
|
} else {
|
|
/* Implement this bit of spec text:
|
|
*
|
|
* If this structure is omitted, and either
|
|
* VkGraphicsPipelineCreateInfo::flags includes
|
|
* VK_PIPELINE_CREATE_LIBRARY_BIT_KHR or the
|
|
* VkGraphicsPipelineCreateInfo::pNext chain includes a
|
|
* VkPipelineLibraryCreateInfoKHR structure with a libraryCount
|
|
* greater than 0, it is as if flags is 0. Otherwise if this
|
|
* structure is omitted, it is as if flags includes all possible
|
|
* subsets of the graphics pipeline (i.e. a complete graphics
|
|
* pipeline).
|
|
*/
|
|
if ((library_info && library_info->libraryCount > 0) ||
|
|
(builder->create_flags & VK_PIPELINE_CREATE_2_LIBRARY_BIT_KHR)) {
|
|
builder->state = 0;
|
|
} else {
|
|
builder->state =
|
|
VK_GRAPHICS_PIPELINE_LIBRARY_VERTEX_INPUT_INTERFACE_BIT_EXT |
|
|
VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
|
|
VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT |
|
|
VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT;
|
|
}
|
|
}
|
|
|
|
bool rasterizer_discard_dynamic = false;
|
|
if (create_info->pDynamicState) {
|
|
for (uint32_t i = 0; i < create_info->pDynamicState->dynamicStateCount; i++) {
|
|
if (create_info->pDynamicState->pDynamicStates[i] ==
|
|
VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE) {
|
|
rasterizer_discard_dynamic = true;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
builder->rasterizer_discard =
|
|
(builder->state & VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT) &&
|
|
!rasterizer_discard_dynamic &&
|
|
builder->create_info->pRasterizationState->rasterizerDiscardEnable;
|
|
|
|
struct vk_render_pass_state rp_state = {};
|
|
const struct vk_render_pass_state *driver_rp = NULL;
|
|
VkPipelineCreateFlags2KHR rp_flags = 0;
|
|
|
|
builder->unscaled_input_fragcoord = 0;
|
|
|
|
/* Extract information we need from the turnip renderpass. This will be
|
|
* filled out automatically if the app is using dynamic rendering or
|
|
* renderpasses are emulated.
|
|
*/
|
|
if (!TU_DEBUG(DYNAMIC) &&
|
|
(builder->state &
|
|
(VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
|
|
VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT |
|
|
VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT)) &&
|
|
builder->create_info->renderPass) {
|
|
const struct tu_render_pass *pass =
|
|
tu_render_pass_from_handle(create_info->renderPass);
|
|
const struct tu_subpass *subpass =
|
|
&pass->subpasses[create_info->subpass];
|
|
|
|
tu_fill_render_pass_state(&rp_state, pass, subpass);
|
|
|
|
for (unsigned i = 0; i < subpass->input_count; i++) {
|
|
/* Input attachments stored in GMEM must be loaded with unscaled
|
|
* FragCoord.
|
|
*/
|
|
if (subpass->input_attachments[i].patch_input_gmem)
|
|
builder->unscaled_input_fragcoord |= 1u << i;
|
|
}
|
|
|
|
if (subpass->feedback_loop_color) {
|
|
rp_flags |=
|
|
VK_PIPELINE_CREATE_2_COLOR_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT;
|
|
}
|
|
|
|
if (subpass->feedback_loop_ds) {
|
|
rp_flags |=
|
|
VK_PIPELINE_CREATE_2_DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT;
|
|
}
|
|
|
|
if (pass->fragment_density_map.attachment != VK_ATTACHMENT_UNUSED) {
|
|
rp_flags |=
|
|
VK_PIPELINE_CREATE_2_RENDERING_FRAGMENT_DENSITY_MAP_ATTACHMENT_BIT_EXT;
|
|
}
|
|
|
|
if (subpass->fsr_attachment != VK_ATTACHMENT_UNUSED) {
|
|
rp_flags |=
|
|
VK_PIPELINE_CREATE_2_RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR;
|
|
}
|
|
|
|
if (pass->has_layered_fdm) {
|
|
rp_flags |=
|
|
VK_PIPELINE_CREATE_2_PER_LAYER_FRAGMENT_DENSITY_BIT_VALVE;
|
|
}
|
|
|
|
builder->unscaled_input_fragcoord = 0;
|
|
for (unsigned i = 0; i < subpass->input_count; i++) {
|
|
/* Input attachments stored in GMEM must be loaded with unscaled
|
|
* FragCoord.
|
|
*/
|
|
if (subpass->input_attachments[i].patch_input_gmem)
|
|
builder->unscaled_input_fragcoord |= 1u << i;
|
|
}
|
|
|
|
driver_rp = &rp_state;
|
|
}
|
|
|
|
vk_graphics_pipeline_state_fill(&dev->vk,
|
|
&builder->graphics_state,
|
|
builder->create_info,
|
|
driver_rp,
|
|
rp_flags,
|
|
&builder->all_state,
|
|
NULL, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT,
|
|
NULL);
|
|
|
|
if (builder->graphics_state.rp) {
|
|
builder->fragment_density_map = (builder->graphics_state.pipeline_flags &
|
|
VK_PIPELINE_CREATE_2_RENDERING_FRAGMENT_DENSITY_MAP_ATTACHMENT_BIT_EXT) ||
|
|
TU_DEBUG(FDM);
|
|
builder->fdm_per_layer = (builder->graphics_state.pipeline_flags &
|
|
VK_PIPELINE_CREATE_2_PER_LAYER_FRAGMENT_DENSITY_BIT_VALVE);
|
|
if (builder->fdm_per_layer) {
|
|
const VkPipelineFragmentDensityMapLayeredCreateInfoVALVE *fdm_layered_info =
|
|
vk_find_struct_const(create_info->pNext,
|
|
PIPELINE_FRAGMENT_DENSITY_MAP_LAYERED_CREATE_INFO_VALVE);
|
|
if (fdm_layered_info) {
|
|
builder->max_fdm_layers =
|
|
fdm_layered_info->maxFragmentDensityMapLayers;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
template <chip CHIP>
|
|
static VkResult
|
|
tu_graphics_pipeline_create(VkDevice device,
|
|
VkPipelineCache pipelineCache,
|
|
const VkGraphicsPipelineCreateInfo *pCreateInfo,
|
|
VkPipelineCreateFlags2KHR flags,
|
|
const VkAllocationCallbacks *pAllocator,
|
|
VkPipeline *pPipeline)
|
|
{
|
|
VK_FROM_HANDLE(tu_device, dev, device);
|
|
VK_FROM_HANDLE(vk_pipeline_cache, cache, pipelineCache);
|
|
|
|
cache = cache ? cache : dev->mem_cache;
|
|
|
|
struct tu_pipeline_builder builder;
|
|
tu_pipeline_builder_init_graphics(&builder, dev, cache,
|
|
pCreateInfo, flags, pAllocator);
|
|
|
|
struct tu_pipeline *pipeline = NULL;
|
|
VkResult result = tu_pipeline_builder_build<CHIP>(&builder, &pipeline);
|
|
tu_pipeline_builder_finish(&builder);
|
|
|
|
if (result == VK_SUCCESS) {
|
|
TU_RMV(graphics_pipeline_create, dev, tu_pipeline_to_graphics(pipeline));
|
|
|
|
*pPipeline = tu_pipeline_to_handle(pipeline);
|
|
} else
|
|
*pPipeline = VK_NULL_HANDLE;
|
|
|
|
return result;
|
|
}
|
|
|
|
template <chip CHIP>
|
|
VKAPI_ATTR VkResult VKAPI_CALL
|
|
tu_CreateGraphicsPipelines(VkDevice device,
|
|
VkPipelineCache pipelineCache,
|
|
uint32_t count,
|
|
const VkGraphicsPipelineCreateInfo *pCreateInfos,
|
|
const VkAllocationCallbacks *pAllocator,
|
|
VkPipeline *pPipelines)
|
|
{
|
|
MESA_TRACE_FUNC();
|
|
VkResult final_result = VK_SUCCESS;
|
|
uint32_t i = 0;
|
|
|
|
for (; i < count; i++) {
|
|
VkPipelineCreateFlags2KHR flags =
|
|
vk_graphics_pipeline_create_flags(&pCreateInfos[i]);
|
|
|
|
VkResult result =
|
|
tu_graphics_pipeline_create<CHIP>(device, pipelineCache,
|
|
&pCreateInfos[i], flags,
|
|
pAllocator, &pPipelines[i]);
|
|
|
|
if (result != VK_SUCCESS) {
|
|
final_result = result;
|
|
pPipelines[i] = VK_NULL_HANDLE;
|
|
|
|
if (flags & VK_PIPELINE_CREATE_2_EARLY_RETURN_ON_FAILURE_BIT_KHR)
|
|
break;
|
|
}
|
|
}
|
|
|
|
for (; i < count; i++)
|
|
pPipelines[i] = VK_NULL_HANDLE;
|
|
|
|
return final_result;
|
|
}
|
|
TU_GENX(tu_CreateGraphicsPipelines);
|
|
|
|
template <chip CHIP>
|
|
static VkResult
|
|
tu_compute_pipeline_create(VkDevice device,
|
|
VkPipelineCache pipelineCache,
|
|
const VkComputePipelineCreateInfo *pCreateInfo,
|
|
VkPipelineCreateFlags2KHR flags,
|
|
const VkAllocationCallbacks *pAllocator,
|
|
VkPipeline *pPipeline)
|
|
{
|
|
VK_FROM_HANDLE(tu_device, dev, device);
|
|
VK_FROM_HANDLE(vk_pipeline_cache, cache, pipelineCache);
|
|
VK_FROM_HANDLE(tu_pipeline_layout, layout, pCreateInfo->layout);
|
|
const VkPipelineShaderStageCreateInfo *stage_info = &pCreateInfo->stage;
|
|
VkResult result;
|
|
const struct ir3_shader_variant *v = NULL;
|
|
|
|
cache = cache ? cache : dev->mem_cache;
|
|
|
|
struct tu_compute_pipeline *pipeline;
|
|
|
|
*pPipeline = VK_NULL_HANDLE;
|
|
|
|
VkPipelineCreationFeedback pipeline_feedback = {
|
|
.flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
|
|
};
|
|
|
|
const VkPipelineCreationFeedbackCreateInfo *creation_feedback =
|
|
vk_find_struct_const(pCreateInfo->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO);
|
|
|
|
int64_t pipeline_start = os_time_get_nano();
|
|
|
|
pipeline = (struct tu_compute_pipeline *) vk_object_zalloc(
|
|
&dev->vk, pAllocator, sizeof(*pipeline), VK_OBJECT_TYPE_PIPELINE);
|
|
if (!pipeline)
|
|
return VK_ERROR_OUT_OF_HOST_MEMORY;
|
|
pipeline->base.type = TU_PIPELINE_COMPUTE;
|
|
|
|
pipeline->base.executables_mem_ctx = ralloc_context(NULL);
|
|
util_dynarray_init(&pipeline->base.executables, pipeline->base.executables_mem_ctx);
|
|
pipeline->base.active_stages = VK_SHADER_STAGE_COMPUTE_BIT;
|
|
|
|
struct tu_shader_key key = { };
|
|
bool allow_varying_subgroup_size =
|
|
(stage_info->flags &
|
|
VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT_EXT);
|
|
bool require_full_subgroups =
|
|
stage_info->flags &
|
|
VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT;
|
|
const VkPipelineShaderStageRequiredSubgroupSizeCreateInfo *subgroup_info =
|
|
vk_find_struct_const(stage_info,
|
|
PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO);
|
|
tu_shader_key_subgroup_size(&key, allow_varying_subgroup_size,
|
|
require_full_subgroups, subgroup_info,
|
|
dev);
|
|
|
|
struct vk_pipeline_robustness_state rs;
|
|
vk_pipeline_robustness_state_fill(&dev->vk, &rs,
|
|
pCreateInfo->pNext,
|
|
stage_info->pNext);
|
|
tu_shader_key_robustness(&key, &rs);
|
|
|
|
void *pipeline_mem_ctx = ralloc_context(NULL);
|
|
|
|
unsigned char pipeline_sha1[20];
|
|
tu_hash_compute(pipeline_sha1, flags, stage_info, layout, &key);
|
|
|
|
struct tu_shader *shader = NULL;
|
|
|
|
const bool executable_info = flags &
|
|
VK_PIPELINE_CREATE_2_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR;
|
|
|
|
bool application_cache_hit = false;
|
|
|
|
if (!executable_info) {
|
|
shader =
|
|
tu_pipeline_cache_lookup(cache, pipeline_sha1, sizeof(pipeline_sha1),
|
|
&application_cache_hit);
|
|
}
|
|
|
|
if (application_cache_hit && cache != dev->mem_cache) {
|
|
pipeline_feedback.flags |=
|
|
VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
|
|
}
|
|
|
|
char *nir_initial_disasm = NULL;
|
|
|
|
if (!shader) {
|
|
if (flags &
|
|
VK_PIPELINE_CREATE_2_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_KHR) {
|
|
result = VK_PIPELINE_COMPILE_REQUIRED;
|
|
goto fail;
|
|
}
|
|
|
|
struct ir3_shader_key ir3_key = {};
|
|
|
|
nir_shader *nir = tu_spirv_to_nir(dev, pipeline_mem_ctx, flags,
|
|
stage_info, &key, MESA_SHADER_COMPUTE);
|
|
|
|
nir_initial_disasm = executable_info ?
|
|
nir_shader_as_str(nir, pipeline->base.executables_mem_ctx) : NULL;
|
|
|
|
result = tu_shader_create(dev, &shader, nir, &key, &ir3_key,
|
|
pipeline_sha1, sizeof(pipeline_sha1), layout,
|
|
executable_info);
|
|
if (!shader) {
|
|
goto fail;
|
|
}
|
|
|
|
shader = tu_pipeline_cache_insert(cache, shader);
|
|
}
|
|
|
|
pipeline_feedback.duration = os_time_get_nano() - pipeline_start;
|
|
|
|
if (creation_feedback) {
|
|
*creation_feedback->pPipelineCreationFeedback = pipeline_feedback;
|
|
if (creation_feedback->pipelineStageCreationFeedbackCount > 0) {
|
|
assert(creation_feedback->pipelineStageCreationFeedbackCount == 1);
|
|
creation_feedback->pPipelineStageCreationFeedbacks[0] = pipeline_feedback;
|
|
}
|
|
}
|
|
|
|
pipeline->base.active_desc_sets = shader->active_desc_sets;
|
|
|
|
v = shader->variant;
|
|
|
|
tu_pipeline_set_linkage(&pipeline->base.program.link[MESA_SHADER_COMPUTE],
|
|
&shader->const_state, v);
|
|
|
|
result = tu_pipeline_allocate_cs(dev, &pipeline->base, layout, NULL, v);
|
|
if (result != VK_SUCCESS)
|
|
goto fail;
|
|
|
|
for (int i = 0; i < 3; i++)
|
|
pipeline->local_size[i] = v->local_size[i];
|
|
|
|
if (CHIP == A6XX) {
|
|
tu6_emit_load_state(dev, &pipeline->base, layout);
|
|
}
|
|
|
|
tu_append_executable(&pipeline->base, v, nir_initial_disasm);
|
|
|
|
pipeline->instrlen = v->instrlen;
|
|
|
|
pipeline->base.shaders[MESA_SHADER_COMPUTE] = shader;
|
|
|
|
ralloc_free(pipeline_mem_ctx);
|
|
|
|
TU_RMV(compute_pipeline_create, dev, pipeline);
|
|
|
|
*pPipeline = tu_pipeline_to_handle(&pipeline->base);
|
|
|
|
return VK_SUCCESS;
|
|
|
|
fail:
|
|
if (shader)
|
|
vk_pipeline_cache_object_unref(&dev->vk, &shader->base);
|
|
|
|
ralloc_free(pipeline->base.executables_mem_ctx);
|
|
ralloc_free(pipeline_mem_ctx);
|
|
|
|
vk_object_free(&dev->vk, pAllocator, pipeline);
|
|
|
|
return result;
|
|
}
|
|
|
|
template <chip CHIP>
|
|
VKAPI_ATTR VkResult VKAPI_CALL
|
|
tu_CreateComputePipelines(VkDevice device,
|
|
VkPipelineCache pipelineCache,
|
|
uint32_t count,
|
|
const VkComputePipelineCreateInfo *pCreateInfos,
|
|
const VkAllocationCallbacks *pAllocator,
|
|
VkPipeline *pPipelines)
|
|
{
|
|
MESA_TRACE_FUNC();
|
|
VkResult final_result = VK_SUCCESS;
|
|
uint32_t i = 0;
|
|
|
|
for (; i < count; i++) {
|
|
VkPipelineCreateFlags2KHR flags =
|
|
vk_compute_pipeline_create_flags(&pCreateInfos[i]);
|
|
|
|
VkResult result =
|
|
tu_compute_pipeline_create<CHIP>(device, pipelineCache,
|
|
&pCreateInfos[i], flags,
|
|
pAllocator, &pPipelines[i]);
|
|
if (result != VK_SUCCESS) {
|
|
final_result = result;
|
|
pPipelines[i] = VK_NULL_HANDLE;
|
|
|
|
if (flags &
|
|
VK_PIPELINE_CREATE_2_EARLY_RETURN_ON_FAILURE_BIT_KHR)
|
|
break;
|
|
}
|
|
}
|
|
|
|
for (; i < count; i++)
|
|
pPipelines[i] = VK_NULL_HANDLE;
|
|
|
|
return final_result;
|
|
}
|
|
TU_GENX(tu_CreateComputePipelines);
|
|
|
|
VKAPI_ATTR void VKAPI_CALL
|
|
tu_DestroyPipeline(VkDevice _device,
|
|
VkPipeline _pipeline,
|
|
const VkAllocationCallbacks *pAllocator)
|
|
{
|
|
VK_FROM_HANDLE(tu_device, dev, _device);
|
|
VK_FROM_HANDLE(tu_pipeline, pipeline, _pipeline);
|
|
|
|
if (!_pipeline)
|
|
return;
|
|
|
|
TU_RMV(resource_destroy, dev, pipeline);
|
|
|
|
tu_pipeline_finish(pipeline, dev, pAllocator);
|
|
vk_object_free(&dev->vk, pAllocator, pipeline);
|
|
}
|
|
|
|
static const struct tu_pipeline_executable *
|
|
tu_pipeline_get_executable(struct tu_pipeline *pipeline, uint32_t index)
|
|
{
|
|
assert(index < util_dynarray_num_elements(&pipeline->executables,
|
|
struct tu_pipeline_executable));
|
|
return util_dynarray_element(
|
|
&pipeline->executables, struct tu_pipeline_executable, index);
|
|
}
|
|
|
|
VKAPI_ATTR VkResult VKAPI_CALL
|
|
tu_GetPipelineExecutablePropertiesKHR(
|
|
VkDevice _device,
|
|
const VkPipelineInfoKHR* pPipelineInfo,
|
|
uint32_t* pExecutableCount,
|
|
VkPipelineExecutablePropertiesKHR* pProperties)
|
|
{
|
|
VK_FROM_HANDLE(tu_device, dev, _device);
|
|
VK_FROM_HANDLE(tu_pipeline, pipeline, pPipelineInfo->pipeline);
|
|
VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutablePropertiesKHR, out,
|
|
pProperties, pExecutableCount);
|
|
|
|
util_dynarray_foreach (&pipeline->executables, struct tu_pipeline_executable, exe) {
|
|
vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props) {
|
|
mesa_shader_stage stage = exe->stage;
|
|
props->stages = mesa_to_vk_shader_stage(stage);
|
|
|
|
if (!exe->is_binning)
|
|
VK_COPY_STR(props->name, _mesa_shader_stage_to_abbrev(stage));
|
|
else
|
|
VK_COPY_STR(props->name, "Binning VS");
|
|
|
|
VK_COPY_STR(props->description, _mesa_shader_stage_to_string(stage));
|
|
|
|
props->subgroupSize =
|
|
dev->compiler->threadsize_base * (exe->stats.double_threadsize ? 2 : 1);
|
|
}
|
|
}
|
|
|
|
return vk_outarray_status(&out);
|
|
}
|
|
|
|
VKAPI_ATTR VkResult VKAPI_CALL
|
|
tu_GetPipelineExecutableStatisticsKHR(
|
|
VkDevice _device,
|
|
const VkPipelineExecutableInfoKHR* pExecutableInfo,
|
|
uint32_t* pStatisticCount,
|
|
VkPipelineExecutableStatisticKHR* pStatistics)
|
|
{
|
|
VK_FROM_HANDLE(tu_pipeline, pipeline, pExecutableInfo->pipeline);
|
|
VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out,
|
|
pStatistics, pStatisticCount);
|
|
|
|
const struct tu_pipeline_executable *exe =
|
|
tu_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex);
|
|
|
|
struct adreno_stats stats;
|
|
stats.maxwaves = exe->stats.max_waves;
|
|
stats.inst = exe->stats.instrs_count;
|
|
stats.code_size = exe->stats.sizedwords;
|
|
stats.nops = exe->stats.nops_count;
|
|
stats.mov = exe->stats.mov_count;
|
|
stats.cov = exe->stats.cov_count;
|
|
stats.full = exe->stats.max_reg + 1;
|
|
stats.half = exe->stats.max_half_reg + 1;
|
|
stats.last_baryf = exe->stats.last_baryf;
|
|
stats.last_helper = exe->stats.last_helper;
|
|
stats.ss = exe->stats.ss;
|
|
stats.sy = exe->stats.sy;
|
|
stats.ss_stall = exe->stats.sstall;
|
|
stats.sy_stall = exe->stats.systall;
|
|
stats.loops = exe->stats.loops;
|
|
stats.stps = exe->stats.stp_count;
|
|
stats.ldps = exe->stats.ldp_count;
|
|
stats.preamble_inst = exe->stats.preamble_instrs_count;
|
|
stats.early_preamble = exe->stats.early_preamble;
|
|
stats.constlen = exe->stats.constlen;
|
|
|
|
for (unsigned i = 0; i < ARRAY_SIZE(exe->stats.instrs_per_cat); ++i) {
|
|
stats.cat[i] = exe->stats.instrs_per_cat[i];
|
|
}
|
|
|
|
vk_add_adreno_stats(out, &stats);
|
|
return vk_outarray_status(&out);
|
|
}
|
|
|
|
static bool
|
|
write_ir_text(VkPipelineExecutableInternalRepresentationKHR* ir,
|
|
const char *data)
|
|
{
|
|
ir->isText = VK_TRUE;
|
|
|
|
size_t data_len = strlen(data) + 1;
|
|
|
|
if (ir->pData == NULL) {
|
|
ir->dataSize = data_len;
|
|
return true;
|
|
}
|
|
|
|
strncpy((char *) ir->pData, data, ir->dataSize);
|
|
if (ir->dataSize < data_len)
|
|
return false;
|
|
|
|
ir->dataSize = data_len;
|
|
return true;
|
|
}
|
|
|
|
VKAPI_ATTR VkResult VKAPI_CALL
|
|
tu_GetPipelineExecutableInternalRepresentationsKHR(
|
|
VkDevice _device,
|
|
const VkPipelineExecutableInfoKHR* pExecutableInfo,
|
|
uint32_t* pInternalRepresentationCount,
|
|
VkPipelineExecutableInternalRepresentationKHR* pInternalRepresentations)
|
|
{
|
|
VK_FROM_HANDLE(tu_pipeline, pipeline, pExecutableInfo->pipeline);
|
|
VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableInternalRepresentationKHR, out,
|
|
pInternalRepresentations, pInternalRepresentationCount);
|
|
bool incomplete_text = false;
|
|
|
|
const struct tu_pipeline_executable *exe =
|
|
tu_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex);
|
|
|
|
if (exe->nir_from_spirv) {
|
|
vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) {
|
|
VK_COPY_STR(ir->name, "NIR from SPIRV");
|
|
VK_COPY_STR(ir->description, "Initial NIR before any optimizations");
|
|
|
|
if (!write_ir_text(ir, exe->nir_from_spirv))
|
|
incomplete_text = true;
|
|
}
|
|
}
|
|
|
|
if (exe->nir_final) {
|
|
vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) {
|
|
VK_COPY_STR(ir->name, "Final NIR");
|
|
VK_COPY_STR(ir->description,
|
|
"Final NIR before going into the back-end compiler");
|
|
|
|
if (!write_ir_text(ir, exe->nir_final))
|
|
incomplete_text = true;
|
|
}
|
|
}
|
|
|
|
if (exe->disasm) {
|
|
vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) {
|
|
VK_COPY_STR(ir->name, "IR3 Assembly");
|
|
VK_COPY_STR(ir->description,
|
|
"Final IR3 assembly for the generated shader binary");
|
|
|
|
if (!write_ir_text(ir, exe->disasm))
|
|
incomplete_text = true;
|
|
}
|
|
}
|
|
|
|
return incomplete_text ? VK_INCOMPLETE : vk_outarray_status(&out);
|
|
}
|