pvr, pco: add base compute support

Signed-off-by: Simon Perretta <simon.perretta@imgtec.com>
Acked-by: Erik Faye-Lund <erik.faye-lund@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36412>
This commit is contained in:
Simon Perretta 2025-01-01 22:11:21 +00:00 committed by Marge Bot
parent ad2b623744
commit 4f79bc2e30
12 changed files with 778 additions and 421 deletions

View file

@ -16,6 +16,7 @@ libpowervr_compiler_files = files(
'pco_ir.c',
'pco_legalize.c',
'pco_nir.c',
'pco_nir_compute.c',
'pco_nir_pvfio.c',
'pco_nir_vk.c',
'pco_opt.c',

View file

@ -71,7 +71,7 @@ typedef struct _pco_fs_data {
/** PCO compute shader-specific data. */
typedef struct _pco_cs_data {
/**/
unsigned workgroup_size[3]; /** Workgroup size. */
} pco_cs_data;
/** PCO descriptor binding data. */

View file

@ -1531,6 +1531,7 @@ bool pco_end(pco_shader *shader);
bool pco_group_instrs(pco_shader *shader);
bool pco_index(pco_shader *shader, bool skip_ssa);
bool pco_legalize(pco_shader *shader);
bool pco_nir_compute_instance_check(nir_shader *shader);
bool pco_nir_lower_algebraic(nir_shader *shader);
bool pco_nir_lower_algebraic_late(nir_shader *shader);
bool pco_nir_lower_vk(nir_shader *shader, pco_common_data *common);

View file

@ -81,6 +81,9 @@ const nir_shader_compiler_options *pco_nir_options(void)
*/
void pco_preprocess_nir(pco_ctx *ctx, nir_shader *nir)
{
if (nir->info.stage == MESA_SHADER_COMPUTE)
NIR_PASS(_, nir, pco_nir_compute_instance_check);
if (nir->info.internal)
NIR_PASS(_, nir, nir_lower_returns);
@ -108,6 +111,15 @@ void pco_preprocess_nir(pco_ctx *ctx, nir_shader *nir)
NIR_PASS(_, nir, nir_lower_system_values);
if (nir->info.stage == MESA_SHADER_COMPUTE) {
NIR_PASS(_,
nir,
nir_lower_compute_system_values,
&(nir_lower_compute_system_values_options){
.lower_cs_local_id_to_index = true,
});
}
NIR_PASS(_,
nir,
nir_lower_io_vars_to_temporaries,
@ -396,6 +408,18 @@ static void gather_fs_data(nir_shader *nir, pco_data *data)
}
}
/**
 * \brief Gathers compute shader data.
 *
 * Copies the compile-time workgroup dimensions from the NIR shader info
 * into the compute-specific shader data.
 *
 * \param[in] nir NIR shader.
 * \param[in,out] data Shader data.
 */
static void gather_cs_data(nir_shader *nir, pco_data *data)
{
   data->cs.workgroup_size[0] = nir->info.workgroup_size[0];
   data->cs.workgroup_size[1] = nir->info.workgroup_size[1];
   data->cs.workgroup_size[2] = nir->info.workgroup_size[2];
}
/**
* \brief Checks whether a NIR intrinsic op is atomic.
*
@ -462,12 +486,16 @@ static void gather_data(nir_shader *nir, pco_data *data)
return gather_fs_data(nir, data);
case MESA_SHADER_VERTEX:
/* TODO */
break;
return;
case MESA_SHADER_COMPUTE:
return gather_cs_data(nir, data);
default:
UNREACHABLE("");
break;
}
UNREACHABLE("");
}
/**

View file

@ -0,0 +1,98 @@
/*
* Copyright © 2025 Imagination Technologies Ltd.
*
* SPDX-License-Identifier: MIT
*/
/**
* \file pco_nir_compute.c
*
* \brief PCO NIR compute-specific passes.
*/
#include "nir.h"
#include "nir_builder.h"
#include "pco.h"
#include "pco_builder.h"
#include "pco_internal.h"
#include "util/macros.h"
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#define INST_CHK_FUNC "@pco_inst_chk"
/**
 * \brief Inserts the instance check.
 *
 * Creates a new entrypoint (INST_CHK_FUNC) that calls the original
 * entrypoint only when the flat local invocation index is within the
 * flattened workgroup size; out-of-range instances simply return.
 *
 * \param[in,out] shader NIR shader.
 */
static void insert_instance_check(nir_shader *shader)
{
/* Get original entrypoint. */
nir_function *orig_entrypoint = nir_shader_get_entrypoint(shader)->function;
/* Create a function for the instance check which will serve as the new
* entrypoint.
*/
nir_function *inst_chk_func = nir_function_create(shader, INST_CHK_FUNC);
inst_chk_func->is_entrypoint = true;
orig_entrypoint->is_entrypoint = false;
nir_builder b = nir_builder_create(nir_function_impl_create(inst_chk_func));
b.cursor = nir_after_cf_list(&b.impl->body);
/* If the current instance index is not less than the total (flattened)
 * workgroup size, we don't execute.
 */
nir_def *local_size = nir_load_workgroup_size(&b);
nir_def *size_x = nir_channel(&b, local_size, 0);
nir_def *size_y = nir_channel(&b, local_size, 1);
nir_def *size_z = nir_channel(&b, local_size, 2);
/* Flattened workgroup size: x * y * z. */
nir_def *flat_size = nir_imul(&b, nir_imul(&b, size_x, size_y), size_z);
nir_def *flat_id = nir_load_local_invocation_index(&b);
/* NOTE(review): nir_ilt is a signed compare — assumes the flat id/size fit
 * in the non-negative signed 32-bit range; confirm against max workgroup
 * size limits.
 */
nir_def *cond_inst_valid = nir_ilt(&b, flat_id, flat_size);
nir_if *nif = nir_push_if(&b, cond_inst_valid);
{
/* Instance is in range: run the real shader body. */
nir_call(&b, orig_entrypoint);
}
nir_pop_if(&b, nif);
nir_jump(&b, nir_jump_return);
}
/**
 * \brief Inserts an instance check for compute shaders.
 *
 * Wraps the shader entrypoint so that instances outside the flattened
 * workgroup size do not execute, then re-inlines so the shader is left
 * with a single entrypoint again.
 *
 * \param[in,out] shader NIR shader.
 * \return True if the pass made progress.
 */
bool pco_nir_compute_instance_check(nir_shader *shader)
{
assert(shader->info.stage == MESA_SHADER_COMPUTE);
/* Internal (driver-generated) shaders are skipped — presumably they are
 * trusted not to need the bounds check; confirm with pass callers.
 */
if (shader->info.internal)
return false;
/* Check we haven't already done this. */
nir_foreach_function (function, shader) {
if (function->name && !strcmp(function->name, INST_CHK_FUNC))
return false;
}
insert_instance_check(shader);
/* Re-inline: lower returns and inline the original entrypoint into the
 * check function, then clean up so only the new entrypoint remains.
 */
NIR_PASS(_, shader, nir_lower_variable_initializers, nir_var_function_temp);
NIR_PASS(_, shader, nir_lower_returns);
NIR_PASS(_, shader, nir_inline_functions);
NIR_PASS(_, shader, nir_copy_prop);
NIR_PASS(_, shader, nir_opt_deref);
nir_remove_non_entrypoints(shader);
NIR_PASS(_, shader, nir_lower_variable_initializers, ~0);
return true;
}

View file

@ -630,8 +630,36 @@ static pco_instr *trans_atomic_buffer(trans_ctx *tctx,
UNREACHABLE("");
}
/**
 * \brief Maps a system value to the register class it is loaded from.
 *
 * \param[in] sys_val The system value being loaded.
 * \param[in] stage The shader stage.
 * \return The hardware register class holding the system value.
 */
static inline enum pco_reg_class sys_val_to_reg_class(gl_system_value sys_val,
                                                      mesa_shader_stage stage)
{
   /* Vertex shader system values all come in via vertex-input registers. */
   if (stage == MESA_SHADER_VERTEX)
      return PCO_REG_CLASS_VTXIN;

   if (stage == MESA_SHADER_COMPUTE) {
      if (sys_val == SYSTEM_VALUE_LOCAL_INVOCATION_INDEX)
         return PCO_REG_CLASS_VTXIN;

      /* Workgroup id / count are DOUTW'd into coefficient registers. */
      if (sys_val == SYSTEM_VALUE_WORKGROUP_ID ||
          sys_val == SYSTEM_VALUE_NUM_WORKGROUPS)
         return PCO_REG_CLASS_COEFF;
   }

   UNREACHABLE("");
}
/**
* \brief Translates a NIR vs load system value intrinsic into PCO.
* \brief Translates a NIR load system value intrinsic into PCO.
*
* \param[in,out] tctx Translation context.
* \param[in] intr System value intrinsic.
@ -639,7 +667,7 @@ static pco_instr *trans_atomic_buffer(trans_ctx *tctx,
* \return The translated PCO instruction.
*/
static pco_instr *
trans_load_sysval_vs(trans_ctx *tctx, nir_intrinsic_instr *intr, pco_ref dest)
trans_load_sysval(trans_ctx *tctx, nir_intrinsic_instr *intr, pco_ref dest)
{
gl_system_value sys_val = nir_system_value_from_intrinsic(intr->intrinsic);
const pco_range *range = &tctx->shader->data.common.sys_vals[sys_val];
@ -647,7 +675,9 @@ trans_load_sysval_vs(trans_ctx *tctx, nir_intrinsic_instr *intr, pco_ref dest)
unsigned chans = pco_ref_get_chans(dest);
assert(chans == range->count);
pco_ref src = pco_ref_hwreg_vec(range->start, PCO_REG_CLASS_VTXIN, chans);
pco_ref src = pco_ref_hwreg_vec(range->start,
sys_val_to_reg_class(sys_val, tctx->stage),
chans);
return pco_mov(&tctx->b, dest, src, .rpt = chans);
}
@ -702,12 +732,18 @@ static pco_instr *trans_intr(trans_ctx *tctx, nir_intrinsic_instr *intr)
instr = trans_atomic_buffer(tctx, intr, dest, src[1], src[2]);
break;
/* Vertex sysvals. */
case nir_intrinsic_load_vertex_id:
case nir_intrinsic_load_instance_id:
case nir_intrinsic_load_base_instance:
case nir_intrinsic_load_base_vertex:
case nir_intrinsic_load_draw_id:
instr = trans_load_sysval_vs(tctx, intr, dest);
/* Compute sysvals. */
case nir_intrinsic_load_local_invocation_index:
case nir_intrinsic_load_workgroup_id:
case nir_intrinsic_load_num_workgroups:
instr = trans_load_sysval(tctx, intr, dest);
break;
case nir_intrinsic_ddx:

View file

@ -57,7 +57,7 @@
*/
#define PVR_PDS_CDM_WORK_GROUP_ID_X 0
#define PVR_PDS_CDM_WORK_GROUP_ID_Y 1
#define PVR_PDS_CDM_WORK_GROUP_ID_Z 2
#define PVR_PDS_CDM_WORK_GROUP_ID_Z 3
/* Local IDs are available in every task. */
#define PVR_PDS_CDM_LOCAL_ID_X 0
#define PVR_PDS_CDM_LOCAL_ID_YZ 1
@ -91,6 +91,12 @@ static const uint32_t cache_control_const[2][2] = {
{ 0, 0 }
};
static const uint32_t wg_id_temps[3] = {
PVR_PDS_CDM_WORK_GROUP_ID_X,
PVR_PDS_CDM_WORK_GROUP_ID_Y,
PVR_PDS_CDM_WORK_GROUP_ID_Z,
};
/*****************************************************************************
Function definitions
*****************************************************************************/
@ -1768,6 +1774,7 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program,
* DOUTW for local, and two for global.
*/
uint32_t work_group_id_ctrl_words[2] = { 0 };
uint32_t num_work_groups_ctrl_words[2] = { 0 };
uint32_t local_id_ctrl_word = 0;
uint32_t local_input_register;
@ -1795,6 +1802,42 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program,
uint32_t next_constant = PVR_PDS_CONSTANTS_BLOCK_BASE;
const bool has_local_input_regs =
(program->local_input_regs[0] != PVR_PDS_REG_UNUSED) ||
(program->local_input_regs[1] != PVR_PDS_REG_UNUSED) ||
(program->local_input_regs[2] != PVR_PDS_REG_UNUSED);
const bool has_local_input_reg[3] = {
[0] = program->local_input_regs[0] != PVR_PDS_REG_UNUSED,
[1] = program->local_input_regs[1] != PVR_PDS_REG_UNUSED,
[2] = program->local_input_regs[2] != PVR_PDS_REG_UNUSED,
};
const bool has_work_group_input_regs =
(program->work_group_input_regs[0] != PVR_PDS_REG_UNUSED) ||
(program->work_group_input_regs[1] != PVR_PDS_REG_UNUSED) ||
(program->work_group_input_regs[2] != PVR_PDS_REG_UNUSED);
const bool has_work_group_input_reg[3] = {
[0] = program->work_group_input_regs[0] != PVR_PDS_REG_UNUSED,
[1] = program->work_group_input_regs[1] != PVR_PDS_REG_UNUSED,
[2] = program->work_group_input_regs[2] != PVR_PDS_REG_UNUSED,
};
const bool has_num_work_groups_regs =
(program->num_work_groups_regs[0] != PVR_PDS_REG_UNUSED) ||
(program->num_work_groups_regs[1] != PVR_PDS_REG_UNUSED) ||
(program->num_work_groups_regs[2] != PVR_PDS_REG_UNUSED);
const bool has_num_work_groups_reg[3] = {
[0] = program->num_work_groups_regs[0] != PVR_PDS_REG_UNUSED,
[1] = program->num_work_groups_regs[1] != PVR_PDS_REG_UNUSED,
[2] = program->num_work_groups_regs[2] != PVR_PDS_REG_UNUSED,
};
const bool has_barrier_coefficient = program->barrier_coefficient !=
PVR_PDS_REG_UNUSED;
if (program->kick_usc) {
/* Copy the USC task control words to constants. */
usc_control_constant64 =
@ -1822,13 +1865,12 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program,
program->cond_render_pred_temp = cond_render_pred_temp;
}
if ((program->barrier_coefficient != PVR_PDS_REG_UNUSED) ||
(program->clear_pds_barrier) ||
if (has_barrier_coefficient || program->clear_pds_barrier ||
(program->kick_usc && program->conditional_render)) {
zero_constant64 = pvr_pds_get_constants(&next_constant, 2, &data_size);
}
if (program->barrier_coefficient != PVR_PDS_REG_UNUSED) {
if (has_barrier_coefficient) {
barrier_ctrl_word = pvr_pds_get_constants(&next_constant, 1, &data_size);
if (PVR_HAS_QUIRK(dev_info, 51210)) {
barrier_ctrl_word2 =
@ -1836,35 +1878,51 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program,
}
}
if (program->work_group_input_regs[0] != PVR_PDS_REG_UNUSED ||
program->work_group_input_regs[1] != PVR_PDS_REG_UNUSED) {
/* For DOUTW */
if (has_work_group_input_reg[0] || has_work_group_input_reg[1]) {
work_group_id_ctrl_words[0] =
pvr_pds_get_constants(&next_constant, 1, &data_size);
}
if (program->work_group_input_regs[2] != PVR_PDS_REG_UNUSED) {
if (has_work_group_input_reg[2]) {
work_group_id_ctrl_words[1] =
pvr_pds_get_constants(&next_constant, 1, &data_size);
}
if ((program->local_input_regs[0] != PVR_PDS_REG_UNUSED) ||
(program->local_input_regs[1] != PVR_PDS_REG_UNUSED) ||
(program->local_input_regs[2] != PVR_PDS_REG_UNUSED)) {
/* For DOUTW */
if (has_num_work_groups_reg[0] || has_num_work_groups_reg[1]) {
num_work_groups_ctrl_words[0] =
pvr_pds_get_constants(&next_constant, 1, &data_size);
}
if (has_num_work_groups_reg[2]) {
num_work_groups_ctrl_words[1] =
pvr_pds_get_constants(&next_constant, 1, &data_size);
}
/* For DOUTW */
if (has_local_input_regs) {
local_id_ctrl_word = pvr_pds_get_constants(&next_constant, 1, &data_size);
}
if (program->add_base_workgroup) {
for (uint32_t workgroup_component = 0; workgroup_component < 3;
workgroup_component++) {
if (program->work_group_input_regs[workgroup_component] !=
PVR_PDS_REG_UNUSED) {
program
->base_workgroup_constant_offset_in_dwords[workgroup_component] =
pvr_pds_get_constants(&next_constant, 1, &data_size);
}
/* Patch constants. */
for (uint32_t wg_comp = 0; wg_comp < 3; ++wg_comp) {
if (has_work_group_input_reg[wg_comp]) {
program->base_workgroup_constant_offset_in_dwords[wg_comp] =
pvr_pds_get_constants(&next_constant, 1, &data_size);
}
}
if (has_num_work_groups_regs) {
/* Ensure 64-bit alignment. */
program->num_workgroups_constant_offset_in_dwords[0] =
pvr_pds_get_constants(&next_constant, 2, &data_size);
program->num_workgroups_constant_offset_in_dwords[1] =
program->num_workgroups_constant_offset_in_dwords[0];
program->num_workgroups_constant_offset_in_dwords[2] =
pvr_pds_get_constants(&next_constant, 1, &data_size);
}
if (gen_mode == PDS_GENERATE_DATA_SEGMENT) {
if (program->kick_usc) {
/* Src0 for DOUTU */
@ -1883,15 +1941,14 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program,
program->usc_task_control_coeff_update.src0); /* 64-bit Src0 */
}
if ((program->barrier_coefficient != PVR_PDS_REG_UNUSED) ||
(program->clear_pds_barrier) ||
if (has_barrier_coefficient || program->clear_pds_barrier ||
(program->kick_usc && program->conditional_render)) {
pvr_pds_write_wide_constant(buffer, zero_constant64, 0); /* 64-bit
* Src0
*/
}
if (program->barrier_coefficient != PVR_PDS_REG_UNUSED) {
if (has_barrier_coefficient) {
if (PVR_HAS_QUIRK(dev_info, 51210)) {
/* Write the constant for the coefficient register write. */
doutw = pvr_pds_encode_doutw_src1(
@ -1913,18 +1970,16 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program,
/* Check whether the barrier is going to be the last DOUTW done by
* the coefficient sync task.
*/
if ((program->work_group_input_regs[0] == PVR_PDS_REG_UNUSED) &&
(program->work_group_input_regs[1] == PVR_PDS_REG_UNUSED) &&
(program->work_group_input_regs[2] == PVR_PDS_REG_UNUSED)) {
if (!has_work_group_input_regs && !has_num_work_groups_regs)
doutw |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN;
}
pvr_pds_write_constant32(buffer, barrier_ctrl_word, doutw);
}
/**/
/* If we want work-group id X, see if we also want work-group id Y. */
if (program->work_group_input_regs[0] != PVR_PDS_REG_UNUSED &&
program->work_group_input_regs[1] != PVR_PDS_REG_UNUSED) {
if (has_work_group_input_reg[0] && has_work_group_input_reg[1]) {
/* Make sure we are going to DOUTW them into adjacent registers
* otherwise we can't do it in one.
*/
@ -1940,14 +1995,14 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program,
/* If we don't want the Z work-group id then this is the last one.
*/
if (program->work_group_input_regs[2] == PVR_PDS_REG_UNUSED)
if (!has_work_group_input_reg[2] && !has_num_work_groups_regs)
doutw |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN;
pvr_pds_write_constant32(buffer, work_group_id_ctrl_words[0], doutw);
}
/* If we only want one of X or Y then handle them separately. */
else {
if (program->work_group_input_regs[0] != PVR_PDS_REG_UNUSED) {
if (has_work_group_input_reg[0]) {
doutw = pvr_pds_encode_doutw_src1(
program->work_group_input_regs[0],
PVR_PDS_DOUTW_LOWER32,
@ -1958,13 +2013,13 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program,
/* If we don't want the Z work-group id then this is the last
* one.
*/
if (program->work_group_input_regs[2] == PVR_PDS_REG_UNUSED)
if (!has_work_group_input_reg[2] && !has_num_work_groups_regs)
doutw |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN;
pvr_pds_write_constant32(buffer,
work_group_id_ctrl_words[0],
doutw);
} else if (program->work_group_input_regs[1] != PVR_PDS_REG_UNUSED) {
} else if (has_work_group_input_reg[1]) {
doutw = pvr_pds_encode_doutw_src1(
program->work_group_input_regs[1],
PVR_PDS_DOUTW_UPPER32,
@ -1975,7 +2030,7 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program,
/* If we don't want the Z work-group id then this is the last
* one.
*/
if (program->work_group_input_regs[2] == PVR_PDS_REG_UNUSED)
if (!has_work_group_input_reg[2] && !has_num_work_groups_regs)
doutw |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN;
pvr_pds_write_constant32(buffer,
@ -1985,35 +2040,111 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program,
}
/* Handle work-group id Z. */
if (program->work_group_input_regs[2] != PVR_PDS_REG_UNUSED) {
if (has_work_group_input_reg[2]) {
doutw = pvr_pds_encode_doutw_src1(
program->work_group_input_regs[2],
PVR_PDS_DOUTW_UPPER32,
PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_DEST_COMMON_STORE,
true,
dev_info);
if (!has_num_work_groups_regs)
doutw |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN;
pvr_pds_write_constant32(buffer, work_group_id_ctrl_words[1], doutw);
}
/**/
/* If we want num work-groups X, see if we also want num work-groups Y. */
if (has_num_work_groups_reg[0] && has_num_work_groups_reg[1]) {
/* Make sure we are going to DOUTW them into adjacent registers
* otherwise we can't do it in one.
*/
assert(program->num_work_groups_regs[1] ==
(program->num_work_groups_regs[0] + 1));
doutw = pvr_pds_encode_doutw_src1(
program->num_work_groups_regs[0],
PVR_PDS_DOUTW_LOWER64,
PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_DEST_COMMON_STORE,
true,
dev_info);
/* If we don't want num work-groups Z then this is the last one.
*/
if (!has_num_work_groups_reg[2])
doutw |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN;
pvr_pds_write_constant32(buffer, num_work_groups_ctrl_words[0], doutw);
}
/* If we only want one of X or Y then handle them separately. */
else {
if (has_num_work_groups_reg[0]) {
doutw = pvr_pds_encode_doutw_src1(
program->num_work_groups_regs[0],
PVR_PDS_DOUTW_LOWER32,
PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_DEST_COMMON_STORE,
true,
dev_info);
/* If we don't want num work-groups Z then this is the last
* one.
*/
if (!has_num_work_groups_reg[2])
doutw |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN;
pvr_pds_write_constant32(buffer,
num_work_groups_ctrl_words[0],
doutw);
} else if (has_num_work_groups_reg[1]) {
doutw = pvr_pds_encode_doutw_src1(
program->num_work_groups_regs[1],
PVR_PDS_DOUTW_UPPER32,
PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_DEST_COMMON_STORE,
true,
dev_info);
/* If we don't want num work-groups Z then this is the last
* one.
*/
if (!has_num_work_groups_reg[2])
doutw |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN;
pvr_pds_write_constant32(buffer,
num_work_groups_ctrl_words[0],
doutw);
}
}
/* Handle num work-groups Z. */
if (has_num_work_groups_reg[2]) {
doutw = pvr_pds_encode_doutw_src1(
program->num_work_groups_regs[2],
PVR_PDS_DOUTW_LOWER32,
PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_DEST_COMMON_STORE |
PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN,
true,
dev_info);
pvr_pds_write_constant32(buffer, work_group_id_ctrl_words[1], doutw);
pvr_pds_write_constant32(buffer, num_work_groups_ctrl_words[1], doutw);
}
/* Handle the local IDs. */
if ((program->local_input_regs[1] != PVR_PDS_REG_UNUSED) ||
(program->local_input_regs[2] != PVR_PDS_REG_UNUSED)) {
if (has_local_input_reg[1] || has_local_input_reg[2]) {
uint32_t dest_reg;
/* If we want local id Y and Z make sure the compiler wants them in
* the same register.
*/
if (!program->flattened_work_groups) {
if ((program->local_input_regs[1] != PVR_PDS_REG_UNUSED) &&
(program->local_input_regs[2] != PVR_PDS_REG_UNUSED)) {
if (has_local_input_reg[1] && has_local_input_reg[2]) {
assert(program->local_input_regs[1] ==
program->local_input_regs[2]);
}
}
if (program->local_input_regs[1] != PVR_PDS_REG_UNUSED)
if (has_local_input_reg[1])
dest_reg = program->local_input_regs[1];
else
dest_reg = program->local_input_regs[2];
@ -2021,7 +2152,7 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program,
/* If we want local id X and (Y or Z) then we can do that in a
* single 64-bit DOUTW.
*/
if (program->local_input_regs[0] != PVR_PDS_REG_UNUSED) {
if (has_local_input_reg[0]) {
assert(dest_reg == (program->local_input_regs[0] + 1));
doutw = pvr_pds_encode_doutw_src1(
@ -2052,7 +2183,7 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program,
}
/* If we don't want Y or Z then just DMA in X in a single 32-bit DOUTW.
*/
else if (program->local_input_regs[0] != PVR_PDS_REG_UNUSED) {
else if (has_local_input_reg[0]) {
doutw = pvr_pds_encode_doutw_src1(
program->local_input_regs[0],
PVR_PDS_DOUTW_LOWER32,
@ -2090,7 +2221,7 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program,
program->coeff_update_task_branch_size /* ADDR */));
/* Do we need to initialize the barrier coefficient? */
if (program->barrier_coefficient != PVR_PDS_REG_UNUSED) {
if (has_barrier_coefficient) {
if (PVR_HAS_QUIRK(dev_info, 51210)) {
/* Initialize the second barrier coefficient registers to zero.
*/
@ -2106,51 +2237,42 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program,
zero_constant64 >> 1)); /* SRC0 */
}
if (program->add_base_workgroup) {
const uint32_t temp_values[3] = { 0, 1, 3 };
for (uint32_t workgroup_component = 0; workgroup_component < 3;
workgroup_component++) {
if (program->work_group_input_regs[workgroup_component] ==
PVR_PDS_REG_UNUSED) {
continue;
}
/* Add base workgroup to workgroup ids. */
for (uint32_t wg_comp = 0; wg_comp < 3; ++wg_comp) {
if (!has_work_group_input_reg[wg_comp])
continue;
APPEND(pvr_pds_inst_encode_add32(
/* cc */ 0x0,
/* ALUM */ 0,
/* SNA */ 0,
/* SRC0 (R32)*/ PVR_ROGUE_PDSINST_REGS32_CONST32_LOWER +
program->base_workgroup_constant_offset_in_dwords
[workgroup_component],
/* SRC1 (R32)*/ PVR_ROGUE_PDSINST_REGS32_TEMP32_LOWER +
PVR_PDS_CDM_WORK_GROUP_ID_X +
temp_values[workgroup_component],
/* DST (R32TP)*/ PVR_ROGUE_PDSINST_REGS32TP_TEMP32_LOWER +
PVR_PDS_CDM_WORK_GROUP_ID_X +
temp_values[workgroup_component]));
}
APPEND(pvr_pds_inst_encode_add32(
/* cc */ 0x0,
/* ALUM */ 0,
/* SNA */ 0,
/* SRC0 (R32)*/ PVR_ROGUE_PDSINST_REGS32_CONST32_LOWER +
program->base_workgroup_constant_offset_in_dwords[wg_comp],
/* SRC1 (R32)*/ PVR_ROGUE_PDSINST_REGS32_TEMP32_LOWER +
wg_id_temps[wg_comp],
/* DST (R32TP)*/ PVR_ROGUE_PDSINST_REGS32TP_TEMP32_LOWER +
wg_id_temps[wg_comp]));
}
/* If we are going to put the work-group IDs in coefficients then we
* just need to do the DOUTWs.
*/
if ((program->work_group_input_regs[0] != PVR_PDS_REG_UNUSED) ||
(program->work_group_input_regs[1] != PVR_PDS_REG_UNUSED)) {
uint32_t dest_reg;
if (has_work_group_input_reg[0] || has_work_group_input_reg[1]) {
uint32_t src_reg;
if (program->work_group_input_regs[0] != PVR_PDS_REG_UNUSED)
dest_reg = PVR_PDS_TEMPS_BLOCK_BASE + PVR_PDS_CDM_WORK_GROUP_ID_X;
if (has_work_group_input_reg[0])
src_reg = PVR_PDS_TEMPS_BLOCK_BASE + PVR_PDS_CDM_WORK_GROUP_ID_X;
else
dest_reg = PVR_PDS_TEMPS_BLOCK_BASE + PVR_PDS_CDM_WORK_GROUP_ID_Y;
src_reg = PVR_PDS_TEMPS_BLOCK_BASE + PVR_PDS_CDM_WORK_GROUP_ID_Y;
APPEND(pvr_pds_encode_doutw64(0, /* cc */
0, /* END */
work_group_id_ctrl_words[0], /* SRC1
*/
dest_reg >> 1)); /* SRC0 */
src_reg >> 1)); /* SRC0 */
}
if (program->work_group_input_regs[2] != PVR_PDS_REG_UNUSED) {
if (has_work_group_input_reg[2]) {
APPEND(pvr_pds_encode_doutw64(
0, /* cc */
0, /* END */
@ -2159,6 +2281,35 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program,
1)); /* SRC0 */
}
/* If we are going to put the num work-groups in coefficients then we
* just need to do the DOUTWs.
*/
if (has_num_work_groups_reg[0] || has_num_work_groups_reg[1]) {
uint32_t src_reg;
if (has_num_work_groups_reg[0])
src_reg = PVR_PDS_CONSTANTS_BLOCK_BASE +
program->num_workgroups_constant_offset_in_dwords[0];
else
src_reg = PVR_PDS_CONSTANTS_BLOCK_BASE +
program->num_workgroups_constant_offset_in_dwords[1];
APPEND(pvr_pds_encode_doutw64(0, /* cc */
0, /* END */
num_work_groups_ctrl_words[0], /* SRC1 */
src_reg >> 1)); /* SRC0 */
}
if (has_num_work_groups_reg[2]) {
APPEND(pvr_pds_encode_doutw64(
0, /* cc */
0, /* END */
num_work_groups_ctrl_words[1], /* SRC1 */
(PVR_PDS_CONSTANTS_BLOCK_BASE +
program->num_workgroups_constant_offset_in_dwords[2]) >>
1)); /* SRC0 */
}
/* Issue the task to the USC. */
if (program->kick_usc && program->has_coefficient_update_task) {
APPEND(pvr_pds_encode_doutu(0, /* cc */
@ -2176,28 +2327,24 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program,
/* DOUTW in the local IDs. */
/* If we want X and Y or Z, we only need one DOUTW. */
if ((program->local_input_regs[0] != PVR_PDS_REG_UNUSED) &&
((program->local_input_regs[1] != PVR_PDS_REG_UNUSED) ||
(program->local_input_regs[2] != PVR_PDS_REG_UNUSED))) {
if (has_local_input_reg[0] &&
(has_local_input_reg[1] || has_local_input_reg[2])) {
local_input_register =
PVR_PDS_TEMPS_BLOCK_BASE + PVR_PDS_CDM_LOCAL_ID_X;
} else {
/* If we just want X. */
if (program->local_input_regs[0] != PVR_PDS_REG_UNUSED) {
if (has_local_input_reg[0]) {
local_input_register =
PVR_PDS_TEMPS_BLOCK_BASE + PVR_PDS_CDM_LOCAL_ID_X;
}
/* If we just want Y or Z. */
else if (program->local_input_regs[1] != PVR_PDS_REG_UNUSED ||
program->local_input_regs[2] != PVR_PDS_REG_UNUSED) {
else if (has_local_input_reg[1] || has_local_input_reg[2]) {
local_input_register =
PVR_PDS_TEMPS_BLOCK_BASE + PVR_PDS_CDM_LOCAL_ID_YZ;
}
}
if ((program->local_input_regs[0] != PVR_PDS_REG_UNUSED) ||
(program->local_input_regs[1] != PVR_PDS_REG_UNUSED) ||
(program->local_input_regs[2] != PVR_PDS_REG_UNUSED)) {
if (has_local_input_regs) {
APPEND(pvr_pds_encode_doutw64(0, /* cc */
0, /* END */
local_id_ctrl_word, /* SRC1 */

View file

@ -539,6 +539,7 @@ struct pvr_pds_compute_shader_program {
uint32_t local_input_regs[3];
uint32_t work_group_input_regs[3];
uint32_t num_work_groups_regs[3];
uint32_t global_input_regs[3];
uint32_t barrier_coefficient;
@ -553,8 +554,8 @@ struct pvr_pds_compute_shader_program {
uint32_t coeff_update_task_branch_size;
bool add_base_workgroup;
uint32_t base_workgroup_constant_offset_in_dwords[3];
uint32_t num_workgroups_constant_offset_in_dwords[3];
bool kick_usc;
@ -585,6 +586,11 @@ static inline void pvr_pds_compute_shader_program_init(
PVR_PDS_REG_UNUSED,
PVR_PDS_REG_UNUSED,
},
.num_work_groups_regs = {
PVR_PDS_REG_UNUSED,
PVR_PDS_REG_UNUSED,
PVR_PDS_REG_UNUSED,
},
.global_input_regs = {
PVR_PDS_REG_UNUSED,
PVR_PDS_REG_UNUSED,

View file

@ -2609,26 +2609,37 @@ void pvr_CmdBindDescriptorSets2KHR(
PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS) {
struct pvr_descriptor_state *desc_state =
&cmd_buffer->state.gfx_desc_state;
struct pvr_descriptor_state *graphics_desc_state =
&cmd_buffer->state.gfx_desc_state;
struct pvr_descriptor_state *compute_desc_state =
&cmd_buffer->state.compute_desc_state;
for (unsigned u = 0; u < pBindDescriptorSetsInfo->descriptorSetCount;
++u) {
VK_FROM_HANDLE(pvr_descriptor_set,
set,
pBindDescriptorSetsInfo->pDescriptorSets[u]);
unsigned desc_set = u + pBindDescriptorSetsInfo->firstSet;
for (unsigned u = 0; u < pBindDescriptorSetsInfo->descriptorSetCount; ++u) {
VK_FROM_HANDLE(pvr_descriptor_set,
set,
pBindDescriptorSetsInfo->pDescriptorSets[u]);
unsigned desc_set = u + pBindDescriptorSetsInfo->firstSet;
if (desc_state->sets[desc_set] != set) {
desc_state->sets[desc_set] = set;
desc_state->dirty_sets |= BITFIELD_BIT(desc_set);
if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS) {
if (graphics_desc_state->sets[desc_set] != set) {
graphics_desc_state->sets[desc_set] = set;
graphics_desc_state->dirty_sets |= BITFIELD_BIT(desc_set);
}
}
cmd_buffer->state.dirty.gfx_desc_dirty = true;
if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) {
if (compute_desc_state->sets[desc_set] != set) {
compute_desc_state->sets[desc_set] = set;
compute_desc_state->dirty_sets |= BITFIELD_BIT(desc_set);
}
}
}
assert(!(pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT));
if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS)
cmd_buffer->state.dirty.gfx_desc_dirty = true;
if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT)
cmd_buffer->state.dirty.compute_desc_dirty = true;
}
void pvr_CmdBindVertexBuffers(VkCommandBuffer commandBuffer,
@ -3600,8 +3611,7 @@ static void pvr_compute_update_shared(struct pvr_cmd_buffer *cmd_buffer,
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
struct pvr_csb *csb = &sub_cmd->control_stream;
const struct pvr_compute_pipeline *pipeline = state->compute_pipeline;
const uint32_t const_shared_regs =
pipeline->shader_state.const_shared_reg_count;
const uint32_t const_shared_regs = pipeline->cs_data.common.shareds;
struct pvr_compute_kernel_info info;
/* No shared regs, no need to use an allocation kernel. */
@ -3624,7 +3634,7 @@ static void pvr_compute_update_shared(struct pvr_cmd_buffer *cmd_buffer,
.usc_target = ROGUE_CDMCTRL_USC_TARGET_ALL,
.usc_common_shared = true,
.usc_common_size =
DIV_ROUND_UP(const_shared_regs,
DIV_ROUND_UP(PVR_DW_TO_BYTES(const_shared_regs),
ROGUE_CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE),
.global_size = { 1, 1, 1 },
@ -3748,8 +3758,6 @@ void pvr_compute_update_kernel_private(
const uint32_t global_workgroup_size[static const PVR_WORKGROUP_DIMENSIONS])
{
const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
const struct pvr_device_runtime_info *dev_runtime_info =
&pdevice->dev_runtime_info;
struct pvr_csb *csb = &sub_cmd->control_stream;
struct pvr_compute_kernel_info info = {
@ -3783,15 +3791,8 @@ void pvr_compute_update_kernel_private(
uint32_t work_size = pipeline->workgroup_size.width *
pipeline->workgroup_size.height *
pipeline->workgroup_size.depth;
uint32_t coeff_regs;
if (work_size > ROGUE_MAX_INSTANCES_PER_TASK) {
/* Enforce a single workgroup per cluster through allocation starvation.
*/
coeff_regs = dev_runtime_info->cdm_max_local_mem_size_regs;
} else {
coeff_regs = pipeline->coeff_regs_count;
}
uint32_t coeff_regs =
pipeline->coeff_regs_count + pipeline->const_shared_regs_count;
info.usc_common_size =
DIV_ROUND_UP(PVR_DW_TO_BYTES(coeff_regs),
@ -3800,8 +3801,6 @@ void pvr_compute_update_kernel_private(
/* Use a whole slot per workgroup. */
work_size = MAX2(work_size, ROGUE_MAX_INSTANCES_PER_TASK);
coeff_regs += pipeline->const_shared_regs_count;
if (pipeline->const_shared_regs_count > 0)
info.sd_type = ROGUE_CDMCTRL_SD_TYPE_USC;
@ -3818,24 +3817,53 @@ void pvr_compute_update_kernel_private(
pvr_compute_generate_control_stream(csb, sub_cmd, &info);
}
/* TODO: Wire up the base_workgroup variant program when implementing
* VK_KHR_device_group. The values will also need patching into the program.
*/
static void pvr_compute_update_kernel(
struct pvr_cmd_buffer *cmd_buffer,
struct pvr_sub_cmd_compute *const sub_cmd,
pvr_dev_addr_t indirect_addr,
const uint32_t global_base_group[static const PVR_WORKGROUP_DIMENSIONS],
const uint32_t global_workgroup_size[static const PVR_WORKGROUP_DIMENSIONS])
{
const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
const struct pvr_device_runtime_info *dev_runtime_info =
&pdevice->dev_runtime_info;
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
struct pvr_csb *csb = &sub_cmd->control_stream;
const struct pvr_compute_pipeline *pipeline = state->compute_pipeline;
const struct pvr_compute_shader_state *shader_state =
&pipeline->shader_state;
const struct pvr_pds_info *program_info = &pipeline->primary_program_info;
const pco_data *const cs_data = &pipeline->cs_data;
const struct pvr_pds_info *program_info = &pipeline->pds_cs_program_info;
bool uses_wg_id = pipeline->base_workgroup_data_patching_offset != ~0u;
bool uses_num_wgs = pipeline->num_workgroups_data_patching_offset != ~0u;
bool base_group_set = !!global_base_group[0] || !!global_base_group[1] ||
!!global_base_group[2];
uint32_t pds_data_offset = pipeline->pds_cs_program.data_offset;
/* Does the PDS data segment need patching, or can the default be used? */
if ((uses_wg_id && base_group_set) || uses_num_wgs) {
struct pvr_pds_upload pds_data_upload;
uint32_t *pds_data;
/* Upload and patch PDS data segment. */
pvr_cmd_buffer_upload_pds_data(cmd_buffer,
pipeline->pds_cs_data_section,
program_info->data_size_in_dwords,
16,
&pds_data_upload);
pds_data_offset = pds_data_upload.data_offset;
pds_data = pvr_bo_suballoc_get_map_addr(pds_data_upload.pvr_bo);
if (uses_wg_id && base_group_set) {
unsigned offset = pipeline->base_workgroup_data_patching_offset;
for (unsigned u = 0; u < PVR_WORKGROUP_DIMENSIONS; ++u) {
pds_data[offset + u] = global_base_group[u];
}
}
if (uses_num_wgs) {
unsigned offset = pipeline->num_workgroups_data_patching_offset;
for (unsigned u = 0; u < PVR_WORKGROUP_DIMENSIONS; ++u) {
pds_data[offset + u] = global_workgroup_size[u];
}
}
}
struct pvr_compute_kernel_info info = {
.indirect_buffer_addr = indirect_addr,
@ -3847,13 +3875,13 @@ static void pvr_compute_update_kernel(
.pds_data_size =
DIV_ROUND_UP(PVR_DW_TO_BYTES(program_info->data_size_in_dwords),
ROGUE_CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE),
.pds_data_offset = pipeline->primary_program.data_offset,
.pds_code_offset = pipeline->primary_program.code_offset,
.pds_data_offset = pds_data_offset,
.pds_code_offset = pipeline->pds_cs_program.code_offset,
.sd_type = ROGUE_CDMCTRL_SD_TYPE_NONE,
.usc_unified_size =
DIV_ROUND_UP(shader_state->input_register_count << 2U,
DIV_ROUND_UP(cs_data->common.vtxins << 2U,
ROGUE_CDMCTRL_KERNEL0_USC_UNIFIED_SIZE_UNIT_SIZE),
/* clang-format off */
@ -3865,16 +3893,10 @@ static void pvr_compute_update_kernel(
/* clang-format on */
};
uint32_t work_size = shader_state->work_size;
uint32_t coeff_regs;
if (work_size > ROGUE_MAX_INSTANCES_PER_TASK) {
/* Enforce a single workgroup per cluster through allocation starvation.
*/
coeff_regs = dev_runtime_info->cdm_max_local_mem_size_regs;
} else {
coeff_regs = shader_state->coefficient_register_count;
}
uint32_t work_size = cs_data->cs.workgroup_size[0] *
cs_data->cs.workgroup_size[1] *
cs_data->cs.workgroup_size[2];
uint32_t coeff_regs = cs_data->common.coeffs + cs_data->common.shareds;
info.usc_common_size =
DIV_ROUND_UP(PVR_DW_TO_BYTES(coeff_regs),
@ -3883,9 +3905,7 @@ static void pvr_compute_update_kernel(
/* Use a whole slot per workgroup. */
work_size = MAX2(work_size, ROGUE_MAX_INSTANCES_PER_TASK);
coeff_regs += shader_state->const_shared_reg_count;
if (shader_state->const_shared_reg_count > 0)
if (cs_data->common.shareds > 0)
info.sd_type = ROGUE_CDMCTRL_SD_TYPE_USC;
work_size =
@ -3947,19 +3967,21 @@ static VkResult pvr_cmd_upload_push_consts(struct pvr_cmd_buffer *cmd_buffer)
static void pvr_cmd_dispatch(
struct pvr_cmd_buffer *const cmd_buffer,
const pvr_dev_addr_t indirect_addr,
const uint32_t base_group[static const PVR_WORKGROUP_DIMENSIONS],
const uint32_t workgroup_size[static const PVR_WORKGROUP_DIMENSIONS])
{
struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
const struct pvr_compute_pipeline *compute_pipeline =
state->compute_pipeline;
const pco_data *const cs_data = &compute_pipeline->cs_data;
struct pvr_sub_cmd_compute *sub_cmd;
VkResult result;
pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_COMPUTE);
sub_cmd = &state->current_sub_cmd->compute;
sub_cmd->uses_atomic_ops |= compute_pipeline->shader_state.uses_atomic_ops;
sub_cmd->uses_barrier |= compute_pipeline->shader_state.uses_barrier;
sub_cmd->uses_atomic_ops |= cs_data->common.uses.atomics;
sub_cmd->uses_barrier |= cs_data->common.uses.barriers;
if (state->push_constants.dirty_stages & VK_SHADER_STAGE_COMPUTE_BIT) {
result = pvr_cmd_upload_push_consts(cmd_buffer);
@ -3972,16 +3994,33 @@ static void pvr_cmd_dispatch(
state->push_constants.dirty_stages &= ~VK_SHADER_STAGE_COMPUTE_BIT;
}
UNREACHABLE("compute descriptor support");
if (state->dirty.compute_desc_dirty ||
state->dirty.compute_pipeline_binding) {
result = pvr_setup_descriptor_mappings(
cmd_buffer,
PVR_STAGE_ALLOCATION_COMPUTE,
&compute_pipeline->descriptor_state,
NULL,
&state->pds_compute_descriptor_data_offset);
if (result != VK_SUCCESS)
return;
}
pvr_compute_update_shared(cmd_buffer, sub_cmd);
pvr_compute_update_kernel(cmd_buffer, sub_cmd, indirect_addr, workgroup_size);
pvr_compute_update_kernel(cmd_buffer,
sub_cmd,
indirect_addr,
base_group,
workgroup_size);
}
void pvr_CmdDispatch(VkCommandBuffer commandBuffer,
uint32_t groupCountX,
uint32_t groupCountY,
uint32_t groupCountZ)
void pvr_CmdDispatchBase(VkCommandBuffer commandBuffer,
uint32_t baseGroupX,
uint32_t baseGroupY,
uint32_t baseGroupZ,
uint32_t groupCountX,
uint32_t groupCountY,
uint32_t groupCountZ)
{
PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
@ -3992,6 +4031,7 @@ void pvr_CmdDispatch(VkCommandBuffer commandBuffer,
pvr_cmd_dispatch(cmd_buffer,
PVR_DEV_ADDR_INVALID,
(uint32_t[]){ baseGroupX, baseGroupY, baseGroupZ },
(uint32_t[]){ groupCountX, groupCountY, groupCountZ });
}
@ -4006,6 +4046,7 @@ void pvr_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
pvr_cmd_dispatch(cmd_buffer,
PVR_DEV_ADDR_OFFSET(buffer->dev_addr, offset),
(uint32_t[]){ 0, 0, 0 },
(uint32_t[]){ 1, 1, 1 });
}
@ -4070,7 +4111,7 @@ pvr_emit_dirty_pds_state(const struct pvr_cmd_buffer *const cmd_buffer,
state0.usc_target = ROGUE_VDMCTRL_USC_TARGET_ALL;
state0.usc_common_size =
DIV_ROUND_UP(vs_data->common.shareds,
DIV_ROUND_UP(PVR_DW_TO_BYTES(vs_data->common.shareds),
ROGUE_VDMCTRL_PDS_STATE0_USC_COMMON_SIZE_UNIT_SIZE);
state0.pds_data_size = DIV_ROUND_UP(

View file

@ -102,14 +102,6 @@ static const struct pvr_hard_coding_data {
.shader_size = sizeof(pvr_simple_compute_shader),
.shader_info = {
.uses_atomic_ops = false,
.uses_barrier = false,
.uses_num_workgroups = false,
.const_shared_reg_count = 4,
.input_register_count = 8,
.work_size = 1 * 1 * 1,
.coefficient_register_count = 4,
},
.build_info = {

View file

@ -669,221 +669,164 @@ static void pvr_pds_descriptor_program_destroy(
static void pvr_pds_compute_program_setup(
const struct pvr_device_info *dev_info,
const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
uint32_t barrier_coefficient,
bool add_base_workgroup,
uint32_t usc_temps,
pvr_dev_addr_t usc_shader_dev_addr,
pco_data *cs_data,
struct pvr_compute_shader_state *compute_state,
struct pvr_pds_compute_shader_program *const program)
{
pco_range *sys_vals = cs_data->common.sys_vals;
pvr_pds_compute_shader_program_init(program);
program->local_input_regs[0] = local_input_regs[0];
program->local_input_regs[1] = local_input_regs[1];
program->local_input_regs[2] = local_input_regs[2];
program->work_group_input_regs[0] = work_group_input_regs[0];
program->work_group_input_regs[1] = work_group_input_regs[1];
program->work_group_input_regs[2] = work_group_input_regs[2];
program->barrier_coefficient = barrier_coefficient;
program->add_base_workgroup = add_base_workgroup;
if (sys_vals[SYSTEM_VALUE_LOCAL_INVOCATION_INDEX].count > 0) {
program->local_input_regs[0] =
sys_vals[SYSTEM_VALUE_LOCAL_INVOCATION_INDEX].start;
}
for (unsigned u = 0; u < ARRAY_SIZE(program->work_group_input_regs); ++u) {
if (sys_vals[SYSTEM_VALUE_WORKGROUP_ID].count > u) {
program->work_group_input_regs[u] =
sys_vals[SYSTEM_VALUE_WORKGROUP_ID].start + u;
}
}
for (unsigned u = 0; u < ARRAY_SIZE(program->num_work_groups_regs); ++u) {
if (sys_vals[SYSTEM_VALUE_NUM_WORKGROUPS].count > u) {
program->num_work_groups_regs[u] =
sys_vals[SYSTEM_VALUE_NUM_WORKGROUPS].start + u;
}
}
program->flattened_work_groups = true;
program->kick_usc = true;
STATIC_ASSERT(ARRAY_SIZE(program->local_input_regs) ==
PVR_WORKGROUP_DIMENSIONS);
STATIC_ASSERT(ARRAY_SIZE(program->work_group_input_regs) ==
PVR_WORKGROUP_DIMENSIONS);
STATIC_ASSERT(ARRAY_SIZE(program->global_input_regs) ==
PVR_WORKGROUP_DIMENSIONS);
pvr_pds_setup_doutu(&program->usc_task_control,
usc_shader_dev_addr.addr,
usc_temps,
compute_state->bo->dev_addr.addr,
cs_data->common.temps,
ROGUE_PDSINST_DOUTU_SAMPLE_RATE_INSTANCE,
false);
pvr_pds_compute_shader(program, NULL, PDS_GENERATE_SIZES, dev_info);
}
/* FIXME: See if pvr_device_init_compute_pds_program() and this could be merged.
/* This uploads the code segment and base data segment variant.
* This can be patched at dispatch time.
*/
static VkResult pvr_pds_compute_program_create_and_upload(
struct pvr_device *const device,
const VkAllocationCallbacks *const allocator,
const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
uint32_t barrier_coefficient,
uint32_t usc_temps,
pvr_dev_addr_t usc_shader_dev_addr,
struct pvr_pds_upload *const pds_upload_out,
struct pvr_pds_info *const pds_info_out)
struct pvr_compute_shader_state *compute_state,
struct pvr_compute_pipeline *compute_pipeline)
{
pco_range *sys_vals = compute_pipeline->cs_data.common.sys_vals;
struct pvr_device_info *dev_info = &device->pdevice->dev_info;
struct pvr_pds_compute_shader_program program;
uint32_t staging_buffer_size;
uint32_t *staging_buffer;
uint32_t *code_buffer;
uint32_t *data_buffer;
VkResult result;
bool uses_wg_id = sys_vals[SYSTEM_VALUE_WORKGROUP_ID].count > 0;
bool uses_num_wgs = sys_vals[SYSTEM_VALUE_NUM_WORKGROUPS].count > 0;
pvr_pds_compute_program_setup(dev_info,
local_input_regs,
work_group_input_regs,
barrier_coefficient,
false,
usc_temps,
usc_shader_dev_addr,
&compute_pipeline->cs_data,
compute_state,
&program);
/* FIXME: According to pvr_device_init_compute_pds_program() the code size
* is in bytes. Investigate this.
*/
staging_buffer_size = PVR_DW_TO_BYTES(program.code_size + program.data_size);
staging_buffer = vk_alloc2(&device->vk.alloc,
allocator,
staging_buffer_size,
8,
VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
if (!staging_buffer)
code_buffer = vk_alloc2(&device->vk.alloc,
allocator,
PVR_DW_TO_BYTES(program.code_size),
8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (!code_buffer)
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
/* FIXME: pvr_pds_compute_shader doesn't implement
* PDS_GENERATE_CODEDATA_SEGMENTS.
*/
data_buffer = vk_alloc2(&device->vk.alloc,
allocator,
PVR_DW_TO_BYTES(program.code_size),
8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (!data_buffer) {
vk_free2(&device->vk.alloc, allocator, code_buffer);
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
}
pvr_pds_compute_shader(&program,
&staging_buffer[0],
&code_buffer[0],
PDS_GENERATE_CODE_SEGMENT,
dev_info);
pvr_pds_compute_shader(&program,
&staging_buffer[program.code_size],
&data_buffer[0],
PDS_GENERATE_DATA_SEGMENT,
dev_info);
/* Initialize. */
if (uses_wg_id) {
unsigned offset = program.base_workgroup_constant_offset_in_dwords[0];
for (unsigned u = 0; u < PVR_WORKGROUP_DIMENSIONS; ++u) {
data_buffer[offset + u] = 0;
}
}
if (uses_num_wgs) {
unsigned offset = program.num_workgroups_constant_offset_in_dwords[0];
for (unsigned u = 0; u < PVR_WORKGROUP_DIMENSIONS; ++u) {
data_buffer[offset + u] = 0;
}
}
/* FIXME: Figure out the define for alignment of 16. */
result = pvr_gpu_upload_pds(device,
&staging_buffer[program.code_size],
data_buffer,
program.data_size,
16,
&staging_buffer[0],
code_buffer,
program.code_size,
16,
16,
pds_upload_out);
&compute_pipeline->pds_cs_program);
if (result != VK_SUCCESS) {
vk_free2(&device->vk.alloc, allocator, staging_buffer);
vk_free2(&device->vk.alloc, allocator, code_buffer);
vk_free2(&device->vk.alloc, allocator, data_buffer);
return result;
}
*pds_info_out = (struct pvr_pds_info){
compute_pipeline->pds_cs_data_section = data_buffer;
/* The base workgroup and num workgroups can be patched in the
* PDS data section before dispatch so we save their offsets.
*/
compute_pipeline->base_workgroup_data_patching_offset = ~0u;
if (uses_wg_id) {
compute_pipeline->base_workgroup_data_patching_offset =
program.base_workgroup_constant_offset_in_dwords[0];
}
compute_pipeline->num_workgroups_data_patching_offset = ~0u;
if (uses_num_wgs) {
compute_pipeline->num_workgroups_data_patching_offset =
program.num_workgroups_constant_offset_in_dwords[0];
}
compute_pipeline->pds_cs_program_info = (struct pvr_pds_info){
.temps_required = program.highest_temp,
.code_size_in_dwords = program.code_size,
.data_size_in_dwords = program.data_size,
};
vk_free2(&device->vk.alloc, allocator, staging_buffer);
return VK_SUCCESS;
};
static void pvr_pds_compute_program_destroy(
struct pvr_device *const device,
const struct VkAllocationCallbacks *const allocator,
struct pvr_pds_upload *const pds_program,
struct pvr_pds_info *const pds_info)
{
/* We don't allocate an entries buffer so we don't need to free it */
pvr_bo_suballoc_free(pds_program->pvr_bo);
}
/* This only uploads the code segment. The data segment will need to be patched
* with the base workgroup before uploading.
*/
static VkResult pvr_pds_compute_base_workgroup_variant_program_init(
struct pvr_device *const device,
const VkAllocationCallbacks *const allocator,
const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
uint32_t barrier_coefficient,
uint32_t usc_temps,
pvr_dev_addr_t usc_shader_dev_addr,
struct pvr_pds_base_workgroup_program *program_out)
{
struct pvr_device_info *dev_info = &device->pdevice->dev_info;
struct pvr_pds_compute_shader_program program;
uint32_t buffer_size;
uint32_t *buffer;
VkResult result;
pvr_pds_compute_program_setup(dev_info,
local_input_regs,
work_group_input_regs,
barrier_coefficient,
true,
usc_temps,
usc_shader_dev_addr,
&program);
/* FIXME: According to pvr_device_init_compute_pds_program() the code size
* is in bytes. Investigate this.
*/
buffer_size = PVR_DW_TO_BYTES(MAX2(program.code_size, program.data_size));
buffer = vk_alloc2(&device->vk.alloc,
allocator,
buffer_size,
8,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (!buffer)
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
pvr_pds_compute_shader(&program,
&buffer[0],
PDS_GENERATE_CODE_SEGMENT,
dev_info);
/* FIXME: Figure out the define for alignment of 16. */
result = pvr_gpu_upload_pds(device,
NULL,
0,
0,
buffer,
program.code_size,
16,
16,
&program_out->code_upload);
if (result != VK_SUCCESS) {
vk_free2(&device->vk.alloc, allocator, buffer);
return result;
}
pvr_pds_compute_shader(&program, buffer, PDS_GENERATE_DATA_SEGMENT, dev_info);
program_out->data_section = buffer;
/* We'll need to patch the base workgroup in the PDS data section before
* dispatch so we save the offsets at which to patch. We only need to save
* the offset for the first workgroup id since the workgroup ids are stored
* contiguously in the data segment.
*/
program_out->base_workgroup_data_patching_offset =
program.base_workgroup_constant_offset_in_dwords[0];
program_out->info = (struct pvr_pds_info){
.temps_required = program.highest_temp,
.code_size_in_dwords = program.code_size,
.data_size_in_dwords = program.data_size,
};
vk_free2(&device->vk.alloc, allocator, code_buffer);
return VK_SUCCESS;
}
static void pvr_pds_compute_base_workgroup_variant_program_finish(
struct pvr_device *device,
const VkAllocationCallbacks *const allocator,
struct pvr_pds_base_workgroup_program *const state)
static void
pvr_pds_compute_program_destroy(struct pvr_device *device,
const VkAllocationCallbacks *const allocator,
struct pvr_pds_upload *const pds_cs_program,
uint32_t *pds_cs_data_section)
{
pvr_bo_suballoc_free(state->code_upload.pvr_bo);
vk_free2(&device->vk.alloc, allocator, state->data_section);
pvr_bo_suballoc_free(pds_cs_program->pvr_bo);
vk_free2(&device->vk.alloc, allocator, pds_cs_data_section);
}
/******************************************************************************
@ -917,10 +860,28 @@ static void pvr_pipeline_finish(struct pvr_device *device,
#define PVR_DEV_ADDR_SIZE_IN_SH_REGS \
DIV_ROUND_UP(sizeof(pvr_dev_addr_t), sizeof(uint32_t))
static void pvr_preprocess_shader_data(pco_data *data,
nir_shader *nir,
const void *pCreateInfo,
struct vk_pipeline_layout *layout);
static void pvr_postprocess_shader_data(pco_data *data,
nir_shader *nir,
const void *pCreateInfo,
struct vk_pipeline_layout *layout);
/******************************************************************************
Compute pipeline functions
******************************************************************************/
/* Snapshots the compiled compute shader's pco data into the pipeline.
 *
 * The pco_shader (and the data it owns) is freed along with the shader
 * build context, so the pipeline keeps its own copy of the metadata.
 */
static void
pvr_compute_state_save(struct pvr_compute_pipeline *compute_pipeline,
                       pco_shader *cs)
{
   /* Whole-struct copy; equivalent to memcpy of *pco_shader_data(cs). */
   compute_pipeline->cs_data = *pco_shader_data(cs);
}
/* Compiles and uploads shaders and PDS programs. */
static VkResult pvr_compute_pipeline_compile(
struct pvr_device *const device,
@ -930,71 +891,76 @@ static VkResult pvr_compute_pipeline_compile(
struct pvr_compute_pipeline *const compute_pipeline)
{
struct vk_pipeline_layout *layout = compute_pipeline->base.layout;
uint32_t work_group_input_regs[PVR_WORKGROUP_DIMENSIONS];
uint32_t local_input_regs[PVR_WORKGROUP_DIMENSIONS];
uint32_t barrier_coefficient;
uint32_t usc_temps;
const uint32_t cache_line_size =
rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
pco_ctx *pco_ctx = device->pdevice->pco_ctx;
void *shader_mem_ctx = ralloc_context(NULL);
pco_data shader_data = { 0 };
nir_shader *nir;
pco_shader *cs;
struct pvr_compute_shader_state *compute_state =
&compute_pipeline->shader_state;
VkResult result;
compute_pipeline->shader_state.const_shared_reg_count = 0;
result =
vk_pipeline_shader_stage_to_nir(&device->vk,
compute_pipeline->base.pipeline_flags,
&pCreateInfo->stage,
pco_spirv_options(),
pco_nir_options(),
shader_mem_ctx,
&nir);
if (result != VK_SUCCESS)
goto err_free_build_context;
/* FIXME: Compile and upload the shader. */
/* FIXME: Initialize the shader state and setup build info. */
UNREACHABLE("finishme: compute support");
pco_preprocess_nir(pco_ctx, nir);
pvr_preprocess_shader_data(&shader_data, nir, pCreateInfo, layout);
pco_lower_nir(pco_ctx, nir, &shader_data);
pco_postprocess_nir(pco_ctx, nir, &shader_data);
pvr_postprocess_shader_data(&shader_data, nir, pCreateInfo, layout);
cs = pco_trans_nir(pco_ctx, nir, &shader_data, shader_mem_ctx);
if (!cs) {
result = VK_ERROR_INITIALIZATION_FAILED;
goto err_free_build_context;
}
pco_process_ir(pco_ctx, cs);
pco_encode_ir(pco_ctx, cs);
pvr_compute_state_save(compute_pipeline, cs);
result = pvr_gpu_upload_usc(device,
pco_shader_binary_data(cs),
pco_shader_binary_size(cs),
cache_line_size,
&compute_pipeline->shader_state.bo);
if (result != VK_SUCCESS)
goto err_free_build_context;
result = pvr_pds_descriptor_program_create_and_upload(
device,
allocator,
layout,
MESA_SHADER_COMPUTE,
NULL,
&compute_pipeline->cs_data,
&compute_pipeline->descriptor_state);
if (result != VK_SUCCESS)
goto err_free_shader;
result = pvr_pds_compute_program_create_and_upload(
device,
allocator,
local_input_regs,
work_group_input_regs,
barrier_coefficient,
usc_temps,
compute_pipeline->shader_state.bo->dev_addr,
&compute_pipeline->primary_program,
&compute_pipeline->primary_program_info);
result = pvr_pds_compute_program_create_and_upload(device,
allocator,
compute_state,
compute_pipeline);
if (result != VK_SUCCESS)
goto err_free_descriptor_program;
/* If the workgroup ID is required, then we require the base workgroup
* variant of the PDS compute program as well.
*/
compute_pipeline->flags.base_workgroup =
work_group_input_regs[0] != PVR_PDS_REG_UNUSED ||
work_group_input_regs[1] != PVR_PDS_REG_UNUSED ||
work_group_input_regs[2] != PVR_PDS_REG_UNUSED;
if (compute_pipeline->flags.base_workgroup) {
result = pvr_pds_compute_base_workgroup_variant_program_init(
device,
allocator,
local_input_regs,
work_group_input_regs,
barrier_coefficient,
usc_temps,
compute_pipeline->shader_state.bo->dev_addr,
&compute_pipeline->primary_base_workgroup_variant_program);
if (result != VK_SUCCESS)
goto err_destroy_compute_program;
}
ralloc_free(shader_mem_ctx);
return VK_SUCCESS;
err_destroy_compute_program:
pvr_pds_compute_program_destroy(device,
allocator,
&compute_pipeline->primary_program,
&compute_pipeline->primary_program_info);
err_free_descriptor_program:
pvr_pds_descriptor_program_destroy(device,
allocator,
@ -1003,6 +969,8 @@ err_free_descriptor_program:
err_free_shader:
pvr_bo_suballoc_free(compute_pipeline->shader_state.bo);
err_free_build_context:
ralloc_free(shader_mem_ctx);
return result;
}
@ -1067,27 +1035,24 @@ pvr_compute_pipeline_create(struct pvr_device *device,
return VK_SUCCESS;
}
static void pvr_pipeline_destroy_shader_data(pco_data *data);
static void pvr_compute_pipeline_destroy(
struct pvr_device *const device,
const VkAllocationCallbacks *const allocator,
struct pvr_compute_pipeline *const compute_pipeline)
{
if (compute_pipeline->flags.base_workgroup) {
pvr_pds_compute_base_workgroup_variant_program_finish(
device,
allocator,
&compute_pipeline->primary_base_workgroup_variant_program);
}
pvr_pds_compute_program_destroy(device,
allocator,
&compute_pipeline->primary_program,
&compute_pipeline->primary_program_info);
&compute_pipeline->pds_cs_program,
compute_pipeline->pds_cs_data_section);
pvr_pds_descriptor_program_destroy(device,
allocator,
&compute_pipeline->descriptor_state);
pvr_bo_suballoc_free(compute_pipeline->shader_state.bo);
pvr_pipeline_destroy_shader_data(&compute_pipeline->cs_data);
pvr_pipeline_finish(device, &compute_pipeline->base);
vk_free2(&device->vk.alloc, allocator, compute_pipeline);
@ -1915,6 +1880,58 @@ static void pvr_setup_fs_input_attachments(
pvr_finishme("pvr_setup_fs_input_attachments");
}
/* Allocates register ranges for the system values read by a compute shader.
 *
 * gl_LocalInvocationIndex is placed in the vertex-input (vtxin) register
 * space, while gl_WorkGroupID and gl_NumWorkGroups are placed in
 * coefficient registers. Multi-dword coefficient values are padded so
 * they start on an even dword offset.
 *
 * Every system value the shader reads must be consumed here; anything
 * left over trips the final assert.
 */
static void pvr_alloc_cs_sysvals(pco_data *data, nir_shader *nir)
{
   BITSET_DECLARE(remaining_sys_vals, SYSTEM_VALUE_MAX);
   BITSET_COPY(remaining_sys_vals, nir->info.system_values_read);

   /* System values allocated from the vtxin register space. */
   gl_system_value vtxin_sys_vals[] = {
      SYSTEM_VALUE_LOCAL_INVOCATION_INDEX,
   };

   /* System values allocated from the coefficient register space. */
   gl_system_value coeff_sys_vals[] = {
      SYSTEM_VALUE_WORKGROUP_ID,
      SYSTEM_VALUE_NUM_WORKGROUPS,
   };

   for (unsigned i = 0; i < ARRAY_SIZE(vtxin_sys_vals); ++i) {
      const gl_system_value sys_val = vtxin_sys_vals[i];

      if (!BITSET_TEST(remaining_sys_vals, sys_val))
         continue;

      const nir_intrinsic_op op = nir_intrinsic_from_system_value(sys_val);
      const unsigned dwords = nir_intrinsic_infos[op].dest_components;
      assert(dwords > 0);

      allocate_val(data->common.sys_vals,
                   &data->common.vtxins,
                   sys_val,
                   dwords);

      BITSET_CLEAR(remaining_sys_vals, sys_val);
   }

   for (unsigned i = 0; i < ARRAY_SIZE(coeff_sys_vals); ++i) {
      const gl_system_value sys_val = coeff_sys_vals[i];

      if (!BITSET_TEST(remaining_sys_vals, sys_val))
         continue;

      const nir_intrinsic_op op = nir_intrinsic_from_system_value(sys_val);
      const unsigned dwords = nir_intrinsic_infos[op].dest_components;
      assert(dwords > 0);

      /* Pad so multi-dword values start on an even coefficient offset. */
      if (dwords > 1 && (data->common.coeffs & 1))
         ++data->common.coeffs;

      allocate_val(data->common.sys_vals,
                   &data->common.coeffs,
                   sys_val,
                   dwords);

      BITSET_CLEAR(remaining_sys_vals, sys_val);
   }

   /* All system values read by the shader must have been handled above. */
   assert(BITSET_IS_EMPTY(remaining_sys_vals));
}
static void pvr_init_descriptors(pco_data *data,
nir_shader *nir,
struct vk_pipeline_layout *layout)
@ -1985,27 +2002,28 @@ static void pvr_setup_descriptors(pco_data *data,
assert(data->common.shareds < 256);
}
static void
pvr_preprocess_shader_data(pco_data *data,
nir_shader *nir,
const VkGraphicsPipelineCreateInfo *pCreateInfo,
struct vk_pipeline_layout *layout)
static void pvr_preprocess_shader_data(pco_data *data,
nir_shader *nir,
const void *pCreateInfo,
struct vk_pipeline_layout *layout)
{
const VkGraphicsPipelineCreateInfo *pGraphicsCreateInfo = pCreateInfo;
switch (nir->info.stage) {
case MESA_SHADER_VERTEX: {
const VkPipelineVertexInputStateCreateInfo *const vertex_input_state =
pCreateInfo->pVertexInputState;
pGraphicsCreateInfo->pVertexInputState;
pvr_init_vs_attribs(data, vertex_input_state);
break;
}
case MESA_SHADER_FRAGMENT: {
PVR_FROM_HANDLE(pvr_render_pass, pass, pCreateInfo->renderPass);
PVR_FROM_HANDLE(pvr_render_pass, pass, pGraphicsCreateInfo->renderPass);
const struct pvr_render_subpass *const subpass =
&pass->subpasses[pCreateInfo->subpass];
&pass->subpasses[pGraphicsCreateInfo->subpass];
const struct pvr_renderpass_hw_map *subpass_map =
&pass->hw_setup->subpass_map[pCreateInfo->subpass];
&pass->hw_setup->subpass_map[pGraphicsCreateInfo->subpass];
const struct pvr_renderpass_hwsetup_subpass *hw_subpass =
&pass->hw_setup->renders[subpass_map->render]
.subpasses[subpass_map->subpass];
@ -2017,6 +2035,10 @@ pvr_preprocess_shader_data(pco_data *data,
break;
}
case MESA_SHADER_COMPUTE: {
break;
}
default:
UNREACHABLE("");
}
@ -2026,12 +2048,13 @@ pvr_preprocess_shader_data(pco_data *data,
/* TODO: common things, like large constants being put into shareds. */
}
static void
pvr_postprocess_shader_data(pco_data *data,
nir_shader *nir,
const VkGraphicsPipelineCreateInfo *pCreateInfo,
struct vk_pipeline_layout *layout)
static void pvr_postprocess_shader_data(pco_data *data,
nir_shader *nir,
const void *pCreateInfo,
struct vk_pipeline_layout *layout)
{
const VkGraphicsPipelineCreateInfo *pGraphicsCreateInfo = pCreateInfo;
switch (nir->info.stage) {
case MESA_SHADER_VERTEX: {
pvr_alloc_vs_sysvals(data, nir);
@ -2041,11 +2064,11 @@ pvr_postprocess_shader_data(pco_data *data,
}
case MESA_SHADER_FRAGMENT: {
PVR_FROM_HANDLE(pvr_render_pass, pass, pCreateInfo->renderPass);
PVR_FROM_HANDLE(pvr_render_pass, pass, pGraphicsCreateInfo->renderPass);
const struct pvr_render_subpass *const subpass =
&pass->subpasses[pCreateInfo->subpass];
&pass->subpasses[pGraphicsCreateInfo->subpass];
const struct pvr_renderpass_hw_map *subpass_map =
&pass->hw_setup->subpass_map[pCreateInfo->subpass];
&pass->hw_setup->subpass_map[pGraphicsCreateInfo->subpass];
const struct pvr_renderpass_hwsetup_subpass *hw_subpass =
&pass->hw_setup->renders[subpass_map->render]
.subpasses[subpass_map->subpass];
@ -2059,6 +2082,11 @@ pvr_postprocess_shader_data(pco_data *data,
break;
}
case MESA_SHADER_COMPUTE: {
pvr_alloc_cs_sysvals(data, nir);
break;
}
default:
UNREACHABLE("");
}

View file

@ -862,16 +862,6 @@ struct pvr_pipeline_stage_state {
struct pvr_compute_shader_state {
/* Pointer to a buffer object that contains the shader binary. */
struct pvr_suballoc_bo *bo;
bool uses_atomic_ops;
bool uses_barrier;
/* E.g. GLSL shader uses gl_NumWorkGroups. */
bool uses_num_workgroups;
uint32_t const_shared_reg_count;
uint32_t input_register_count;
uint32_t work_size;
uint32_t coefficient_register_count;
};
struct pvr_vertex_shader_state {
@ -910,28 +900,17 @@ struct pvr_pipeline {
struct pvr_compute_pipeline {
struct pvr_pipeline base;
pco_data cs_data;
struct pvr_compute_shader_state shader_state;
struct {
uint32_t base_workgroup : 1;
} flags;
struct pvr_stage_allocation_descriptor_state descriptor_state;
struct pvr_pds_upload primary_program;
struct pvr_pds_info primary_program_info;
struct pvr_pds_upload pds_cs_program;
struct pvr_pds_info pds_cs_program_info;
struct pvr_pds_base_workgroup_program {
struct pvr_pds_upload code_upload;
uint32_t *data_section;
/* Offset within the PDS data section at which the base workgroup id
* resides.
*/
uint32_t base_workgroup_data_patching_offset;
struct pvr_pds_info info;
} primary_base_workgroup_variant_program;
uint32_t *pds_cs_data_section;
uint32_t base_workgroup_data_patching_offset;
uint32_t num_workgroups_data_patching_offset;
};
struct pvr_graphics_pipeline {