mesa/src/panfrost/genxml/cs_builder.h
/*
* Copyright (C) 2022 Collabora Ltd.
* Copyright (C) 2025 Arm Ltd.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#pragma once
#if !defined(PAN_ARCH) || PAN_ARCH < 10
#error "cs_builder.h requires PAN_ARCH >= 10"
#endif
#include "gen_macros.h"
#include "util/bitset.h"
#include "util/u_dynarray.h"
#ifdef __cplusplus
extern "C" {
#endif
/* Before Avalon, RUN_IDVS could use a selector, but since we only ever
* hardcode the same configuration, we match the v12+ naming here. */
#if PAN_ARCH <= 11
#define MALI_IDVS_SR_VERTEX_SRT MALI_IDVS_SR_SRT_0
#define MALI_IDVS_SR_FRAGMENT_SRT MALI_IDVS_SR_SRT_2
#define MALI_IDVS_SR_VERTEX_FAU MALI_IDVS_SR_FAU_0
#define MALI_IDVS_SR_FRAGMENT_FAU MALI_IDVS_SR_FAU_2
#define MALI_IDVS_SR_VERTEX_POS_SPD MALI_IDVS_SR_SPD_0
#define MALI_IDVS_SR_VERTEX_VARY_SPD MALI_IDVS_SR_SPD_1
#define MALI_IDVS_SR_FRAGMENT_SPD MALI_IDVS_SR_SPD_2
#endif
#if PAN_ARCH == 10
#define CS_MAX_SB_COUNT 8
#else
#define CS_MAX_SB_COUNT 16
#endif
/*
* cs_builder implements a builder for CSF command streams. It manages the
* allocation and overflow behaviour of queues and provides helpers for emitting
* commands to run on the CSF pipe.
*
* Users are responsible for the CS buffer allocation and must initialize the
* command stream with an initial buffer using cs_builder_init(). The CS can
* be extended with new buffers allocated with cs_builder_conf::alloc_buffer()
* if the builder runs out of memory.
*/
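/*
* A minimal usage sketch. The allocator callback, cookie and register counts
* below are made-up placeholders; real values depend on the driver
* integration, and root_buffer is assumed to be allocated by the caller.
*
*    struct cs_builder_conf conf = {
*       .nr_registers = 96,
*       .nr_kernel_registers = 4,
*       .alloc_buffer = my_alloc_cs_buffer,
*       .cookie = my_ctx,
*       .ls_sb_slot = 0,
*    };
*    struct cs_builder b;
*
*    cs_builder_init(&b, &conf, root_buffer);
*    cs_move32_to(&b, cs_reg32(&b, 0), 42);
*    cs_end(&b);             // wrap the CS so it can be submitted
*    // submit cs_root_chunk_gpu_addr(&b) / cs_root_chunk_size(&b)
*    cs_builder_fini(&b);    // free the temporary builder allocations
*/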
struct cs_buffer {
/* CPU pointer */
uint64_t *cpu;
/* GPU pointer */
uint64_t gpu;
/* Capacity in number of 64-bit instructions */
uint32_t capacity;
};
/**
* This is used to check that:
* 1. registers are not used as a source after being loaded without a
* WAIT(<ls_scoreboard>) in the middle
* 2. registers are not reused (used as a destination) after they served as a
* STORE() source without a WAIT(<ls_scoreboard>) in the middle
*/
struct cs_load_store_tracker {
BITSET_DECLARE(pending_loads, 256);
bool pending_stores;
};
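/*
* Example of the first hazard being tracked (a sketch; r10 and the dst/addr
* registers are arbitrary placeholders):
*
*    cs_load32_to(b, cs_reg32(b, 10), addr, 0); // LOAD_MULTIPLE is async
*    cs_add32(b, dst, cs_reg32(b, 10), 1);      // r10 is read as a source, so
*                                               // the builder emits a
*                                               // WAIT(ls_sb_slot) first
*/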
/**
* This is used to determine which registers have been written to (i.e. used
* as an instruction's destination).
*/
struct cs_dirty_tracker {
BITSET_DECLARE(regs, 256);
};
enum cs_reg_perm {
CS_REG_NO_ACCESS = 0,
CS_REG_RD = BITFIELD_BIT(1),
CS_REG_WR = BITFIELD_BIT(2),
CS_REG_RW = CS_REG_RD | CS_REG_WR,
};
struct cs_builder;
typedef enum cs_reg_perm (*reg_perm_cb_t)(struct cs_builder *b, unsigned reg);
struct cs_builder_conf {
/* Number of 32-bit registers in the hardware register file */
uint8_t nr_registers;
/* Number of 32-bit registers used by the kernel at submission time */
uint8_t nr_kernel_registers;
/* CS buffer allocator */
struct cs_buffer (*alloc_buffer)(void *cookie);
/* Optional dirty registers tracker. */
struct cs_dirty_tracker *dirty_tracker;
/* Optional register access checker. */
reg_perm_cb_t reg_perm;
/* Cookie passed back to alloc_buffer() */
void *cookie;
/* SB slot used for load/store instructions. */
uint8_t ls_sb_slot;
};
/* The CS is formed of one or more CS chunks linked with JUMP instructions.
* The builder keeps track of the current chunk and the position inside this
* chunk, so it can emit new instructions, and decide when a new chunk needs
* to be allocated.
*/
struct cs_chunk {
/* CS buffer object backing this chunk */
struct cs_buffer buffer;
union {
/* Current position in the buffer object when the chunk is active. */
uint32_t pos;
/* Chunk size when the chunk was wrapped. */
uint32_t size;
};
};
/* Monolithic sequence of instructions. Must live in a virtually contiguous
* portion of code.
*/
struct cs_block {
/* Used to insert the block in the block stack. */
struct cs_block *next;
};
#define CS_LABEL_INVALID_POS ~0u
/* Labels can only be used inside a cs_block. They can be defined and
* referenced before they are set to point to a specific position
* in the block. */
struct cs_label {
/* The last reference we have seen pointing to this label before
* it was set. If set to CS_LABEL_INVALID_POS, no forward reference
* pointing to this label exists.
*/
uint32_t last_forward_ref;
/* The label target. If set to CS_LABEL_INVALID_POS, the label has
* not been set yet.
*/
uint32_t target;
};
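/*
* Sketch of forward-label usage (cond is a placeholder 32-bit register;
* labels only work inside a block):
*
*    struct cs_block blk;
*    struct cs_label skip;
*
*    cs_block_start(b, &blk);
*    cs_label_init(&skip);
*    cs_branch_label(b, &skip, MALI_CS_CONDITION_EQUAL, cond);
*    // ... instructions skipped when the branch is taken ...
*    cs_set_label(b, &skip);   // resolves the forward reference above
*    cs_block_end(b, &blk);
*/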
/* CS if/else block. */
struct cs_if_else {
struct cs_block block;
struct cs_label end_label;
struct cs_load_store_tracker *orig_ls_state;
struct cs_load_store_tracker ls_state;
};
struct cs_maybe {
/* Link to the next pending cs_maybe for the block stack */
struct cs_maybe *next_pending;
/* Position of patch block relative to blocks.instrs */
uint32_t patch_pos;
/* CPU address of patch block in the chunk */
uint64_t *patch_addr;
/* Original contents of the patch block, before replacing with NOPs */
uint32_t num_instrs;
uint64_t instrs[];
};
struct cs_builder {
/* CS builder configuration */
struct cs_builder_conf conf;
/* True if an allocation failed, making the whole CS invalid. */
bool invalid;
/* Initial (root) CS chunk. */
struct cs_chunk root_chunk;
/* Current CS chunk. */
struct cs_chunk cur_chunk;
/* Current load/store tracker. */
struct cs_load_store_tracker *cur_ls_tracker;
struct cs_load_store_tracker root_ls_tracker;
/* ralloc context used for cs_maybe allocations */
void *maybe_ctx;
/* Temporary storage for inner blocks that need to be built
* and copied in one monolithic sequence of instructions with no
* jump in the middle.
*/
struct {
struct cs_block *stack;
struct util_dynarray instrs;
struct cs_if_else pending_if;
/* Linked list of cs_maybe that were emitted inside the current stack */
struct cs_maybe *pending_maybes;
unsigned last_load_ip_target;
} blocks;
/* Move-immediate instruction at the end of the previous CS chunk that needs
* to be patched with the final length of the current CS chunk, so that the
* chunk-linking JUMP uses the correct length.
*/
uint32_t *length_patch;
/* Used as temporary storage when the allocator couldn't allocate a new
* CS chunk.
*/
uint64_t discard_instr_slot;
};
static inline void
cs_builder_init(struct cs_builder *b, const struct cs_builder_conf *conf,
struct cs_buffer root_buffer)
{
memset(b, 0, sizeof(*b));
util_dynarray_init(&b->blocks.instrs, NULL);
b->conf = *conf;
b->root_chunk.buffer = root_buffer;
b->cur_chunk.buffer = root_buffer;
b->cur_ls_tracker = &b->root_ls_tracker;
memset(b->cur_ls_tracker, 0, sizeof(*b->cur_ls_tracker));
/* We need at least 3 registers for CS chunk linking. Assume the kernel needs
* at least that too.
*/
b->conf.nr_kernel_registers = MAX2(b->conf.nr_kernel_registers, 3);
}
static inline void
cs_builder_fini(struct cs_builder *b)
{
util_dynarray_fini(&b->blocks.instrs);
ralloc_free(b->maybe_ctx);
}
static inline bool
cs_is_valid(struct cs_builder *b)
{
return !b->invalid;
}
static inline bool
cs_is_empty(struct cs_builder *b)
{
return b->cur_chunk.pos == 0 &&
b->root_chunk.buffer.gpu == b->cur_chunk.buffer.gpu;
}
static inline uint64_t
cs_root_chunk_gpu_addr(struct cs_builder *b)
{
return b->root_chunk.buffer.gpu;
}
static inline uint32_t
cs_root_chunk_size(struct cs_builder *b)
{
/* Make sure cs_end() was called. */
struct cs_chunk empty_chunk;
memset(&empty_chunk, 0, sizeof(empty_chunk));
assert(!memcmp(&b->cur_chunk, &empty_chunk, sizeof(b->cur_chunk)));
return b->root_chunk.size * sizeof(uint64_t);
}
/*
* Wrap the current chunk. External users shouldn't call this function
* directly; they should call cs_end() when they are done building
* the command stream, which will in turn call cs_wrap_chunk().
*
* Internally, this is also used to finalize internal CS chunks when
* allocating new sub-chunks. See cs_reserve_instrs() for details.
*
* This notably requires patching the previous chunk with the length
* we ended up emitting for this chunk.
*/
static inline void
cs_wrap_chunk(struct cs_builder *b)
{
if (!cs_is_valid(b))
return;
if (b->length_patch) {
*b->length_patch = (b->cur_chunk.pos * 8);
b->length_patch = NULL;
}
if (b->root_chunk.buffer.gpu == b->cur_chunk.buffer.gpu)
b->root_chunk.size = b->cur_chunk.size;
}
enum cs_index_type {
CS_INDEX_REGISTER = 0,
CS_INDEX_UNDEF,
};
struct cs_index {
enum cs_index_type type;
/* Number of 32-bit words in the index, must be nonzero */
uint8_t size;
union {
uint64_t imm;
uint8_t reg;
};
};
static inline struct cs_index
cs_undef(void)
{
return (struct cs_index){
.type = CS_INDEX_UNDEF,
};
}
static inline uint8_t
cs_to_reg_tuple(struct cs_index idx, ASSERTED unsigned expected_size)
{
assert(idx.type == CS_INDEX_REGISTER);
assert(idx.size == expected_size);
return idx.reg;
}
static inline void cs_flush_load_to(struct cs_builder *b, struct cs_index to,
uint16_t mask);
static inline unsigned
cs_src_tuple(struct cs_builder *b, struct cs_index src, ASSERTED unsigned count,
uint16_t mask)
{
unsigned reg = cs_to_reg_tuple(src, count);
if (unlikely(b->conf.reg_perm)) {
for (unsigned i = reg; i < reg + count; i++) {
if (mask & BITFIELD_BIT(i - reg)) {
assert((b->conf.reg_perm(b, i) & CS_REG_RD) ||
!"Trying to read a restricted register");
}
}
}
cs_flush_load_to(b, src, mask);
return reg;
}
static inline unsigned
cs_src32(struct cs_builder *b, struct cs_index src)
{
return cs_src_tuple(b, src, 1, BITFIELD_MASK(1));
}
static inline unsigned
cs_src64(struct cs_builder *b, struct cs_index src)
{
return cs_src_tuple(b, src, 2, BITFIELD_MASK(2));
}
static inline unsigned
cs_dst_tuple(struct cs_builder *b, struct cs_index dst, ASSERTED unsigned count,
uint16_t mask)
{
unsigned reg = cs_to_reg_tuple(dst, count);
/* A load followed by another op with the same dst register can overwrite
* the result of that following op if there is no wait. For example:
* load(dst, addr)
* move(dst, v)
*/
cs_flush_load_to(b, dst, mask);
if (unlikely(b->conf.reg_perm)) {
for (unsigned i = reg; i < reg + count; i++) {
if (mask & BITFIELD_BIT(i - reg)) {
assert((b->conf.reg_perm(b, i) & CS_REG_WR) ||
!"Trying to write a restricted register");
}
}
}
if (unlikely(b->conf.dirty_tracker)) {
for (unsigned i = reg; i < reg + count; i++) {
if (mask & BITFIELD_BIT(i - reg))
BITSET_SET(b->conf.dirty_tracker->regs, i);
}
}
return reg;
}
static inline unsigned
cs_dst32(struct cs_builder *b, struct cs_index dst)
{
return cs_dst_tuple(b, dst, 1, BITFIELD_MASK(1));
}
static inline unsigned
cs_dst64(struct cs_builder *b, struct cs_index dst)
{
return cs_dst_tuple(b, dst, 2, BITFIELD_MASK(2));
}
#define CS_MAX_REG_TUPLE_SIZE 16
static inline struct cs_index
cs_reg_tuple(ASSERTED struct cs_builder *b, uint8_t reg, uint8_t size)
{
assert(reg + size <= b->conf.nr_registers - b->conf.nr_kernel_registers &&
"overflowed register file");
assert(size <= CS_MAX_REG_TUPLE_SIZE && "unsupported");
return (struct cs_index){
.type = CS_INDEX_REGISTER,
.size = size,
.reg = reg,
};
}
static inline struct cs_index
cs_reg32(struct cs_builder *b, unsigned reg)
{
return cs_reg_tuple(b, reg, 1);
}
static inline struct cs_index
cs_reg64(struct cs_builder *b, unsigned reg)
{
assert((reg % 2) == 0 && "unaligned 64-bit reg");
return cs_reg_tuple(b, reg, 2);
}
#define cs_sr_reg_tuple(__b, __cmd, __name, __size) \
cs_reg_tuple((__b), MALI_##__cmd##_SR_##__name, (__size))
#define cs_sr_reg32(__b, __cmd, __name) \
cs_reg32((__b), MALI_##__cmd##_SR_##__name)
#define cs_sr_reg64(__b, __cmd, __name) \
cs_reg64((__b), MALI_##__cmd##_SR_##__name)
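/*
* Example: building register indices for the move/ALU helpers below
* (register numbers are arbitrary; 64-bit tuples must start on an even
* register):
*
*    struct cs_index counter = cs_reg32(b, 4);
*    struct cs_index address = cs_reg64(b, 6);
*
*    cs_move32_to(b, counter, 0);
*    cs_move64_to(b, address, 0x8000000ull);
*/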
/*
* The top of the register file is reserved for cs_builder internal use. We
* need 3 spare registers for handling command queue overflow. These are
* available here.
*/
static inline uint8_t
cs_overflow_address_reg(struct cs_builder *b)
{
return b->conf.nr_registers - 2;
}
static inline uint8_t
cs_overflow_length_reg(struct cs_builder *b)
{
return b->conf.nr_registers - 3;
}
static inline struct cs_index
cs_extract32(struct cs_builder *b, struct cs_index idx, unsigned word)
{
assert(idx.type == CS_INDEX_REGISTER && "unsupported");
assert(word < idx.size && "overrun");
return cs_reg32(b, idx.reg + word);
}
static inline struct cs_index
cs_extract64(struct cs_builder *b, struct cs_index idx, unsigned word)
{
assert(idx.type == CS_INDEX_REGISTER && "unsupported");
assert(word + 1 < idx.size && "overrun");
return cs_reg64(b, idx.reg + word);
}
static inline struct cs_index
cs_extract_tuple(struct cs_builder *b, struct cs_index idx, unsigned word,
unsigned size)
{
assert(idx.type == CS_INDEX_REGISTER && "unsupported");
assert(word + size < idx.size && "overrun");
return cs_reg_tuple(b, idx.reg + word, size);
}
static inline struct cs_block *
cs_cur_block(struct cs_builder *b)
{
return b->blocks.stack;
}
#define JUMP_SEQ_INSTR_COUNT 4
static inline bool
cs_reserve_instrs(struct cs_builder *b, uint32_t num_instrs)
{
/* Don't call this function with num_instrs=0. */
assert(num_instrs > 0);
assert(cs_cur_block(b) == NULL);
/* If an allocation failure happened before, we just discard all following
* instructions.
*/
if (unlikely(!cs_is_valid(b)))
return false;
/* Make sure we have sufficient capacity if we won't allocate more. */
if (b->conf.alloc_buffer == NULL) {
if (unlikely(b->cur_chunk.size + num_instrs > b->cur_chunk.buffer.capacity)) {
assert(!"Out of CS space");
b->invalid = true;
return false;
}
return true;
}
/* Lazy root chunk allocation. */
if (unlikely(!b->root_chunk.buffer.cpu)) {
b->root_chunk.buffer = b->conf.alloc_buffer(b->conf.cookie);
b->cur_chunk.buffer = b->root_chunk.buffer;
if (!b->cur_chunk.buffer.cpu) {
b->invalid = true;
return false;
}
}
/* Make sure the instruction sequence fits in a single chunk. */
assert(b->cur_chunk.buffer.capacity >= num_instrs);
/* If the current chunk runs out of space, allocate a new one and jump to it.
* We actually do this a few instructions before running out, because the
* sequence to jump to a new queue takes multiple instructions.
*/
bool jump_to_next_chunk =
(b->cur_chunk.size + num_instrs + JUMP_SEQ_INSTR_COUNT) >
b->cur_chunk.buffer.capacity;
if (unlikely(jump_to_next_chunk)) {
/* Now, allocate a new chunk */
struct cs_buffer newbuf = b->conf.alloc_buffer(b->conf.cookie);
/* Allocation failure; from now on, all new instructions will be
* discarded.
*/
if (unlikely(!newbuf.cpu)) {
b->invalid = true;
return false;
}
uint64_t *ptr = b->cur_chunk.buffer.cpu + (b->cur_chunk.pos++);
pan_cast_and_pack(ptr, CS_MOVE48, I) {
I.destination = cs_overflow_address_reg(b);
I.immediate = newbuf.gpu;
}
ptr = b->cur_chunk.buffer.cpu + (b->cur_chunk.pos++);
pan_cast_and_pack(ptr, CS_MOVE32, I) {
I.destination = cs_overflow_length_reg(b);
}
/* The length will be patched in later */
uint32_t *length_patch = (uint32_t *)ptr;
ptr = b->cur_chunk.buffer.cpu + (b->cur_chunk.pos++);
pan_cast_and_pack(ptr, CS_JUMP, I) {
I.length = cs_overflow_length_reg(b);
I.address = cs_overflow_address_reg(b);
}
/* Now that we've emitted everything, finish up the previous queue */
cs_wrap_chunk(b);
/* And make this one current */
b->length_patch = length_patch;
b->cur_chunk.buffer = newbuf;
b->cur_chunk.pos = 0;
}
return true;
}
static inline void *
cs_alloc_ins_block(struct cs_builder *b, uint32_t num_instrs)
{
if (cs_cur_block(b))
return util_dynarray_grow(&b->blocks.instrs, uint64_t, num_instrs);
if (!cs_reserve_instrs(b, num_instrs))
return NULL;
assert(b->cur_chunk.size + num_instrs - 1 < b->cur_chunk.buffer.capacity);
uint32_t pos = b->cur_chunk.pos;
b->cur_chunk.pos += num_instrs;
return b->cur_chunk.buffer.cpu + pos;
}
static inline void
cs_flush_block_instrs(struct cs_builder *b)
{
if (cs_cur_block(b) != NULL)
return;
uint32_t num_instrs =
util_dynarray_num_elements(&b->blocks.instrs, uint64_t);
if (!num_instrs)
return;
/* If LOAD_IP is the last instruction in the block, we reserve one more
* slot to make sure the next instruction won't point to a CS chunk linking
* sequence. */
if (unlikely(b->blocks.last_load_ip_target >= num_instrs)) {
if (!cs_reserve_instrs(b, num_instrs + 1))
return;
}
void *buffer = cs_alloc_ins_block(b, num_instrs);
if (likely(buffer != NULL)) {
/* We wait until block instrs are copied to the chunk buffer to calculate
* patch_addr, in case we end up allocating a new chunk */
while (b->blocks.pending_maybes) {
b->blocks.pending_maybes->patch_addr =
(uint64_t *) buffer + b->blocks.pending_maybes->patch_pos;
b->blocks.pending_maybes = b->blocks.pending_maybes->next_pending;
}
/* If we have a LOAD_IP chain, we need to patch each LOAD_IP
* instruction before we copy the block to the final memory
* region. */
while (unlikely(b->blocks.last_load_ip_target)) {
uint64_t *instr = util_dynarray_element(
&b->blocks.instrs, uint64_t, b->blocks.last_load_ip_target - 1);
unsigned prev_load_ip_target = *instr & BITFIELD_MASK(32);
uint64_t ip =
b->cur_chunk.buffer.gpu +
((b->cur_chunk.pos - num_instrs + b->blocks.last_load_ip_target) *
sizeof(uint64_t));
/* Drop the prev_load_ip_target value and replace it by the final
* IP. */
*instr &= ~BITFIELD64_MASK(32);
*instr |= ip;
b->blocks.last_load_ip_target = prev_load_ip_target;
}
memcpy(buffer, b->blocks.instrs.data, b->blocks.instrs.size);
}
util_dynarray_clear(&b->blocks.instrs);
}
static inline uint32_t
cs_block_next_pos(struct cs_builder *b)
{
assert(cs_cur_block(b) != NULL);
return util_dynarray_num_elements(&b->blocks.instrs, uint64_t);
}
static inline void
cs_label_init(struct cs_label *label)
{
label->last_forward_ref = CS_LABEL_INVALID_POS;
label->target = CS_LABEL_INVALID_POS;
}
static inline void
cs_set_label(struct cs_builder *b, struct cs_label *label)
{
assert(label->target == CS_LABEL_INVALID_POS);
label->target = cs_block_next_pos(b);
for (uint32_t next_forward_ref, forward_ref = label->last_forward_ref;
forward_ref != CS_LABEL_INVALID_POS; forward_ref = next_forward_ref) {
uint64_t *ins =
util_dynarray_element(&b->blocks.instrs, uint64_t, forward_ref);
assert(forward_ref < label->target);
assert(label->target - forward_ref <= INT16_MAX);
/* Save the next forward reference to this target before overwriting
* it with the final offset.
*/
int16_t offset = *ins & BITFIELD64_MASK(16);
next_forward_ref =
offset > 0 ? forward_ref - offset : CS_LABEL_INVALID_POS;
assert(next_forward_ref == CS_LABEL_INVALID_POS ||
next_forward_ref < forward_ref);
*ins &= ~BITFIELD64_MASK(16);
*ins |= label->target - forward_ref - 1;
}
}
static inline void
cs_flush_pending_if(struct cs_builder *b)
{
if (likely(cs_cur_block(b) != &b->blocks.pending_if.block))
return;
cs_set_label(b, &b->blocks.pending_if.end_label);
b->blocks.stack = b->blocks.pending_if.block.next;
cs_flush_block_instrs(b);
}
static inline void *
cs_alloc_ins(struct cs_builder *b)
{
/* If an instruction is emitted after a cs_if_end(), it flushes the pending if,
* making any subsequent cs_else_start() call invalid. */
cs_flush_pending_if(b);
return cs_alloc_ins_block(b, 1) ?: &b->discard_instr_slot;
}
/* Call this when you are done building a command stream and want to prepare
* it for submission.
*/
static inline void
cs_end(struct cs_builder *b)
{
if (!cs_is_valid(b))
return;
cs_flush_pending_if(b);
cs_wrap_chunk(b);
/* This prevents adding instructions after that point. */
memset(&b->cur_chunk, 0, sizeof(b->cur_chunk));
}
/*
* Helper to emit a new instruction into the command queue. The allocation needs
* to be separated out because pan_pack can evaluate its argument multiple
* times, yet cs_alloc_ins() has side effects.
*/
#define cs_emit(b, T, cfg) pan_cast_and_pack(cs_alloc_ins(b), CS_##T, cfg)
/* Asynchronous operations take a mask of scoreboard slots to wait on
* before executing the instruction, and signal a scoreboard slot when
* the operation is complete.
* On v11 and later, asynchronous operations can also wait on a scoreboard
* mask and signal a scoreboard slot indirectly instead (set via SET_STATE).
* A wait_mask of zero means the operation is synchronous, and signal_slot
* is ignored in that case.
*/
struct cs_async_op {
uint16_t wait_mask;
uint8_t signal_slot;
#if PAN_ARCH >= 11
bool indirect;
#endif
};
static inline struct cs_async_op
cs_defer(uint16_t wait_mask, uint8_t signal_slot)
{
/* The scoreboard slot to signal is incremented before the wait operation, so
* waiting on it would cause an infinite wait.
*/
assert(!(wait_mask & BITFIELD_BIT(signal_slot)));
return (struct cs_async_op){
.wait_mask = wait_mask,
.signal_slot = signal_slot,
};
}
static inline struct cs_async_op
cs_now(void)
{
return (struct cs_async_op){
.wait_mask = 0,
.signal_slot = 0xff,
};
}
#if PAN_ARCH >= 11
static inline struct cs_async_op
cs_defer_indirect(void)
{
return (struct cs_async_op){
.wait_mask = 0xff,
.signal_slot = 0xff,
.indirect = true,
};
}
#endif
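/*
* Usage sketch: helpers taking a struct cs_async_op can run synchronously
* with cs_now(), or be deferred/signalled with cs_defer(). The scoreboard
* slot numbers below are arbitrary placeholders:
*
*    cs_vt_start(b, cs_now());                      // synchronous
*    cs_vt_end(b, cs_defer(BITFIELD_BIT(1), 2));    // wait on SB1, signal SB2
*    // ...
*    cs_wait_slot(b, 2);    // wait for the deferred operation to complete
*/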
static inline bool
cs_instr_is_asynchronous(enum mali_cs_opcode opcode, uint16_t wait_mask)
{
switch (opcode) {
case MALI_CS_OPCODE_FLUSH_CACHE2:
case MALI_CS_OPCODE_FINISH_TILING:
case MALI_CS_OPCODE_LOAD_MULTIPLE:
case MALI_CS_OPCODE_STORE_MULTIPLE:
case MALI_CS_OPCODE_RUN_COMPUTE:
case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT:
case MALI_CS_OPCODE_RUN_FRAGMENT:
case MALI_CS_OPCODE_RUN_FULLSCREEN:
#if PAN_ARCH >= 12
case MALI_CS_OPCODE_RUN_IDVS2:
#else
case MALI_CS_OPCODE_RUN_IDVS:
#if PAN_ARCH == 10
case MALI_CS_OPCODE_RUN_TILING:
#endif
#endif
/* Always asynchronous. */
return true;
case MALI_CS_OPCODE_FINISH_FRAGMENT:
case MALI_CS_OPCODE_SYNC_ADD32:
case MALI_CS_OPCODE_SYNC_SET32:
case MALI_CS_OPCODE_SYNC_ADD64:
case MALI_CS_OPCODE_SYNC_SET64:
case MALI_CS_OPCODE_STORE_STATE:
case MALI_CS_OPCODE_TRACE_POINT:
case MALI_CS_OPCODE_HEAP_OPERATION:
#if PAN_ARCH >= 11
case MALI_CS_OPCODE_SHARED_SB_INC:
#endif
/* Asynchronous only if wait_mask != 0. */
return wait_mask != 0;
default:
return false;
}
}
/* TODO: was the signal_slot comparison bugged? */
#if PAN_ARCH == 10
#define cs_apply_async(I, async) \
do { \
I.wait_mask = async.wait_mask; \
I.signal_slot = cs_instr_is_asynchronous(I.opcode, I.wait_mask) \
? async.signal_slot \
: 0; \
assert(I.signal_slot != 0xff || \
!"Can't use cs_now() on pure async instructions"); \
} while (0)
#else
#define cs_apply_async(I, async) \
do { \
if (async.indirect) { \
I.defer_mode = MALI_CS_DEFER_MODE_DEFER_INDIRECT; \
} else { \
I.defer_mode = MALI_CS_DEFER_MODE_DEFER_IMMEDIATE; \
I.wait_mask = async.wait_mask; \
I.signal_slot = cs_instr_is_asynchronous(I.opcode, I.wait_mask) \
? async.signal_slot \
: 0; \
assert(I.signal_slot != 0xff || \
!"Can't use cs_now() on pure async instructions"); \
} \
} while (0)
#endif
static inline void
cs_move32_to(struct cs_builder *b, struct cs_index dest, unsigned imm)
{
cs_emit(b, MOVE32, I) {
I.destination = cs_dst32(b, dest);
I.immediate = imm;
}
}
static inline void
cs_move48_to(struct cs_builder *b, struct cs_index dest, uint64_t imm)
{
cs_emit(b, MOVE48, I) {
I.destination = cs_dst64(b, dest);
I.immediate = imm;
}
}
static inline void
cs_load_ip_to(struct cs_builder *b, struct cs_index dest)
{
/* If a LOAD_IP instruction is emitted after a cs_if_end(), it flushes the
* pending if, making any subsequent cs_else_start() call invalid.
*/
cs_flush_pending_if(b);
if (likely(cs_cur_block(b) == NULL)) {
if (!cs_reserve_instrs(b, 2))
return;
/* We make IP point to the instruction right after our MOVE. */
uint64_t ip =
b->cur_chunk.buffer.gpu + (sizeof(uint64_t) * (b->cur_chunk.pos + 1));
cs_move48_to(b, dest, ip);
} else {
cs_move48_to(b, dest, b->blocks.last_load_ip_target);
b->blocks.last_load_ip_target =
util_dynarray_num_elements(&b->blocks.instrs, uint64_t);
}
}
static inline void
cs_wait_slots(struct cs_builder *b, unsigned wait_mask)
{
struct cs_load_store_tracker *ls_tracker = b->cur_ls_tracker;
assert(ls_tracker != NULL);
cs_emit(b, WAIT, I) {
I.wait_mask = wait_mask;
}
/* We don't do advanced tracking of cs_defer(), and assume that
* load/store will be flushed with an explicit wait on the load/store
* scoreboard. */
if (wait_mask & BITFIELD_BIT(b->conf.ls_sb_slot)) {
BITSET_CLEAR_RANGE(ls_tracker->pending_loads, 0, 255);
ls_tracker->pending_stores = false;
}
}
static inline void
cs_wait_slot(struct cs_builder *b, unsigned slot)
{
assert(slot < CS_MAX_SB_COUNT && "invalid slot");
cs_wait_slots(b, BITFIELD_BIT(slot));
}
static inline void
cs_flush_load_to(struct cs_builder *b, struct cs_index to, uint16_t mask)
{
struct cs_load_store_tracker *ls_tracker = b->cur_ls_tracker;
assert(ls_tracker != NULL);
unsigned count = util_last_bit(mask);
unsigned reg = cs_to_reg_tuple(to, count);
for (unsigned i = reg; i < reg + count; i++) {
if ((mask & BITFIELD_BIT(i - reg)) &&
BITSET_TEST(ls_tracker->pending_loads, i)) {
cs_wait_slots(b, BITFIELD_BIT(b->conf.ls_sb_slot));
break;
}
}
}
static inline void
cs_flush_loads(struct cs_builder *b)
{
struct cs_load_store_tracker *ls_tracker = b->cur_ls_tracker;
assert(ls_tracker != NULL);
if (!BITSET_IS_EMPTY(ls_tracker->pending_loads))
cs_wait_slots(b, BITFIELD_BIT(b->conf.ls_sb_slot));
}
static inline void
cs_flush_stores(struct cs_builder *b)
{
struct cs_load_store_tracker *ls_tracker = b->cur_ls_tracker;
assert(ls_tracker != NULL);
if (ls_tracker->pending_stores)
cs_wait_slots(b, BITFIELD_BIT(b->conf.ls_sb_slot));
}
static inline void
cs_block_start(struct cs_builder *b, struct cs_block *block)
{
cs_flush_pending_if(b);
block->next = b->blocks.stack;
b->blocks.stack = block;
}
static inline void
cs_block_end(struct cs_builder *b, struct cs_block *block)
{
cs_flush_pending_if(b);
assert(cs_cur_block(b) == block);
b->blocks.stack = block->next;
cs_flush_block_instrs(b);
}
static inline void
cs_branch(struct cs_builder *b, int offset, enum mali_cs_condition cond,
struct cs_index val)
{
cs_emit(b, BRANCH, I) {
I.offset = offset;
I.condition = cond;
I.value = cs_src32(b, val);
}
}
static inline void
cs_branch_label_cond32(struct cs_builder *b, struct cs_label *label,
enum mali_cs_condition cond, struct cs_index val)
{
assert(cs_cur_block(b) != NULL);
/* Call cs_src before cs_block_next_pos because cs_src can emit an extra
* WAIT instruction if there is a pending load.
*/
uint32_t val_reg = cond != MALI_CS_CONDITION_ALWAYS ? cs_src32(b, val) : 0;
if (label->target == CS_LABEL_INVALID_POS) {
uint32_t branch_ins_pos = cs_block_next_pos(b);
/* Instead of emitting a BRANCH with the final offset, we record the
* distance between the current branch and the previous branch that was
* referencing this unset label. This way we build a singly linked list
* that can be walked when the label is set with cs_set_label().
* We use -1 as the end-of-list marker.
*/
int16_t offset = -1;
if (label->last_forward_ref != CS_LABEL_INVALID_POS) {
assert(label->last_forward_ref < branch_ins_pos);
assert(branch_ins_pos - label->last_forward_ref <= INT16_MAX);
offset = branch_ins_pos - label->last_forward_ref;
}
cs_emit(b, BRANCH, I) {
I.offset = offset;
I.condition = cond;
I.value = val_reg;
}
label->last_forward_ref = branch_ins_pos;
} else {
int32_t offset = label->target - cs_block_next_pos(b) - 1;
/* The branch target is encoded in a 16-bit signed integer; make sure we
* don't underflow.
*/
assert(offset >= INT16_MIN);
/* Backward references are easy, we can emit them immediately. */
cs_emit(b, BRANCH, I) {
I.offset = offset;
I.condition = cond;
I.value = val_reg;
}
}
}
static inline enum mali_cs_condition
cs_invert_cond(enum mali_cs_condition cond)
{
switch (cond) {
case MALI_CS_CONDITION_LEQUAL:
return MALI_CS_CONDITION_GREATER;
case MALI_CS_CONDITION_EQUAL:
return MALI_CS_CONDITION_NEQUAL;
case MALI_CS_CONDITION_LESS:
return MALI_CS_CONDITION_GEQUAL;
case MALI_CS_CONDITION_GREATER:
return MALI_CS_CONDITION_LEQUAL;
case MALI_CS_CONDITION_NEQUAL:
return MALI_CS_CONDITION_EQUAL;
case MALI_CS_CONDITION_GEQUAL:
return MALI_CS_CONDITION_LESS;
case MALI_CS_CONDITION_ALWAYS:
UNREACHABLE("cannot invert ALWAYS");
default:
UNREACHABLE("invalid cond");
}
}
static inline void
cs_branch_label_cond64(struct cs_builder *b, struct cs_label *label,
enum mali_cs_condition cond, struct cs_index val)
{
struct cs_label false_label;
cs_label_init(&false_label);
struct cs_index val_lo = cs_extract32(b, val, 0);
struct cs_index val_hi = cs_extract32(b, val, 1);
switch (cond) {
case MALI_CS_CONDITION_ALWAYS:
cs_branch_label_cond32(b, label, MALI_CS_CONDITION_ALWAYS, cs_undef());
break;
case MALI_CS_CONDITION_LEQUAL:
cs_branch_label_cond32(b, label, MALI_CS_CONDITION_LESS, val_hi);
cs_branch_label_cond32(b, &false_label, MALI_CS_CONDITION_NEQUAL, val_hi);
cs_branch_label_cond32(b, label, MALI_CS_CONDITION_EQUAL, val_lo);
break;
case MALI_CS_CONDITION_GREATER:
cs_branch_label_cond32(b, &false_label, MALI_CS_CONDITION_LESS, val_hi);
cs_branch_label_cond32(b, label, MALI_CS_CONDITION_NEQUAL, val_hi);
cs_branch_label_cond32(b, label, MALI_CS_CONDITION_NEQUAL, val_lo);
break;
case MALI_CS_CONDITION_GEQUAL:
cs_branch_label_cond32(b, &false_label, MALI_CS_CONDITION_LESS, val_hi);
cs_branch_label_cond32(b, label, MALI_CS_CONDITION_ALWAYS, cs_undef());
break;
case MALI_CS_CONDITION_LESS:
cs_branch_label_cond32(b, label, MALI_CS_CONDITION_LESS, val_hi);
break;
case MALI_CS_CONDITION_NEQUAL:
cs_branch_label_cond32(b, label, MALI_CS_CONDITION_NEQUAL, val_lo);
cs_branch_label_cond32(b, label, MALI_CS_CONDITION_NEQUAL, val_hi);
break;
case MALI_CS_CONDITION_EQUAL:
cs_branch_label_cond32(b, &false_label, MALI_CS_CONDITION_NEQUAL, val_lo);
cs_branch_label_cond32(b, label, MALI_CS_CONDITION_EQUAL, val_hi);
break;
default:
UNREACHABLE("unsupported 64bit condition");
}
cs_set_label(b, &false_label);
}
static inline void
cs_branch_label(struct cs_builder *b, struct cs_label *label,
enum mali_cs_condition cond, struct cs_index val)
{
if (val.size == 2) {
cs_branch_label_cond64(b, label, cond, val);
} else {
cs_branch_label_cond32(b, label, cond, val);
}
}
static inline struct cs_if_else *
cs_if_start(struct cs_builder *b, struct cs_if_else *if_else,
enum mali_cs_condition cond, struct cs_index val)
{
cs_block_start(b, &if_else->block);
cs_label_init(&if_else->end_label);
cs_branch_label(b, &if_else->end_label, cs_invert_cond(cond), val);
if_else->orig_ls_state = b->cur_ls_tracker;
if_else->ls_state = *if_else->orig_ls_state;
b->cur_ls_tracker = &if_else->ls_state;
return if_else;
}
static inline void
cs_if_end(struct cs_builder *b, struct cs_if_else *if_else)
{
assert(cs_cur_block(b) == &if_else->block);
b->blocks.pending_if.block.next = if_else->block.next;
b->blocks.stack = &b->blocks.pending_if.block;
b->blocks.pending_if.end_label = if_else->end_label;
b->blocks.pending_if.orig_ls_state = if_else->orig_ls_state;
b->blocks.pending_if.ls_state = if_else->ls_state;
BITSET_OR(if_else->orig_ls_state->pending_loads,
if_else->orig_ls_state->pending_loads,
if_else->ls_state.pending_loads);
if_else->orig_ls_state->pending_stores |= if_else->ls_state.pending_stores;
b->cur_ls_tracker = if_else->orig_ls_state;
}
static inline struct cs_if_else *
cs_else_start(struct cs_builder *b, struct cs_if_else *if_else)
{
assert(cs_cur_block(b) == &b->blocks.pending_if.block);
if_else->block.next = b->blocks.pending_if.block.next;
b->blocks.stack = &if_else->block;
cs_label_init(&if_else->end_label);
cs_branch_label(b, &if_else->end_label, MALI_CS_CONDITION_ALWAYS,
cs_undef());
cs_set_label(b, &b->blocks.pending_if.end_label);
cs_label_init(&b->blocks.pending_if.end_label);
/* Restore the ls_tracker state from before the if block. */
if_else->orig_ls_state = b->blocks.pending_if.orig_ls_state;
if_else->ls_state = *if_else->orig_ls_state;
b->cur_ls_tracker = &if_else->ls_state;
return if_else;
}
static inline void
cs_else_end(struct cs_builder *b, struct cs_if_else *if_else)
{
struct cs_load_store_tracker if_ls_state = b->blocks.pending_if.ls_state;
cs_set_label(b, &if_else->end_label);
cs_block_end(b, &if_else->block);
BITSET_OR(if_else->orig_ls_state->pending_loads, if_ls_state.pending_loads,
if_else->ls_state.pending_loads);
if_else->orig_ls_state->pending_stores =
if_ls_state.pending_stores | if_else->ls_state.pending_stores;
b->cur_ls_tracker = if_else->orig_ls_state;
}
#define cs_if(__b, __cond, __val) \
for (struct cs_if_else __storage, \
*__if_else = cs_if_start(__b, &__storage, __cond, __val); \
__if_else != NULL; cs_if_end(__b, __if_else), __if_else = NULL)
#define cs_else(__b) \
for (struct cs_if_else __storage, \
*__if_else = cs_else_start(__b, &__storage); \
__if_else != NULL; cs_else_end(__b, __if_else), __if_else = NULL)
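/*
* Usage sketch (cond and dst are placeholder 32-bit registers; the body of
* cs_if() runs when the condition holds):
*
*    cs_if(b, MALI_CS_CONDITION_GREATER, cond) {
*       cs_move32_to(b, dst, 1);
*    }
*    cs_else(b) {
*       cs_move32_to(b, dst, 0);
*    }
*/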
struct cs_loop {
struct cs_label start, end;
struct cs_block block;
enum mali_cs_condition cond;
struct cs_index val;
struct cs_load_store_tracker *orig_ls_state;
/* On continue we need to compare the original loads to the current ones.
* orig_ls_state can get updated from inside the loop. */
struct cs_load_store_tracker orig_ls_state_copy;
struct cs_load_store_tracker ls_state;
};
static inline void
cs_loop_diverge_ls_update(struct cs_builder *b, struct cs_loop *loop)
{
assert(loop->orig_ls_state);
BITSET_OR(loop->orig_ls_state->pending_loads,
loop->orig_ls_state->pending_loads,
b->cur_ls_tracker->pending_loads);
loop->orig_ls_state->pending_stores |= b->cur_ls_tracker->pending_stores;
}
static inline bool
cs_loop_continue_need_flush(struct cs_builder *b, struct cs_loop *loop)
{
assert(loop->orig_ls_state);
/* We need to flush on continue/loop-again if there are pending loads to
* registers that did not already have loads in flight before the loop.
* Registers that already had loads in flight would have gotten their WAITs
* inserted when they were first marked.
*/
BITSET_DECLARE(new_pending_loads, 256);
BITSET_ANDNOT(new_pending_loads, b->cur_ls_tracker->pending_loads,
loop->orig_ls_state_copy.pending_loads);
return !BITSET_IS_EMPTY(new_pending_loads);
}
static inline struct cs_loop *
cs_loop_init(struct cs_builder *b, struct cs_loop *loop,
enum mali_cs_condition cond, struct cs_index val)
{
*loop = (struct cs_loop){
.cond = cond,
.val = val,
};
cs_block_start(b, &loop->block);
cs_label_init(&loop->start);
cs_label_init(&loop->end);
return loop;
}
static inline struct cs_loop *
cs_while_start(struct cs_builder *b, struct cs_loop *loop,
enum mali_cs_condition cond, struct cs_index val)
{
cs_loop_init(b, loop, cond, val);
/* Do an initial check on the condition, and if it's false, jump to
* the end of the loop block. For 'while(true)' loops, skip the
* conditional branch.
*/
if (cond != MALI_CS_CONDITION_ALWAYS)
cs_branch_label(b, &loop->end, cs_invert_cond(cond), val);
/* The loop's ls tracker only needs to track the actual loop body, not the
* check to skip the whole body. */
loop->orig_ls_state = b->cur_ls_tracker;
loop->orig_ls_state_copy = *loop->orig_ls_state;
loop->ls_state = *loop->orig_ls_state;
b->cur_ls_tracker = &loop->ls_state;
cs_set_label(b, &loop->start);
return loop;
}
static inline void
cs_loop_conditional_continue(struct cs_builder *b, struct cs_loop *loop,
enum mali_cs_condition cond, struct cs_index val)
{
cs_flush_pending_if(b);
if (cs_loop_continue_need_flush(b, loop))
cs_flush_loads(b);
cs_branch_label(b, &loop->start, cond, val);
cs_loop_diverge_ls_update(b, loop);
}
static inline void
cs_loop_conditional_break(struct cs_builder *b, struct cs_loop *loop,
enum mali_cs_condition cond, struct cs_index val)
{
cs_flush_pending_if(b);
cs_branch_label(b, &loop->end, cond, val);
cs_loop_diverge_ls_update(b, loop);
}
static inline void
cs_while_end(struct cs_builder *b, struct cs_loop *loop)
{
cs_flush_pending_if(b);
if (cs_loop_continue_need_flush(b, loop))
cs_flush_loads(b);
cs_branch_label(b, &loop->start, loop->cond, loop->val);
cs_set_label(b, &loop->end);
cs_block_end(b, &loop->block);
if (unlikely(loop->orig_ls_state)) {
BITSET_OR(loop->orig_ls_state->pending_loads,
loop->orig_ls_state->pending_loads,
loop->ls_state.pending_loads);
loop->orig_ls_state->pending_stores |= loop->ls_state.pending_stores;
b->cur_ls_tracker = loop->orig_ls_state;
}
}
#define cs_while(__b, __cond, __val) \
for (struct cs_loop __loop_storage, \
*__loop = cs_while_start(__b, &__loop_storage, __cond, __val); \
__loop != NULL; cs_while_end(__b, __loop), __loop = NULL)
#define cs_continue(__b) \
cs_loop_conditional_continue(__b, __loop, MALI_CS_CONDITION_ALWAYS, \
cs_undef())
#define cs_break(__b) \
cs_loop_conditional_break(__b, __loop, MALI_CS_CONDITION_ALWAYS, cs_undef())
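/*
* Usage sketch: loop until a counter register reaches zero (counter is a
* placeholder 32-bit register):
*
*    cs_move32_to(b, counter, 8);
*    cs_while(b, MALI_CS_CONDITION_NEQUAL, counter) {
*       // ... loop body ...
*       cs_add32(b, counter, counter, -1);
*    }
*/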
/* cs_maybe is an abstraction for retroactively patching cs contents. When the
* block is closed, its original contents are recorded and then replaced with
* NOP instructions. The caller can then use cs_patch_maybe() to restore the
* original contents at a later point. This can be useful in situations where
* not enough information is available during recording to determine what
* instructions should be emitted at the time, but it will be known at some
* point before submission. */
struct cs_maybe_state {
struct cs_block block;
uint32_t patch_pos;
struct cs_load_store_tracker ls_state;
struct cs_load_store_tracker *orig_ls_state;
};
static inline struct cs_maybe_state *
cs_maybe_start(struct cs_builder *b, struct cs_maybe_state *state)
{
cs_block_start(b, &state->block);
state->patch_pos = cs_block_next_pos(b);
/* store the original ls_tracker state so that we can revert to it after
* the maybe-block */
state->orig_ls_state = b->cur_ls_tracker;
state->ls_state = *b->cur_ls_tracker;
b->cur_ls_tracker = &state->ls_state;
return state;
}
static inline void
cs_maybe_end(struct cs_builder *b, struct cs_maybe_state *state,
struct cs_maybe **maybe)
{
assert(cs_cur_block(b) == &state->block);
/* Flush any new loads and stores */
BITSET_DECLARE(new_loads, 256);
BITSET_ANDNOT(new_loads, b->cur_ls_tracker->pending_loads,
state->orig_ls_state->pending_loads);
bool new_stores =
b->cur_ls_tracker->pending_stores && !state->orig_ls_state->pending_stores;
if (!BITSET_IS_EMPTY(new_loads) || new_stores)
cs_wait_slots(b, BITFIELD_BIT(b->conf.ls_sb_slot));
/* Restore the original ls tracker state */
b->cur_ls_tracker = state->orig_ls_state;
uint32_t num_instrs = cs_block_next_pos(b) - state->patch_pos;
size_t size = num_instrs * sizeof(uint64_t);
uint64_t *instrs = (uint64_t *) b->blocks.instrs.data + state->patch_pos;
if (!b->maybe_ctx)
b->maybe_ctx = ralloc_context(NULL);
*maybe = (struct cs_maybe *)
ralloc_size(b->maybe_ctx, sizeof(struct cs_maybe) + size);
(*maybe)->next_pending = b->blocks.pending_maybes;
b->blocks.pending_maybes = *maybe;
(*maybe)->patch_pos = state->patch_pos;
(*maybe)->num_instrs = num_instrs;
/* patch_addr will be computed later in cs_flush_block_instrs, when the
* outermost block is closed */
(*maybe)->patch_addr = NULL;
/* Save the emitted instructions in the patch block */
memcpy((*maybe)->instrs, instrs, size);
/* Replace instructions in the patch block with NOPs */
memset(instrs, 0, size);
cs_block_end(b, &state->block);
}
#define cs_maybe(__b, __maybe) \
for (struct cs_maybe_state __storage, \
*__state = cs_maybe_start(__b, &__storage); \
__state != NULL; cs_maybe_end(__b, __state, __maybe), \
__state = NULL)
/* Must be called before cs_end */
static inline void
cs_patch_maybe(struct cs_builder *b, struct cs_maybe *maybe)
{
if (maybe->patch_addr) {
/* Called after outer block was closed */
memcpy(maybe->patch_addr, maybe->instrs,
maybe->num_instrs * sizeof(uint64_t));
} else {
/* Called before outer block was closed */
memcpy((uint64_t *)b->blocks.instrs.data + maybe->patch_pos,
maybe->instrs, maybe->num_instrs * sizeof(uint64_t));
}
}
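/*
* Usage sketch: record instructions whose need is only known later, and patch
* them back in before submission (reg and need_it are placeholders):
*
*    struct cs_maybe *maybe;
*
*    cs_maybe(b, &maybe) {
*       cs_move32_to(b, reg, 1);    // recorded, then replaced by NOPs
*    }
*    // ... later, but before cs_end() ...
*    if (need_it)
*       cs_patch_maybe(b, maybe);   // restore the recorded instructions
*/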
struct cs_single_link_list_node {
uint64_t next;
};
struct cs_single_link_list {
uint64_t head;
uint64_t tail;
};
/* Pseudoinstructions follow */
static inline void
cs_move64_to(struct cs_builder *b, struct cs_index dest, uint64_t imm)
{
if (imm < (1ull << 48)) {
/* Zero extends */
cs_move48_to(b, dest, imm);
} else {
cs_move32_to(b, cs_extract32(b, dest, 0), imm);
cs_move32_to(b, cs_extract32(b, dest, 1), imm >> 32);
}
}
struct cs_shader_res_sel {
uint8_t srt, fau, spd, tsd;
};
static inline struct cs_shader_res_sel
cs_shader_res_sel(uint8_t srt, uint8_t fau, uint8_t spd, uint8_t tsd)
{
return (struct cs_shader_res_sel){
.srt = srt,
.fau = fau,
.spd = spd,
.tsd = tsd,
};
}
static inline void
cs_run_compute(struct cs_builder *b, unsigned task_increment,
enum mali_task_axis task_axis, struct cs_shader_res_sel res_sel)
{
/* Staging regs */
cs_flush_loads(b);
cs_emit(b, RUN_COMPUTE, I) {
I.task_increment = task_increment;
I.task_axis = task_axis;
I.srt_select = res_sel.srt;
I.spd_select = res_sel.spd;
I.tsd_select = res_sel.tsd;
I.fau_select = res_sel.fau;
}
}
#if PAN_ARCH == 10
static inline void
cs_run_tiling(struct cs_builder *b, uint32_t flags_override,
struct cs_shader_res_sel res_sel)
{
/* Staging regs */
cs_flush_loads(b);
cs_emit(b, RUN_TILING, I) {
I.flags_override = flags_override;
I.srt_select = res_sel.srt;
I.spd_select = res_sel.spd;
I.tsd_select = res_sel.tsd;
I.fau_select = res_sel.fau;
}
}
#endif
#if PAN_ARCH >= 12
static inline void
cs_run_idvs2(struct cs_builder *b, uint32_t flags_override, bool malloc_enable,
struct cs_index draw_id,
enum mali_idvs_shading_mode vertex_shading_mode)
{
/* Staging regs */
cs_flush_loads(b);
cs_emit(b, RUN_IDVS2, I) {
I.flags_override = flags_override;
I.malloc_enable = malloc_enable;
I.vertex_shading_mode = vertex_shading_mode;
if (draw_id.type == CS_INDEX_UNDEF) {
I.draw_id_register_enable = false;
} else {
I.draw_id_register_enable = true;
I.draw_id = cs_src32(b, draw_id);
}
}
}
#else
static inline void
cs_run_idvs(struct cs_builder *b, uint32_t flags_override, bool malloc_enable,
struct cs_shader_res_sel varying_sel,
struct cs_shader_res_sel frag_sel, struct cs_index draw_id)
{
/* Staging regs */
cs_flush_loads(b);
cs_emit(b, RUN_IDVS, I) {
I.flags_override = flags_override;
I.malloc_enable = malloc_enable;
if (draw_id.type == CS_INDEX_UNDEF) {
I.draw_id_register_enable = false;
} else {
I.draw_id_register_enable = true;
I.draw_id = cs_src32(b, draw_id);
}
assert(varying_sel.spd == 1);
assert(varying_sel.fau == 0 || varying_sel.fau == 1);
assert(varying_sel.srt == 0 || varying_sel.srt == 1);
assert(varying_sel.tsd == 0 || varying_sel.tsd == 1);
I.varying_fau_select = varying_sel.fau == 1;
I.varying_srt_select = varying_sel.srt == 1;
I.varying_tsd_select = varying_sel.tsd == 1;
assert(frag_sel.spd == 2);
assert(frag_sel.fau == 2);
assert(frag_sel.srt == 2 || frag_sel.srt == 0);
assert(frag_sel.tsd == 2 || frag_sel.tsd == 0);
I.fragment_srt_select = frag_sel.srt == 2;
I.fragment_tsd_select = frag_sel.tsd == 2;
}
}
#endif
static inline void
cs_run_fragment(struct cs_builder *b, bool enable_tem,
enum mali_tile_render_order tile_order)
{
/* Staging regs */
cs_flush_loads(b);
cs_emit(b, RUN_FRAGMENT, I) {
I.enable_tem = enable_tem;
I.tile_order = tile_order;
}
}
static inline void
cs_run_fullscreen(struct cs_builder *b, uint32_t flags_override,
struct cs_index dcd)
{
/* Staging regs */
cs_flush_loads(b);
cs_emit(b, RUN_FULLSCREEN, I) {
I.flags_override = flags_override;
I.dcd = cs_src64(b, dcd);
}
}
static inline void
cs_finish_tiling(struct cs_builder *b)
{
cs_emit(b, FINISH_TILING, I)
;
}
static inline void
cs_finish_fragment(struct cs_builder *b, bool increment_frag_completed,
struct cs_index first_free_heap_chunk,
struct cs_index last_free_heap_chunk,
struct cs_async_op async)
{
cs_emit(b, FINISH_FRAGMENT, I) {
I.increment_fragment_completed = increment_frag_completed;
cs_apply_async(I, async);
I.first_heap_chunk = cs_src64(b, first_free_heap_chunk);
I.last_heap_chunk = cs_src64(b, last_free_heap_chunk);
}
}
static inline void
cs_add32(struct cs_builder *b, struct cs_index dest, struct cs_index src,
int32_t imm)
{
cs_emit(b, ADD_IMM32, I) {
I.destination = cs_dst32(b, dest);
I.source = cs_src32(b, src);
I.immediate = imm;
}
}
static inline void
cs_add64(struct cs_builder *b, struct cs_index dest, struct cs_index src,
int32_t imm)
{
cs_emit(b, ADD_IMM64, I) {
I.destination = cs_dst64(b, dest);
I.source = cs_src64(b, src);
I.immediate = imm;
}
}
static inline void
cs_umin32(struct cs_builder *b, struct cs_index dest, struct cs_index src1,
struct cs_index src2)
{
cs_emit(b, UMIN32, I) {
I.destination = cs_dst32(b, dest);
I.source_1 = cs_src32(b, src1);
I.source_0 = cs_src32(b, src2);
}
}
#if PAN_ARCH >= 11
static inline void
cs_and32(struct cs_builder *b, struct cs_index dest, struct cs_index src1,
struct cs_index src2)
{
cs_emit(b, AND32, I) {
I.destination = cs_dst32(b, dest);
I.source_1 = cs_src32(b, src1);
I.source_0 = cs_src32(b, src2);
}
}
static inline void
cs_or32(struct cs_builder *b, struct cs_index dest, struct cs_index src1,
struct cs_index src2)
{
cs_emit(b, OR32, I) {
I.destination = cs_dst32(b, dest);
I.source_1 = cs_src32(b, src1);
I.source_0 = cs_src32(b, src2);
}
}
static inline void
cs_xor32(struct cs_builder *b, struct cs_index dest, struct cs_index src1,
struct cs_index src2)
{
cs_emit(b, XOR32, I) {
I.destination = cs_dst32(b, dest);
I.source_1 = cs_src32(b, src1);
I.source_0 = cs_src32(b, src2);
}
}
static inline void
cs_not32(struct cs_builder *b, struct cs_index dest, struct cs_index src)
{
cs_emit(b, NOT32, I) {
I.destination = cs_dst32(b, dest);
I.source = cs_src32(b, src);
}
}
static inline void
cs_bit_set32(struct cs_builder *b, struct cs_index dest, struct cs_index src1,
struct cs_index src2)
{
cs_emit(b, BIT_SET32, I) {
I.destination = cs_dst32(b, dest);
I.source_0 = cs_src32(b, src1);
I.source_1 = cs_src32(b, src2);
}
}
static inline void
cs_bit_clear32(struct cs_builder *b, struct cs_index dest, struct cs_index src1,
struct cs_index src2)
{
cs_emit(b, BIT_CLEAR32, I) {
I.destination = cs_dst32(b, dest);
I.source_1 = cs_src32(b, src1);
I.source_0 = cs_src32(b, src2);
}
}
static inline void
cs_move_reg32(struct cs_builder *b, struct cs_index dest, struct cs_index src)
{
cs_emit(b, MOVE_REG32, I) {
I.destination = cs_dst32(b, dest);
I.source = cs_src32(b, src);
}
}
static inline void
cs_set_state(struct cs_builder *b, enum mali_cs_set_state_type state,
struct cs_index src)
{
cs_emit(b, SET_STATE, I) {
I.state = state;
I.source = cs_src32(b, src);
}
}
static inline void
cs_next_sb_entry(struct cs_builder *b, struct cs_index dest,
enum mali_cs_scoreboard_type sb_type,
enum mali_cs_next_sb_entry_format format)
{
cs_emit(b, NEXT_SB_ENTRY, I) {
I.destination = cs_dst32(b, dest);
I.sb_type = sb_type;
I.format = format;
}
}
/*
* Wait indirectly on a scoreboard (set via SET_STATE.SB_MASK_WAIT)
*/
static inline void
cs_wait_indirect(struct cs_builder *b)
{
cs_emit(b, WAIT, I) {
I.wait_mode = MALI_CS_WAIT_MODE_INDIRECT;
}
}
#endif
static inline void
cs_load_to(struct cs_builder *b, struct cs_index dest, struct cs_index address,
unsigned mask, int offset)
{
unsigned count = util_last_bit(mask);
unsigned base_reg = cs_dst_tuple(b, dest, count, mask);
cs_emit(b, LOAD_MULTIPLE, I) {
I.base_register = base_reg;
I.address = cs_src64(b, address);
I.mask = mask;
I.offset = offset;
}
for (unsigned i = 0; i < count; i++) {
if (mask & BITFIELD_BIT(i))
BITSET_SET(b->cur_ls_tracker->pending_loads, base_reg + i);
}
}
static inline void
cs_load32_to(struct cs_builder *b, struct cs_index dest,
struct cs_index address, int offset)
{
cs_load_to(b, dest, address, BITFIELD_MASK(1), offset);
}
static inline void
cs_load64_to(struct cs_builder *b, struct cs_index dest,
struct cs_index address, int offset)
{
cs_load_to(b, dest, address, BITFIELD_MASK(2), offset);
}
static inline void
cs_store(struct cs_builder *b, struct cs_index data, struct cs_index address,
unsigned mask, int offset)
{
unsigned count = util_last_bit(mask);
unsigned base_reg = cs_src_tuple(b, data, count, mask);
cs_emit(b, STORE_MULTIPLE, I) {
I.base_register = base_reg;
I.address = cs_src64(b, address);
I.mask = mask;
I.offset = offset;
}
for (unsigned i = 0; i < count; i++) {
b->cur_ls_tracker->pending_stores |= mask & BITFIELD_BIT(i);
}
}
static inline void
cs_store32(struct cs_builder *b, struct cs_index data, struct cs_index address,
int offset)
{
cs_store(b, data, address, BITFIELD_MASK(1), offset);
}
static inline void
cs_store64(struct cs_builder *b, struct cs_index data, struct cs_index address,
int offset)
{
cs_store(b, data, address, BITFIELD_MASK(2), offset);
}
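/*
* Sketch: store a 64-bit register pair, then wait for the store to land
* before reusing the source registers (data/addr are placeholder register
* tuples):
*
*    cs_store64(b, data, addr, 0);
*    cs_flush_stores(b);        // WAIT on ls_sb_slot
*    cs_move64_to(b, data, 0);  // now safe to overwrite 'data'
*/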
#if PAN_ARCH < 11
/*
* Select which scoreboard entry will track endpoint tasks and other tasks
* respectively. Pass the slot to cs_wait_slot() to wait on it later.
*/
static inline void
cs_set_scoreboard_entry(struct cs_builder *b, unsigned ep, unsigned other)
{
assert(ep < CS_MAX_SB_COUNT && "invalid slot");
assert(other < CS_MAX_SB_COUNT && "invalid slot");
cs_emit(b, SET_SB_ENTRY, I) {
I.endpoint_entry = ep;
I.other_entry = other;
}
/* We assume the load/store scoreboard entry is static to keep things
* simple. */
assert(b->conf.ls_sb_slot == other);
}
#else
static inline void
cs_set_state_imm32(struct cs_builder *b, enum mali_cs_set_state_type state,
unsigned value)
{
cs_emit(b, SET_STATE_IMM32, I) {
I.state = state;
I.value = value;
}
/* We assume the load/store scoreboard entry is static to keep things
* simple. */
if (state == MALI_CS_SET_STATE_TYPE_SB_SEL_OTHER)
assert(b->conf.ls_sb_slot == value);
}
#endif
/*
* Select which scoreboard entry will track endpoint tasks.
* On v10, this also sets the 'other' scoreboard entry to SB0.
* Pass the slot to cs_wait_slot() to wait on it later.
*/
static inline void
cs_select_sb_entries_for_async_ops(struct cs_builder *b, unsigned ep)
{
#if PAN_ARCH == 10
cs_set_scoreboard_entry(b, ep, 0);
#else
cs_set_state_imm32(b, MALI_CS_SET_STATE_TYPE_SB_SEL_ENDPOINT, ep);
#endif
}
static inline void
cs_set_exception_handler(struct cs_builder *b,
enum mali_cs_exception_type exception_type,
struct cs_index address, struct cs_index length)
{
cs_emit(b, SET_EXCEPTION_HANDLER, I) {
I.exception_type = exception_type;
I.address = cs_src64(b, address);
I.length = cs_src32(b, length);
}
}
static inline void
cs_call(struct cs_builder *b, struct cs_index address, struct cs_index length)
{
cs_emit(b, CALL, I) {
I.address = cs_src64(b, address);
I.length = cs_src32(b, length);
}
}
static inline void
cs_jump(struct cs_builder *b, struct cs_index address, struct cs_index length)
{
cs_emit(b, JUMP, I) {
I.address = cs_src64(b, address);
I.length = cs_src32(b, length);
}
}
enum cs_res_id {
CS_COMPUTE_RES = BITFIELD_BIT(0),
CS_FRAG_RES = BITFIELD_BIT(1),
CS_TILER_RES = BITFIELD_BIT(2),
CS_IDVS_RES = BITFIELD_BIT(3),
};
static inline void
cs_req_res(struct cs_builder *b, uint32_t res_mask)
{
cs_emit(b, REQ_RESOURCE, I) {
I.compute = res_mask & CS_COMPUTE_RES;
I.tiler = res_mask & CS_TILER_RES;
I.idvs = res_mask & CS_IDVS_RES;
I.fragment = res_mask & CS_FRAG_RES;
}
}
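/*
* Sketch: resources are requested before the corresponding RUN_* commands
* and released by requesting an empty mask:
*
*    cs_req_res(b, CS_COMPUTE_RES);
*    cs_run_compute(b, ...);
*    cs_req_res(b, 0);
*/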
static inline void
cs_flush_caches(struct cs_builder *b, enum mali_cs_flush_mode l2,
enum mali_cs_flush_mode lsc,
enum mali_cs_other_flush_mode others, struct cs_index flush_id,
struct cs_async_op async)
{
cs_emit(b, FLUSH_CACHE2, I) {
I.l2_flush_mode = l2;
I.lsc_flush_mode = lsc;
I.other_flush_mode = others;
I.latest_flush_id = cs_src32(b, flush_id);
cs_apply_async(I, async);
}
}
#define CS_SYNC_OPS(__cnt_width) \
static inline void cs_sync##__cnt_width##_set( \
struct cs_builder *b, bool propagate_error, \
enum mali_cs_sync_scope scope, struct cs_index val, \
struct cs_index addr, struct cs_async_op async) \
{ \
cs_emit(b, SYNC_SET##__cnt_width, I) { \
I.error_propagate = propagate_error; \
I.scope = scope; \
I.data = cs_src##__cnt_width(b, val); \
I.address = cs_src64(b, addr); \
cs_apply_async(I, async); \
} \
} \
\
static inline void cs_sync##__cnt_width##_add( \
struct cs_builder *b, bool propagate_error, \
enum mali_cs_sync_scope scope, struct cs_index val, \
struct cs_index addr, struct cs_async_op async) \
{ \
cs_emit(b, SYNC_ADD##__cnt_width, I) { \
I.error_propagate = propagate_error; \
I.scope = scope; \
I.data = cs_src##__cnt_width(b, val); \
I.address = cs_src64(b, addr); \
cs_apply_async(I, async); \
} \
} \
\
static inline void cs_sync##__cnt_width##_wait( \
struct cs_builder *b, bool reject_error, enum mali_cs_condition cond, \
struct cs_index ref, struct cs_index addr) \
{ \
assert(cond == MALI_CS_CONDITION_LEQUAL || \
cond == MALI_CS_CONDITION_GREATER); \
cs_emit(b, SYNC_WAIT##__cnt_width, I) { \
I.error_reject = reject_error; \
I.condition = cond; \
I.data = cs_src##__cnt_width(b, ref); \
I.address = cs_src64(b, addr); \
} \
}
CS_SYNC_OPS(32)
CS_SYNC_OPS(64)
static inline void
cs_store_state(struct cs_builder *b, struct cs_index address, int offset,
enum mali_cs_state state, struct cs_async_op async)
{
cs_emit(b, STORE_STATE, I) {
I.offset = offset;
I.state = state;
I.address = cs_src64(b, address);
cs_apply_async(I, async);
}
}
static inline void
cs_prot_region(struct cs_builder *b, unsigned size)
{
cs_emit(b, PROT_REGION, I) {
I.size = size;
}
}
static inline void
cs_run_compute_indirect(struct cs_builder *b, unsigned wg_per_task,
struct cs_shader_res_sel res_sel)
{
/* Staging regs */
cs_flush_loads(b);
cs_emit(b, RUN_COMPUTE_INDIRECT, I) {
I.workgroups_per_task = wg_per_task;
I.srt_select = res_sel.srt;
I.spd_select = res_sel.spd;
I.tsd_select = res_sel.tsd;
I.fau_select = res_sel.fau;
}
}
static inline void
cs_error_barrier(struct cs_builder *b)
{
cs_emit(b, ERROR_BARRIER, _)
;
}
static inline void
cs_heap_set(struct cs_builder *b, struct cs_index address)
{
cs_emit(b, HEAP_SET, I) {
I.address = cs_src64(b, address);
}
}
static inline void
cs_heap_operation(struct cs_builder *b, enum mali_cs_heap_operation operation,
struct cs_async_op async)
{
cs_emit(b, HEAP_OPERATION, I) {
I.operation = operation;
cs_apply_async(I, async);
}
}
static inline void
cs_vt_start(struct cs_builder *b, struct cs_async_op async)
{
cs_heap_operation(b, MALI_CS_HEAP_OPERATION_VERTEX_TILER_STARTED, async);
}
static inline void
cs_vt_end(struct cs_builder *b, struct cs_async_op async)
{
cs_heap_operation(b, MALI_CS_HEAP_OPERATION_VERTEX_TILER_COMPLETED, async);
}
static inline void
cs_frag_end(struct cs_builder *b, struct cs_async_op async)
{
cs_heap_operation(b, MALI_CS_HEAP_OPERATION_FRAGMENT_COMPLETED, async);
}
static inline void
cs_trace_point(struct cs_builder *b, struct cs_index regs,
struct cs_async_op async)
{
cs_emit(b, TRACE_POINT, I) {
I.base_register =
cs_src_tuple(b, regs, regs.size, (uint16_t)BITFIELD_MASK(regs.size));
I.register_count = regs.size;
cs_apply_async(I, async);
}
}
struct cs_match {
struct cs_block block;
struct cs_label break_label;
struct cs_block case_block;
struct cs_label next_case_label;
struct cs_index val;
struct cs_index scratch_reg;
struct cs_load_store_tracker case_ls_state;
struct cs_load_store_tracker ls_state;
struct cs_load_store_tracker *orig_ls_state;
bool default_emitted;
};
static inline struct cs_match *
cs_match_start(struct cs_builder *b, struct cs_match *match,
struct cs_index val, struct cs_index scratch_reg)
{
*match = (struct cs_match){
.val = val,
.scratch_reg = scratch_reg,
.orig_ls_state = b->cur_ls_tracker,
};
cs_block_start(b, &match->block);
cs_label_init(&match->break_label);
cs_label_init(&match->next_case_label);
return match;
}
static inline void
cs_match_case_ls_set(struct cs_builder *b, struct cs_match *match)
{
if (unlikely(match->orig_ls_state)) {
match->case_ls_state = *match->orig_ls_state;
b->cur_ls_tracker = &match->case_ls_state;
}
}
static inline void
cs_match_case_ls_get(struct cs_match *match)
{
if (unlikely(match->orig_ls_state)) {
BITSET_OR(match->ls_state.pending_loads,
match->case_ls_state.pending_loads,
match->ls_state.pending_loads);
match->ls_state.pending_stores |= match->case_ls_state.pending_stores;
}
}
static inline void
cs_match_case(struct cs_builder *b, struct cs_match *match, uint32_t id)
{
assert(!match->default_emitted || !"default case must be last");
if (match->next_case_label.last_forward_ref != CS_LABEL_INVALID_POS) {
cs_branch_label(b, &match->break_label, MALI_CS_CONDITION_ALWAYS,
cs_undef());
cs_block_end(b, &match->case_block);
cs_match_case_ls_get(match);
cs_set_label(b, &match->next_case_label);
cs_label_init(&match->next_case_label);
}
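   /* Compare by subtracting the case value: the scratch register is zero iff
    * val matches id (for id == 0, val is tested directly), and the branch
    * below skips to the next case when it does not. */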
if (id)
cs_add32(b, match->scratch_reg, match->val, -id);
cs_branch_label(b, &match->next_case_label, MALI_CS_CONDITION_NEQUAL,
id ? match->scratch_reg : match->val);
cs_match_case_ls_set(b, match);
cs_block_start(b, &match->case_block);
}
static inline void
cs_match_default(struct cs_builder *b, struct cs_match *match)
{
assert(match->next_case_label.last_forward_ref != CS_LABEL_INVALID_POS ||
!"default case requires at least one other case");
cs_branch_label(b, &match->break_label, MALI_CS_CONDITION_ALWAYS,
cs_undef());
if (cs_cur_block(b) == &match->case_block) {
cs_block_end(b, &match->case_block);
cs_match_case_ls_get(match);
}
cs_set_label(b, &match->next_case_label);
cs_label_init(&match->next_case_label);
cs_match_case_ls_set(b, match);
cs_block_start(b, &match->case_block);
match->default_emitted = true;
}
static inline void
cs_match_end(struct cs_builder *b, struct cs_match *match)
{
if (cs_cur_block(b) == &match->case_block) {
cs_match_case_ls_get(match);
cs_block_end(b, &match->case_block);
}
if (unlikely(match->orig_ls_state)) {
if (!match->default_emitted) {
      /* If we don't have a default, assume we don't handle all possible cases
       * and merge the match load/store state into the original load/store
       * state.
       */
BITSET_OR(match->orig_ls_state->pending_loads,
match->ls_state.pending_loads,
match->orig_ls_state->pending_loads);
match->orig_ls_state->pending_stores |= match->ls_state.pending_stores;
} else {
*match->orig_ls_state = match->ls_state;
}
b->cur_ls_tracker = match->orig_ls_state;
}
cs_set_label(b, &match->next_case_label);
cs_set_label(b, &match->break_label);
cs_block_end(b, &match->block);
}
#define cs_match(__b, __val, __scratch) \
for (struct cs_match __match_storage, \
*__match = cs_match_start(__b, &__match_storage, __val, __scratch); \
__match != NULL; cs_match_end(__b, &__match_storage), __match = NULL)
#define cs_case(__b, __ref) \
for (bool __case_defined = ({ \
cs_match_case(__b, __match, __ref); \
false; \
}); \
!__case_defined; __case_defined = true)
#define cs_default(__b) \
for (bool __default_defined = ({ \
cs_match_default(__b, __match); \
false; \
}); \
!__default_defined; __default_defined = true)
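/* A minimal usage sketch of the match construct (register indices are
 * illustrative and assume 32-bit registers, e.g. from cs_reg32()):
 *
 *    struct cs_index val = cs_reg32(b, 4);
 *    struct cs_index scratch = cs_reg32(b, 5);
 *
 *    cs_match(b, val, scratch) {
 *       cs_case(b, 0) {
 *          // executed only when val == 0
 *       }
 *       cs_case(b, 7) {
 *          // executed only when val == 7
 *       }
 *       cs_default(b) {
 *          // executed for any other value; must come last
 *       }
 *    }
 */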
static inline void
cs_nop(struct cs_builder *b)
{
cs_emit(b, NOP, I)
;
}
struct cs_function_ctx {
struct cs_index ctx_reg;
unsigned dump_addr_offset;
};
struct cs_function {
struct cs_block block;
struct cs_dirty_tracker dirty;
struct cs_function_ctx ctx;
unsigned dump_size;
uint64_t address;
uint32_t length;
};
static inline struct cs_function *
cs_function_start(struct cs_builder *b,
struct cs_function *function,
struct cs_function_ctx ctx)
{
assert(cs_cur_block(b) == NULL);
assert(b->conf.dirty_tracker == NULL);
*function = (struct cs_function){
.ctx = ctx,
};
cs_block_start(b, &function->block);
b->conf.dirty_tracker = &function->dirty;
return function;
}
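/* One save/restore operation covers at most 16 registers (one 16-bit
 * LOAD/STORE_MULTIPLE mask), so with at most 256 registers (the upper bound
 * assumed here) we never need more than 256 / 16 ranges. */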
#define SAVE_RESTORE_MAX_OPS (256 / 16)
static inline void
cs_function_end(struct cs_builder *b,
struct cs_function *function)
{
struct cs_index ranges[SAVE_RESTORE_MAX_OPS];
uint16_t masks[SAVE_RESTORE_MAX_OPS];
unsigned num_ranges = 0;
uint32_t num_instrs =
util_dynarray_num_elements(&b->blocks.instrs, uint64_t);
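   /* Use the two topmost registers of the register file as a 64-bit scratch
    * register holding the dump buffer address loaded from the function
    * context. */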
struct cs_index addr_reg = {
.type = CS_INDEX_REGISTER,
.size = 2,
.reg = (uint8_t) (b->conf.nr_registers - 2),
};
/* Manual cs_block_end() without an instruction flush. We do that to insert
* the preamble without having to move memory in b->blocks.instrs. The flush
* will be done after the preamble has been emitted. */
assert(cs_cur_block(b) == &function->block);
assert(function->block.next == NULL);
b->blocks.stack = NULL;
if (!num_instrs)
return;
   /* Try to minimize the number of load/store operations by grouping nearby
    * dirty registers into ranges of up to 16 registers. */
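   /* For instance, with r2, r3 and r10 dirty, a single range starting at r2
    * with mask 0x103 covers all three registers with one
    * STORE_MULTIPLE/LOAD_MULTIPLE pair instead of three. */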
unsigned nregs = b->conf.nr_registers - b->conf.nr_kernel_registers;
unsigned pos, last = 0;
BITSET_FOREACH_SET(pos, function->dirty.regs, nregs) {
unsigned range = MIN2(nregs - pos, 16);
unsigned word = BITSET_BITWORD(pos);
unsigned bit = pos % BITSET_WORDBITS;
unsigned remaining_bits = BITSET_WORDBITS - bit;
if (pos < last)
continue;
masks[num_ranges] = function->dirty.regs[word] >> bit;
if (remaining_bits < range)
masks[num_ranges] |= function->dirty.regs[word + 1] << remaining_bits;
masks[num_ranges] &= BITFIELD_MASK(range);
ranges[num_ranges] =
cs_reg_tuple(b, pos, util_last_bit(masks[num_ranges]));
num_ranges++;
last = pos + range;
}
function->dump_size = BITSET_COUNT(function->dirty.regs) * sizeof(uint32_t);
   /* Make sure the current chunk is able to accommodate the block
    * instructions as well as the preamble and postamble: on top of the
    * save/restore operations themselves, add 4 instructions to cover the
    * dump-address load and the wait_slot emitted by each of the preamble
    * and postamble. */
num_instrs += (num_ranges * 2) + 4;
/* Align things on a cache-line in case the buffer contains more than one
* function (64 bytes = 8 instructions). */
uint32_t padded_num_instrs = ALIGN_POT(num_instrs, 8);
if (!cs_reserve_instrs(b, padded_num_instrs))
return;
function->address =
b->cur_chunk.buffer.gpu + (b->cur_chunk.pos * sizeof(uint64_t));
/* Preamble: backup modified registers */
if (num_ranges > 0) {
unsigned offset = 0;
cs_load64_to(b, addr_reg, function->ctx.ctx_reg,
function->ctx.dump_addr_offset);
for (unsigned i = 0; i < num_ranges; ++i) {
unsigned reg_count = util_bitcount(masks[i]);
cs_store(b, ranges[i], addr_reg, masks[i], offset);
offset += reg_count * 4;
}
cs_flush_stores(b);
}
/* Now that the preamble is emitted, we can flush the instructions we have in
* our function block. */
cs_flush_block_instrs(b);
/* Postamble: restore modified registers */
if (num_ranges > 0) {
unsigned offset = 0;
cs_load64_to(b, addr_reg, function->ctx.ctx_reg,
function->ctx.dump_addr_offset);
for (unsigned i = 0; i < num_ranges; ++i) {
unsigned reg_count = util_bitcount(masks[i]);
cs_load_to(b, ranges[i], addr_reg, masks[i], offset);
offset += reg_count * 4;
}
cs_flush_loads(b);
}
/* Fill the rest of the buffer with NOPs. */
for (; num_instrs < padded_num_instrs; num_instrs++)
cs_nop(b);
function->length = padded_num_instrs;
}
#define cs_function_def(__b, __function, __ctx) \
for (struct cs_function *__tmp = cs_function_start(__b, __function, __ctx); \
__tmp != NULL; \
cs_function_end(__b, __function), __tmp = NULL)
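/* A minimal usage sketch (register index, offset and body are illustrative):
 *
 *    struct cs_function func;
 *    struct cs_function_ctx fctx = {
 *       .ctx_reg = cs_reg64(b, 90),
 *       .dump_addr_offset = 0,
 *    };
 *
 *    cs_function_def(b, &func, fctx) {
 *       // emit the function body here
 *    }
 *
 * After the block, func.address/func.length describe the emitted code, and
 * [fctx.ctx_reg + fctx.dump_addr_offset] must point to a buffer of at least
 * func.dump_size bytes when the function is executed, so the preamble and
 * postamble can save/restore the registers the body dirtied. */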
struct cs_tracing_ctx {
bool enabled;
struct cs_index ctx_reg;
unsigned tracebuf_addr_offset;
};
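/* When ctx->enabled is false, the cs_trace_run_*() helpers below simply emit
 * the corresponding cs_run_*() instruction. When it is true, each run also
 * appends a fixed-size trace record (the run's IP plus the relevant state
 * registers) to the buffer whose address is stored at
 * [ctx_reg + tracebuf_addr_offset], advancing that pointer as it goes. */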
static inline void
cs_trace_preamble(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
struct cs_index scratch_regs, unsigned trace_size)
{
assert(trace_size > 0 && ALIGN_POT(trace_size, 64) == trace_size &&
trace_size < INT16_MAX);
assert(scratch_regs.size >= 4 && !(scratch_regs.reg & 1));
struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg);
/* We always update the tracebuf position first, so we can easily detect OOB
* access. Use cs_trace_field_offset() to get an offset taking this
* pre-increment into account. */
cs_load64_to(b, tracebuf_addr, ctx->ctx_reg, ctx->tracebuf_addr_offset);
cs_add64(b, tracebuf_addr, tracebuf_addr, trace_size);
cs_store64(b, tracebuf_addr, ctx->ctx_reg, ctx->tracebuf_addr_offset);
cs_flush_stores(b);
}
#define cs_trace_field_offset(__type, __field) \
(int16_t)(offsetof(struct cs_##__type##_trace, __field) - \
sizeof(struct cs_##__type##_trace))
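/* cs_trace_preamble() advances the trace buffer pointer by a full record
 * before the record is written, so cs_trace_field_offset() yields negative
 * offsets pointing back into the entry that was just reserved. For example,
 * with sizeof(struct cs_run_fragment_trace) == 64:
 *
 *    cs_trace_field_offset(run_fragment, ip) == 0 - 64 == -64
 *    cs_trace_field_offset(run_fragment, sr) == 8 - 64 == -56
 */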
struct cs_run_fragment_trace {
uint64_t ip;
uint32_t sr[7];
} __attribute__((aligned(64)));
static inline void
cs_trace_run_fragment(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
struct cs_index scratch_regs, bool enable_tem,
enum mali_tile_render_order tile_order)
{
if (likely(!ctx->enabled)) {
cs_run_fragment(b, enable_tem, tile_order);
return;
}
struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg);
struct cs_index data = cs_reg64(b, scratch_regs.reg + 2);
cs_trace_preamble(b, ctx, scratch_regs,
sizeof(struct cs_run_fragment_trace));
/* cs_run_xx() must immediately follow cs_load_ip_to() otherwise the IP
* won't point to the right instruction. */
cs_load_ip_to(b, data);
cs_run_fragment(b, enable_tem, tile_order);
cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_fragment, ip));
cs_store(b, cs_reg_tuple(b, 40, 7), tracebuf_addr, BITFIELD_MASK(7),
cs_trace_field_offset(run_fragment, sr));
cs_flush_stores(b);
}
#if PAN_ARCH >= 12
struct cs_run_idvs2_trace {
uint64_t ip;
uint32_t draw_id;
uint32_t pad;
uint32_t sr[66];
} __attribute__((aligned(64)));
static inline void
cs_trace_run_idvs2(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
struct cs_index scratch_regs, uint32_t flags_override,
bool malloc_enable, struct cs_index draw_id,
enum mali_idvs_shading_mode vertex_shading_mode)
{
if (likely(!ctx->enabled)) {
cs_run_idvs2(b, flags_override, malloc_enable, draw_id,
vertex_shading_mode);
return;
}
struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg);
struct cs_index data = cs_reg64(b, scratch_regs.reg + 2);
cs_trace_preamble(b, ctx, scratch_regs, sizeof(struct cs_run_idvs2_trace));
/* cs_run_xx() must immediately follow cs_load_ip_to() otherwise the IP
* won't point to the right instruction. */
cs_load_ip_to(b, data);
cs_run_idvs2(b, flags_override, malloc_enable, draw_id, vertex_shading_mode);
cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_idvs2, ip));
if (draw_id.type != CS_INDEX_UNDEF)
cs_store32(b, draw_id, tracebuf_addr,
cs_trace_field_offset(run_idvs2, draw_id));
for (unsigned i = 0; i < 64; i += 16)
cs_store(b, cs_reg_tuple(b, i, 16), tracebuf_addr, BITFIELD_MASK(16),
cs_trace_field_offset(run_idvs2, sr[0]) + i * sizeof(uint32_t));
cs_store(b, cs_reg_tuple(b, 64, 2), tracebuf_addr, BITFIELD_MASK(2),
cs_trace_field_offset(run_idvs2, sr[64]));
cs_flush_stores(b);
}
#else
struct cs_run_idvs_trace {
uint64_t ip;
uint32_t draw_id;
uint32_t pad;
uint32_t sr[61];
} __attribute__((aligned(64)));
static inline void
cs_trace_run_idvs(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
struct cs_index scratch_regs, uint32_t flags_override,
bool malloc_enable, struct cs_shader_res_sel varying_sel,
struct cs_shader_res_sel frag_sel, struct cs_index draw_id)
{
if (likely(!ctx->enabled)) {
cs_run_idvs(b, flags_override, malloc_enable, varying_sel, frag_sel,
draw_id);
return;
}
struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg);
struct cs_index data = cs_reg64(b, scratch_regs.reg + 2);
cs_trace_preamble(b, ctx, scratch_regs, sizeof(struct cs_run_idvs_trace));
/* cs_run_xx() must immediately follow cs_load_ip_to() otherwise the IP
* won't point to the right instruction. */
cs_load_ip_to(b, data);
cs_run_idvs(b, flags_override, malloc_enable, varying_sel, frag_sel,
draw_id);
cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_idvs, ip));
if (draw_id.type != CS_INDEX_UNDEF)
cs_store32(b, draw_id, tracebuf_addr,
cs_trace_field_offset(run_idvs, draw_id));
for (unsigned i = 0; i < 48; i += 16)
cs_store(b, cs_reg_tuple(b, i, 16), tracebuf_addr, BITFIELD_MASK(16),
cs_trace_field_offset(run_idvs, sr[0]) + i * sizeof(uint32_t));
cs_store(b, cs_reg_tuple(b, 48, 13), tracebuf_addr, BITFIELD_MASK(13),
cs_trace_field_offset(run_idvs, sr[48]));
cs_flush_stores(b);
}
#endif
struct cs_run_compute_trace {
uint64_t ip;
uint32_t sr[40];
} __attribute__((aligned(64)));
static inline void
cs_trace_run_compute(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
struct cs_index scratch_regs, unsigned task_increment,
enum mali_task_axis task_axis,
struct cs_shader_res_sel res_sel)
{
if (likely(!ctx->enabled)) {
cs_run_compute(b, task_increment, task_axis, res_sel);
return;
}
struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg);
struct cs_index data = cs_reg64(b, scratch_regs.reg + 2);
cs_trace_preamble(b, ctx, scratch_regs, sizeof(struct cs_run_compute_trace));
/* cs_run_xx() must immediately follow cs_load_ip_to() otherwise the IP
* won't point to the right instruction. */
cs_load_ip_to(b, data);
cs_run_compute(b, task_increment, task_axis, res_sel);
cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_compute, ip));
for (unsigned i = 0; i < 32; i += 16)
cs_store(b, cs_reg_tuple(b, i, 16), tracebuf_addr, BITFIELD_MASK(16),
cs_trace_field_offset(run_compute, sr[0]) + i * sizeof(uint32_t));
cs_store(b, cs_reg_tuple(b, 32, 8), tracebuf_addr, BITFIELD_MASK(8),
cs_trace_field_offset(run_compute, sr[32]));
cs_flush_stores(b);
}
static inline void
cs_trace_run_compute_indirect(struct cs_builder *b,
const struct cs_tracing_ctx *ctx,
struct cs_index scratch_regs,
unsigned wg_per_task,
struct cs_shader_res_sel res_sel)
{
if (likely(!ctx->enabled)) {
cs_run_compute_indirect(b, wg_per_task, res_sel);
return;
}
struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg);
struct cs_index data = cs_reg64(b, scratch_regs.reg + 2);
cs_trace_preamble(b, ctx, scratch_regs, sizeof(struct cs_run_compute_trace));
/* cs_run_xx() must immediately follow cs_load_ip_to() otherwise the IP
* won't point to the right instruction. */
cs_load_ip_to(b, data);
cs_run_compute_indirect(b, wg_per_task, res_sel);
cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_compute, ip));
for (unsigned i = 0; i < 32; i += 16)
cs_store(b, cs_reg_tuple(b, i, 16), tracebuf_addr, BITFIELD_MASK(16),
cs_trace_field_offset(run_compute, sr[0]) + i * sizeof(uint32_t));
cs_store(b, cs_reg_tuple(b, 32, 8), tracebuf_addr, BITFIELD_MASK(8),
cs_trace_field_offset(run_compute, sr[32]));
cs_flush_stores(b);
}
#define cs_single_link_list_for_each_from(b, current, node_type, base_name) \
cs_while(b, MALI_CS_CONDITION_NEQUAL, current) \
for (bool done = false; !done; \
cs_load64_to(b, current, current, \
offsetof(node_type, base_name.next)), \
done = true)
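/* A minimal iteration sketch (struct my_node and its embedded "link" member
 * of type struct cs_single_link_list_node are hypothetical): walk a list
 * whose head pointer is loaded into a 64-bit register first.
 *
 *    struct cs_index node = cs_reg64(b, 10);
 *
 *    // struct cs_single_link_list starts with the head pointer.
 *    cs_load64_to(b, node, list_base, list_offset);
 *    cs_flush_loads(b);
 *    cs_single_link_list_for_each_from(b, node, struct my_node, link) {
 *       // node holds the GPU address of the current my_node; the loop
 *       // stops once the next pointer is NULL.
 *    }
 */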
/**
* Append an item to a cs_single_link_list.
*
* @param b The current cs_builder.
* @param list_base CS register with the base address for the list pointer.
* @param list_offset Offset added to list_base.
* @param new_node_gpu GPU address of the node to insert.
* @param base_offset Offset of cs_single_link_list_node in the new node.
 * @param head_tail Temporary range of four consecutive 32-bit registers
 *                  (even-aligned) used to hold the list head and tail
 *                  pointers.
*/
static inline void
cs_single_link_list_add_tail(struct cs_builder *b, struct cs_index list_base,
int list_offset, struct cs_index new_node_gpu,
int base_offset, struct cs_index head_tail)
{
assert(head_tail.size == 4);
assert(head_tail.reg % 2 == 0);
STATIC_ASSERT(offsetof(struct cs_single_link_list, tail) ==
offsetof(struct cs_single_link_list, head) + sizeof(uint64_t));
struct cs_index head = cs_reg64(b, head_tail.reg);
struct cs_index tail = cs_reg64(b, head_tail.reg + 2);
/* Offset of the next pointer inside the node pointed to by new_node_gpu. */
const int offset_next =
base_offset + offsetof(struct cs_single_link_list_node, next);
STATIC_ASSERT(offsetof(struct cs_single_link_list, head) == 0);
cs_load_to(b, head_tail, list_base, BITFIELD_MASK(4), list_offset);
/* If the list is empty (head == NULL), set the head, otherwise append to the
* last node. */
cs_if(b, MALI_CS_CONDITION_EQUAL, head)
cs_add64(b, head, new_node_gpu, 0);
cs_else(b)
cs_store64(b, new_node_gpu, tail, offset_next);
cs_add64(b, tail, new_node_gpu, 0);
cs_store(b, head_tail, list_base, BITFIELD_MASK(4), list_offset);
cs_flush_stores(b);
}
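/* Example (illustrative register numbers, hypothetical struct my_node):
 * append the node whose GPU address is in new_node to the list embedded at
 * [list_base + list_offset], using r12-r15 as the temporary head/tail
 * registers:
 *
 *    cs_single_link_list_add_tail(b, list_base, list_offset, new_node,
 *                                 offsetof(struct my_node, link),
 *                                 cs_reg_tuple(b, 12, 4));
 */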
#ifdef __cplusplus
}
#endif