Merge branch 'panfrost-v15' into 'main'

Draft: panfrost: Add v15 support

See merge request mesa/mesa!41366
Lars-Ivar Hesselberg Simonsen 2026-05-08 02:09:53 +02:00
commit 7731477125
92 changed files with 10165 additions and 769 deletions

View file

@ -34,6 +34,10 @@ The following hardware is currently supported:
+--------------------+---------------+-----------+--------+--------+
| G725 | 5th Gen (v13) | 3.1 | 3.1 | 1.4 |
+--------------------+---------------+-----------+--------+--------+
| G1-Pro | 5th Gen (v14) | 3.1 | 3.1 | 1.4 |
+--------------------+---------------+-----------+--------+--------+
| TMAx | 5th Gen (v15) | 3.1 | 3.1 | 1.4 |
+--------------------+---------------+-----------+--------+--------+
Other Midgard and Bifrost chips (e.g. G71) are not yet supported.

View file

@ -350,7 +350,7 @@ struct drm_panthor_gpu_info {
__u32 as_present;
/**
* @select_coherency: Coherency selected for this device.
* @selected_coherency: Coherency selected for this device.
*
* One of drm_panthor_gpu_coherency.
*/
@ -368,11 +368,27 @@ struct drm_panthor_gpu_info {
/** @core_features: Used to discriminate core variants when they exist. */
__u32 core_features;
/** @pad: MBZ. */
__u32 pad;
/** @thread_num_active_granularity: Granularity of the number of active threads. */
__u32 thread_num_active_granularity;
/** @gpu_features: Bitmask describing supported GPU-wide features. */
__u64 gpu_features;
/** @gpu_wide_id: 64-bit GPU_ID for v15 onwards. */
__u64 gpu_wide_id;
#define DRM_PANTHOR_WIDE_ARCH_MAJOR(x) (((x) >> 56) & 0xff)
#define DRM_PANTHOR_WIDE_ARCH_MINOR(x) (((x) >> 48) & 0xff)
#define DRM_PANTHOR_WIDE_ARCH_REV(x) (((x) >> 40) & 0xff)
#define DRM_PANTHOR_WIDE_PRODUCT_MAJOR(x) (((x) >> 32) & 0xff)
#define DRM_PANTHOR_WIDE_VERSION_MAJOR(x) (((x) >> 16) & 0xff)
#define DRM_PANTHOR_WIDE_VERSION_MINOR(x) (((x) >> 8) & 0xff)
#define DRM_PANTHOR_WIDE_VERSION_STATUS(x) ((x) & 0xff)
/** @gpu_rev_wide: 64-bit GPU revision for v15 onwards. */
__u64 gpu_rev_wide;
/** @l2_features_wide: 64-bit L2_FEATURES for v15 onwards. */
__u64 l2_features_wide;
};
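
For illustration (not part of the diff): a minimal sketch of how userspace might pick apart the new 64-bit GPU_ID with the macros above, assuming panthor_drm.h is included; the helper name and output format are hypothetical.

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical helper: decode drm_panthor_gpu_info::gpu_wide_id using
     * the uAPI macros added above. */
    static void print_wide_gpu_id(uint64_t id)
    {
       printf("arch v%u.%u r%u, product 0x%02x, version %u.%u (status %u)\n",
              (unsigned)DRM_PANTHOR_WIDE_ARCH_MAJOR(id),
              (unsigned)DRM_PANTHOR_WIDE_ARCH_MINOR(id),
              (unsigned)DRM_PANTHOR_WIDE_ARCH_REV(id),
              (unsigned)DRM_PANTHOR_WIDE_PRODUCT_MAJOR(id),
              (unsigned)DRM_PANTHOR_WIDE_VERSION_MAJOR(id),
              (unsigned)DRM_PANTHOR_WIDE_VERSION_MINOR(id),
              (unsigned)DRM_PANTHOR_WIDE_VERSION_STATUS(id));
    }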
/**
@ -409,6 +425,38 @@ struct drm_panthor_csif_info {
__u32 pad;
};
/**
* enum drm_panthor_timestamp_info_flags - drm_panthor_timestamp_info.flags
*/
enum drm_panthor_timestamp_info_flags {
/** @DRM_PANTHOR_TIMESTAMP_GPU: Query GPU time. */
DRM_PANTHOR_TIMESTAMP_GPU = 1 << 0,
/** @DRM_PANTHOR_TIMESTAMP_CPU_NONE: Don't query CPU time. */
DRM_PANTHOR_TIMESTAMP_CPU_NONE = 0 << 1,
/** @DRM_PANTHOR_TIMESTAMP_CPU_MONOTONIC: Query CPU time using CLOCK_MONOTONIC. */
DRM_PANTHOR_TIMESTAMP_CPU_MONOTONIC = 1 << 1,
/** @DRM_PANTHOR_TIMESTAMP_CPU_MONOTONIC_RAW: Query CPU time using CLOCK_MONOTONIC_RAW. */
DRM_PANTHOR_TIMESTAMP_CPU_MONOTONIC_RAW = 2 << 1,
/** @DRM_PANTHOR_TIMESTAMP_CPU_TYPE_MASK: Space reserved for CPU clock type. */
DRM_PANTHOR_TIMESTAMP_CPU_TYPE_MASK = 7 << 1,
/** @DRM_PANTHOR_TIMESTAMP_GPU_OFFSET: Query GPU offset. */
DRM_PANTHOR_TIMESTAMP_GPU_OFFSET = 1 << 4,
/** @DRM_PANTHOR_TIMESTAMP_GPU_CYCLE_COUNT: Query GPU cycle count. */
DRM_PANTHOR_TIMESTAMP_GPU_CYCLE_COUNT = 1 << 5,
/** @DRM_PANTHOR_TIMESTAMP_FREQ: Query timestamp frequency. */
DRM_PANTHOR_TIMESTAMP_FREQ = 1 << 6,
/** @DRM_PANTHOR_TIMESTAMP_DURATION: Return duration of time query. */
DRM_PANTHOR_TIMESTAMP_DURATION = 1 << 7,
};
/**
* struct drm_panthor_timestamp_info - Timestamp information
*
@ -421,11 +469,38 @@ struct drm_panthor_timestamp_info {
*/
__u64 timestamp_frequency;
/** @current_timestamp: The current timestamp. */
/** @current_timestamp: The current GPU timestamp. */
__u64 current_timestamp;
/** @timestamp_offset: The offset of the timestamp timer. */
/** @timestamp_offset: The offset of the GPU timestamp timer. */
__u64 timestamp_offset;
/**
* @flags: Bitmask of drm_panthor_timestamp_info_flags.
*
* If set to 0, then it is interpreted as:
* DRM_PANTHOR_TIMESTAMP_GPU |
* DRM_PANTHOR_TIMESTAMP_GPU_OFFSET |
* DRM_PANTHOR_TIMESTAMP_FREQ
*
* Note: the following flags are mutually exclusive (at most one can be set):
* - DRM_PANTHOR_TIMESTAMP_CPU_NONE
* - DRM_PANTHOR_TIMESTAMP_CPU_MONOTONIC
* - DRM_PANTHOR_TIMESTAMP_CPU_MONOTONIC_RAW
*/
__u32 flags;
/** @duration_nsec: Duration of time query. */
__u32 duration_nsec;
/** @cycle_count: Value of GPU_CYCLE_COUNT. */
__u64 cycle_count;
/** @cpu_timestamp_sec: Seconds part of CPU timestamp. */
__u64 cpu_timestamp_sec;
/** @cpu_timestamp_nsec: Nanoseconds part of CPU timestamp. */
__u64 cpu_timestamp_nsec;
};
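
A quick sketch (not from the diff) of composing a valid flags value: the CPU clock type occupies the 3-bit DRM_PANTHOR_TIMESTAMP_CPU_TYPE_MASK field, so only one of the _CPU_* values can be encoded at a time.

    #include <assert.h>

    static __u32 make_ts_flags(void)
    {
       /* Request GPU and CLOCK_MONOTONIC CPU timestamps plus the query
        * duration. */
       __u32 flags = DRM_PANTHOR_TIMESTAMP_GPU |
                     DRM_PANTHOR_TIMESTAMP_CPU_MONOTONIC |
                     DRM_PANTHOR_TIMESTAMP_DURATION;

       /* The CPU clock type is a 3-bit field: at most one _CPU_* value. */
       assert((flags & DRM_PANTHOR_TIMESTAMP_CPU_TYPE_MASK) ==
              DRM_PANTHOR_TIMESTAMP_CPU_MONOTONIC);
       return flags;
    }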
/**

View file

@ -41,7 +41,7 @@ compile_args_panfrost = [
'-Wno-pointer-arith'
]
panfrost_versions = ['4', '5', '6', '7', '9', '10', '12', '13']
panfrost_versions = ['4', '5', '6', '7', '9', '10', '12', '13', '14', '15']
libpanfrost_versions = []
foreach ver : panfrost_versions
@ -54,7 +54,7 @@ foreach ver : panfrost_versions
]
if ver in ['4', '5', '6', '7', '9']
files_panfrost_vx += ['pan_jm.c']
elif ver in ['10', '12', '13']
elif ver in ['10', '12', '13', '14', '15']
files_panfrost_vx += ['pan_csf.c']
endif
libpanfrost_versions += static_library(

View file

@ -49,7 +49,7 @@
* functions. */
#if PAN_ARCH <= 9
#define JOBX(__suffix) GENX(jm_##__suffix)
#elif PAN_ARCH <= 13
#elif PAN_ARCH <= 15
#define JOBX(__suffix) GENX(csf_##__suffix)
#else
#error "Unsupported arch"
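
As a standalone illustration of the dispatch above (GENX's real per-generation suffixing is approximated here), with PAN_ARCH == 15 the second branch is taken, so v10..v15 all share the CSF job path while v4..v9 keep the legacy job-manager path:

    #include <stdio.h>

    #define PAN_ARCH 15
    #define GENX(name) name##_v15   /* approximation of the real GENX */
    #if PAN_ARCH <= 9
    #define JOBX(__suffix) GENX(jm_##__suffix)
    #elif PAN_ARCH <= 15
    #define JOBX(__suffix) GENX(csf_##__suffix)
    #else
    #error "Unsupported arch"
    #endif

    static void csf_emit_fbds_v15(void) { puts("CSF job path"); }

    int main(void)
    {
       JOBX(emit_fbds)();   /* expands to csf_emit_fbds_v15() */
       return 0;
    }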
@ -1661,7 +1661,8 @@ panfrost_emit_shared_memory(struct panfrost_batch *batch,
.tls.size = ss->info.tls_size,
.wls.size = ss->info.wls_size + grid->variable_shared_mem,
.wls.instances = pan_calc_wls_instances(
&local_size, &dev->kmod.dev->props, grid->indirect ? NULL : &dim),
&local_size, &dev->kmod.dev->props, grid->indirect ? NULL : &dim,
ss->info.work_reg_count),
};
if (ss->info.tls_size) {
@ -4455,11 +4456,15 @@ prepare_shader(struct panfrost_compiled_shader *state,
else if (vs)
cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF;
#endif
#if PAN_ARCH >= 15
cfg.register_count = state->info.work_reg_count;
cfg.preload.r0_r15 = state->info.preload;
#else
cfg.register_allocation =
pan_register_allocation(state->info.work_reg_count);
cfg.binary = state->bin.gpu;
cfg.preload.r48_r63 = (state->info.preload >> 48);
#endif
cfg.binary = state->bin.gpu;
cfg.flush_to_zero_mode = panfrost_ftz_mode(&state->info);
if (cfg.stage == MALI_SHADER_STAGE_FRAGMENT)
@ -4475,10 +4480,15 @@ prepare_shader(struct panfrost_compiled_shader *state,
#if PAN_ARCH < 12
cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF;
#endif
#if PAN_ARCH >= 15
cfg.register_count = state->info.work_reg_count;
cfg.preload.r0_r15 = state->info.preload;
#else
cfg.register_allocation =
pan_register_allocation(state->info.work_reg_count);
cfg.binary = state->bin.gpu + state->info.vs.no_psiz_offset;
cfg.preload.r48_r63 = (state->info.preload >> 48);
#endif
cfg.binary = state->bin.gpu + state->info.vs.no_psiz_offset;
cfg.flush_to_zero_mode = panfrost_ftz_mode(&state->info);
}

View file

@ -1,5 +1,6 @@
/*
* Copyright (C) 2023 Collabora Ltd.
* Copyright (C) 2026 Arm Ltd.
* SPDX-License-Identifier: MIT
*/
@ -13,6 +14,7 @@
#include "pan_cmdstream.h"
#include "pan_context.h"
#include "pan_csf.h"
#include "pan_fb.h"
#include "pan_fb_preload.h"
#include "pan_job.h"
#include "pan_trace.h"
@ -75,6 +77,99 @@ csf_update_tiler_oom_ctx(struct cs_builder *b, uint64_t addr)
(PAN_INCREMENTAL_RENDERING_##_pass##_PASS * sizeof(struct pan_ptr)) + \
offsetof(struct pan_ptr, gpu))
#if PAN_ARCH >= 14
static void
cs_emit_static_fragment_state(struct cs_builder *b,
struct panfrost_batch *batch,
const struct pan_fb_info *fb)
{
struct mali_fragment_bounding_box_packed bbox;
pan_pack(&bbox, FRAGMENT_BOUNDING_BOX, cfg) {
cfg.bound_min_x = batch->minx;
cfg.bound_min_y = batch->miny;
cfg.bound_max_x = batch->maxx - 1;
cfg.bound_max_y = batch->maxy - 1;
}
struct mali_frame_size_packed frame_size;
pan_pack(&frame_size, FRAME_SIZE, cfg) {
cfg.width = fb->width;
cfg.height = fb->height;
}
cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, BOUNDING_BOX),
bbox.opaque[0] | ((uint64_t)bbox.opaque[1] << 32));
cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, FRAME_SIZE), frame_size.opaque[0]);
cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, SAMPLE_POSITION_ARRAY_POINTER),
fb->sample_positions);
struct mali_fragment_flags_1_packed flags1;
pan_pack(&flags1, FRAGMENT_FLAGS_1, cfg) {
/* The force_samples setting dictates the sample count that is used
* for rasterization, and works like D3D11's ForcedSampleCount
* feature:
*
* - If force_samples == 0: let nr_samples dictate the sample count
* - If force_samples == 1: force single-sampled rasterization
* - If force_samples > 1: force multi-sampled rasterization
*
* This can be used to read SYSTEM_VALUE_SAMPLE_MASK_IN from the
* fragment shader, even when performing single-sampled rendering.
*/
if (fb->pls_enabled) {
cfg.sample_count = 4;
cfg.sample_pattern = pan_sample_pattern(1);
} else if (!fb->force_samples) {
cfg.sample_count = fb->nr_samples;
cfg.sample_pattern = pan_sample_pattern(fb->nr_samples);
} else if (fb->force_samples == 1) {
cfg.sample_count = fb->nr_samples;
cfg.sample_pattern = pan_sample_pattern(1);
} else {
cfg.sample_count = 1;
cfg.sample_pattern = pan_sample_pattern(fb->force_samples);
}
cfg.effective_tile_size = fb->tile_size;
cfg.point_sprite_coord_origin_max_y = fb->sprite_coord_origin;
cfg.first_provoking_vertex = fb->first_provoking_vertex;
cfg.render_target_count = MAX2(fb->rt_count, 1);
cfg.color_buffer_allocation = fb->cbuf_allocation;
}
cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, FLAGS_1), flags1.opaque[0]);
/* Leave the remaining RUN_FRAGMENT2 staging registers as zero. */
}
#define PAN_CS_REG_FBD_LAYER_PTR 54
static inline void
cs_emit_layer_fragment_state(struct cs_builder *b, struct cs_index fbd_ptr)
{
/* Emit the dynamic fragment state. This state may change per-layer. */
cs_load32_to(b, cs_sr_reg32(b, FRAGMENT, FLAGS_0), fbd_ptr,
offsetof(struct pan_fbd_layer, flags0));
cs_load32_to(b, cs_sr_reg32(b, FRAGMENT, FLAGS_2), fbd_ptr,
offsetof(struct pan_fbd_layer, flags2));
cs_load32_to(b, cs_sr_reg32(b, FRAGMENT, Z_CLEAR), fbd_ptr,
offsetof(struct pan_fbd_layer, z_clear));
cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, TILER_DESCRIPTOR_POINTER), fbd_ptr,
offsetof(struct pan_fbd_layer, tiler));
cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, RTD_POINTER), fbd_ptr,
offsetof(struct pan_fbd_layer, rtd_pointer));
cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, DBD_POINTER), fbd_ptr,
offsetof(struct pan_fbd_layer, dbd_pointer));
cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, FRAME_ARG), fbd_ptr,
offsetof(struct pan_fbd_layer, frame_argument));
cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, FRAME_SHADER_DCD_POINTER), fbd_ptr,
offsetof(struct pan_fbd_layer, dcd_pointer));
cs_flush_loads(b);
}
#endif /* PAN_ARCH >= 14 */
static int
csf_oom_handler_init(struct panfrost_context *ctx)
{
@ -113,13 +208,18 @@ csf_oom_handler_init(struct panfrost_context *ctx)
cs_function_def(&b, &handler, handler_ctx) {
struct cs_index tiler_oom_ctx = cs_reg64(&b, TILER_OOM_CTX_REG);
struct cs_index counter = cs_reg32(&b, 47);
struct cs_index zero = cs_reg64(&b, 48);
struct cs_index flush_id = cs_reg32(&b, 48);
struct cs_index tiler_ctx = cs_reg64(&b, 50);
struct cs_index completed_top = cs_reg64(&b, 52);
struct cs_index completed_bottom = cs_reg64(&b, 54);
struct cs_index completed_chunks = cs_reg_tuple(&b, 52, 4);
struct cs_index counter = cs_reg32(&b, 31);
struct cs_index zero = cs_reg64(&b, 56);
struct cs_index flush_id = cs_reg32(&b, 58);
struct cs_index tiler_ctx = cs_reg64(&b, 60);
struct cs_index completed_top = cs_reg64(&b, 64);
struct cs_index completed_bottom = cs_reg64(&b, 66);
struct cs_index completed_chunks = cs_reg_tuple(&b, 64, 4);
#if PAN_ARCH >= 14
struct cs_index fbd_pointer = cs_reg64(&b, PAN_CS_REG_FBD_LAYER_PTR);
#else
struct cs_index fbd_pointer = cs_sr_reg64(&b, FRAGMENT, FBD_POINTER);
#endif
/* Ensure that the OTHER endpoint is valid */
#if PAN_ARCH >= 11
@ -133,25 +233,31 @@ csf_oom_handler_init(struct panfrost_context *ctx)
cs_load32_to(&b, counter, tiler_oom_ctx, FIELD_OFFSET(counter));
cs_wait_slot(&b, 0);
cs_if(&b, MALI_CS_CONDITION_GREATER, counter) {
cs_load64_to(&b, cs_sr_reg64(&b, FRAGMENT, FBD_POINTER), tiler_oom_ctx,
FBD_OFFSET(MIDDLE));
cs_load64_to(&b, fbd_pointer, tiler_oom_ctx, FBD_OFFSET(MIDDLE));
}
cs_else(&b) {
cs_load64_to(&b, cs_sr_reg64(&b, FRAGMENT, FBD_POINTER), tiler_oom_ctx,
FBD_OFFSET(FIRST));
cs_load64_to(&b, fbd_pointer, tiler_oom_ctx, FBD_OFFSET(FIRST));
}
#if PAN_ARCH >= 14
cs_emit_layer_fragment_state(&b, fbd_pointer);
#else
cs_load32_to(&b, cs_sr_reg32(&b, FRAGMENT, BBOX_MIN), tiler_oom_ctx,
FIELD_OFFSET(bbox_min));
cs_load32_to(&b, cs_sr_reg32(&b, FRAGMENT, BBOX_MAX), tiler_oom_ctx,
FIELD_OFFSET(bbox_max));
cs_move64_to(&b, cs_sr_reg64(&b, FRAGMENT, TEM_POINTER), 0);
cs_move32_to(&b, cs_sr_reg32(&b, FRAGMENT, TEM_ROW_STRIDE), 0);
#endif
cs_wait_slot(&b, 0);
/* Run the fragment job and wait */
cs_select_endpoint_sb(&b, 3);
#if PAN_ARCH >= 14
cs_run_fragment2(&b, false, MALI_TILE_RENDER_ORDER_Z_ORDER);
#else
cs_run_fragment(&b, false, MALI_TILE_RENDER_ORDER_Z_ORDER);
#endif
cs_wait_slot(&b, 3);
/* Increment counter */
@ -218,6 +324,21 @@ GENX(csf_cleanup_batch)(struct panfrost_batch *batch)
panfrost_pool_cleanup(&batch->csf.cs_chunk_pool);
}
#if PAN_ARCH >= 14
static inline struct pan_ptr
alloc_fbd(struct panfrost_batch *batch)
{
const struct pan_desc_alloc_info fbd_layer = {
.size = ALIGN_POT(sizeof(struct pan_fbd_layer), 64),
.align = alignof(struct pan_fbd_layer),
.nelems = 1,
};
return pan_pool_alloc_desc_aggregate(
&batch->pool.base, fbd_layer, PAN_DESC(ZS_CRC_EXTENSION),
PAN_DESC_ARRAY(MAX2(batch->key.nr_cbufs, 1), RENDER_TARGET));
}
#else
static inline struct pan_ptr
alloc_fbd(struct panfrost_batch *batch)
{
@ -225,6 +346,7 @@ alloc_fbd(struct panfrost_batch *batch)
&batch->pool.base, PAN_DESC(FRAMEBUFFER), PAN_DESC(ZS_CRC_EXTENSION),
PAN_DESC_ARRAY(MAX2(batch->key.nr_cbufs, 1), RENDER_TARGET));
}
#endif /* PAN_ARCH >= 14 */
int
GENX(csf_init_batch)(struct panfrost_batch *batch)
@ -758,7 +880,7 @@ GENX(csf_preload_fb)(struct panfrost_batch *batch, struct pan_fb_info *fb)
(_ctx)->fbds[PAN_INCREMENTAL_RENDERING_##_pass##_PASS]
#define EMIT_FBD(_ctx, _pass, _fb, _tls, _tiler_ctx) \
GET_FBD(_ctx, _pass).gpu |= \
GENX(pan_emit_fbd)(_fb, 0, _tls, _tiler_ctx, GET_FBD(_ctx, _pass).cpu)
GENX(pan_emit_fbd)(_fb, 0, _tls, _tiler_ctx, GET_FBD(_ctx, _pass))
void
GENX(csf_emit_fbds)(struct panfrost_batch *batch, struct pan_fb_info *fb,
@ -771,7 +893,7 @@ GENX(csf_emit_fbds)(struct panfrost_batch *batch, struct pan_fb_info *fb,
/* Default framebuffer descriptor */
batch->framebuffer.gpu |=
GENX(pan_emit_fbd)(fb, 0, tls, &batch->tiler_ctx, batch->framebuffer.cpu);
GENX(pan_emit_fbd)(fb, 0, tls, &batch->tiler_ctx, batch->framebuffer);
if (batch->draw_count == 0)
return;
@ -854,15 +976,26 @@ GENX(csf_emit_fragment_job)(struct panfrost_batch *batch,
cs_vt_end(b, cs_now());
}
#if PAN_ARCH >= 14
struct cs_index fbd_pointer = cs_reg64(b, PAN_CS_REG_FBD_LAYER_PTR);
#else
struct cs_index fbd_pointer = cs_sr_reg64(b, FRAGMENT, FBD_POINTER);
#endif
/* Set up the fragment job */
cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, FBD_POINTER),
batch->framebuffer.gpu);
cs_move64_to(b, fbd_pointer, batch->framebuffer.gpu);
#if PAN_ARCH >= 14
cs_emit_static_fragment_state(b, batch, pfb);
cs_emit_layer_fragment_state(b, fbd_pointer);
#else
cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, BBOX_MIN),
(batch->miny << 16) | batch->minx);
cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, BBOX_MAX),
((batch->maxy - 1) << 16) | (batch->maxx - 1));
cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, TEM_POINTER), 0);
cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, TEM_ROW_STRIDE), 0);
#endif
/* Use different framebuffer descriptor if incremental rendering was
* triggered while tiling */
@ -871,13 +1004,19 @@ GENX(csf_emit_fragment_job)(struct panfrost_batch *batch,
cs_load32_to(b, counter, cs_reg64(b, TILER_OOM_CTX_REG), 0);
cs_wait_slot(b, 0);
cs_if(b, MALI_CS_CONDITION_GREATER, counter) {
cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, FBD_POINTER),
GET_FBD(oom_ctx, LAST).gpu);
cs_move64_to(b, fbd_pointer, GET_FBD(oom_ctx, LAST).gpu);
#if PAN_ARCH >= 14
cs_emit_layer_fragment_state(b, fbd_pointer);
#endif
}
}
/* Run the fragment job and wait */
#if PAN_ARCH >= 14
cs_run_fragment2(b, false, MALI_TILE_RENDER_ORDER_Z_ORDER);
#else
cs_run_fragment(b, false, MALI_TILE_RENDER_ORDER_Z_ORDER);
#endif
cs_wait_slot(b, 2);
/* Gather freed heap chunks and add them to the heap context free list

View file

@ -1105,9 +1105,14 @@ pan_preload_emit_dcd(struct pan_fb_preload_cache *cache, struct pan_pool *pool,
pan_cast_and_pack(spd.cpu, SHADER_PROGRAM, cfg) {
cfg.stage = MALI_SHADER_STAGE_FRAGMENT;
cfg.fragment_coverage_bitmask_type = MALI_COVERAGE_BITMASK_TYPE_GL;
#if PAN_ARCH >= 15
cfg.register_count = preload_shader->info.work_reg_count;
cfg.preload.r0_r15 = preload_shader->info.preload;
#else
cfg.register_allocation = MALI_SHADER_REGISTER_ALLOCATION_32_PER_THREAD;
cfg.binary = preload_shader->address;
cfg.preload.r48_r63 = preload_shader->info.preload >> 48;
#endif
cfg.binary = preload_shader->address;
}
unsigned bd_count = views.rt_count;

View file

@ -257,8 +257,8 @@ GENX(jm_emit_fbds)(struct panfrost_batch *batch, struct pan_fb_info *fb,
{
PAN_TRACE_FUNC(PAN_TRACE_GL_JM);
batch->framebuffer.gpu |= GENX(pan_emit_fbd)(
fb, 0, tls, &batch->tiler_ctx, batch->framebuffer.cpu);
batch->framebuffer.gpu |=
GENX(pan_emit_fbd)(fb, 0, tls, &batch->tiler_ctx, batch->framebuffer);
}
void

View file

@ -98,10 +98,15 @@ panfrost_precomp_shader_create(
pan_cast_and_pack(spd.cpu, SHADER_PROGRAM, cfg) {
cfg.stage = pan_shader_stage(&res->info);
#if PAN_ARCH >= 15
cfg.register_count = res->info.work_reg_count;
cfg.preload.r0_r15 = res->info.preload;
#else
cfg.register_allocation =
pan_register_allocation(res->info.work_reg_count);
cfg.binary = res->code_ptr;
cfg.preload.r48_r63 = (res->info.preload >> 48);
#endif
cfg.binary = res->code_ptr;
cfg.flush_to_zero_mode = panfrost_ftz_mode(&res->info);
}
@ -197,8 +202,9 @@ emit_tls(struct panfrost_batch *batch,
struct pan_tls_info info = {
.tls.size = shader->info.tls_size,
.wls.size = shader->info.wls_size,
.wls.instances = pan_calc_wls_instances(&shader->local_size,
&dev->kmod.dev->props, dim),
.wls.instances =
pan_calc_wls_instances(&shader->local_size, &dev->kmod.dev->props, dim,
shader->info.work_reg_count),
};
if (info.tls.size) {
@ -325,7 +331,17 @@ GENX(panfrost_launch_precomp)(struct panfrost_batch *batch,
uint64_t fau_ptr = push_uniforms.gpu | (fau_count << 56);
cs_move64_to(b, cs_sr_reg64(b, COMPUTE, FAU_0), fau_ptr);
#if PAN_ARCH >= 15
struct mali_shader_program_pointer_packed spp;
pan_pack(&spp, SHADER_PROGRAM_POINTER, ctx) {
ctx.register_count = shader->info.work_reg_count;
ctx.pointer = shader->state_ptr;
}
uint64_t ptr = ((uint64_t)spp.opaque[1] << 32) | spp.opaque[0];
cs_move64_to(b, cs_sr_reg64(b, COMPUTE, SPD_0), ptr);
#else
cs_move64_to(b, cs_sr_reg64(b, COMPUTE, SPD_0), shader->state_ptr);
#endif
cs_move64_to(b, cs_sr_reg64(b, COMPUTE, TSD_0), tsd);
/* Global attribute offset */

View file

@ -1175,6 +1175,12 @@ panfrost_create_screen(int fd, const struct pipe_screen_config *config,
case 13:
panfrost_cmdstream_screen_init_v13(screen);
break;
case 14:
panfrost_cmdstream_screen_init_v14(screen);
break;
case 15:
panfrost_cmdstream_screen_init_v15(screen);
break;
default:
debug_printf("panfrost: Unhandled architecture major %d", dev->arch);
panfrost_destroy_screen(&(screen->base));

View file

@ -155,6 +155,8 @@ void panfrost_cmdstream_screen_init_v9(struct panfrost_screen *screen);
void panfrost_cmdstream_screen_init_v10(struct panfrost_screen *screen);
void panfrost_cmdstream_screen_init_v12(struct panfrost_screen *screen);
void panfrost_cmdstream_screen_init_v13(struct panfrost_screen *screen);
void panfrost_cmdstream_screen_init_v14(struct panfrost_screen *screen);
void panfrost_cmdstream_screen_init_v15(struct panfrost_screen *screen);
#define perf_debug(ctx, ...) \
do { \

View file

@ -10,6 +10,7 @@
#include "panfrost/compiler/bifrost/bifrost_compile.h"
#include "panfrost/compiler/pan_compiler.h"
#include "panfrost/compiler/pan_nir.h"
#include "panfrost/model/pan_model.h"
#include "nir.h"
#include "nir_builder.h"
#include "nir_builder_opcodes.h"
@ -275,7 +276,7 @@ main(int argc, const char **argv)
unsigned target_arch = atoi(target_arch_str);
if (target_arch < 4 || target_arch > 13) {
if (target_arch < 4 || target_arch > 15) {
fprintf(stderr, "Unsupported target arch %d\n", target_arch);
return 1;
}
@ -353,7 +354,12 @@ main(int argc, const char **argv)
libfunc, MESA_SHADER_COMPUTE, v, get_compiler_options(target_arch),
&opt, load_kernel_input);
uint64_t target_gpu_id = (target_arch & 0xf) << 28;
uint64_t target_gpu_id;
if (target_arch >= PAN_ID64_COMPAT)
target_gpu_id =
((uint64_t)(target_arch & 0xff) << 56) | (PAN_ID64_COMPAT << 28);
else
target_gpu_id = (target_arch & 0xf) << 28;
struct pan_compile_inputs inputs = {
.gpu_id = target_gpu_id,

View file

@ -16,14 +16,14 @@
*/
static uint32_t
va_op_swizzles(enum bi_opcode op, unsigned src)
va_op_swizzles(enum bi_opcode op, unsigned src, unsigned arch)
{
/* This is a bifrost-only instruction that is lowered on valhall */
if (!valhall_opcodes[op].exact)
if (!get_valhall_opcode(op, arch).exact)
return bi_op_swizzles[op][src];
uint32_t swizzles = 0;
struct va_src_info info = va_src_info(op, src);
struct va_src_info info = va_src_info(op, src, arch);
if (info.swizzle) {
assert(info.size == VA_SIZE_16 || info.size == VA_SIZE_32);
@ -99,8 +99,8 @@ bool
bi_op_supports_swizzle(enum bi_opcode op, unsigned src,
enum bi_swizzle swizzle, unsigned arch)
{
uint32_t supported_swizzles = arch >= 9 ?
va_op_swizzles(op, src) : bi_op_swizzles[op][src];
uint32_t supported_swizzles =
arch >= 9 ? va_op_swizzles(op, src, arch) : bi_op_swizzles[op][src];
return supported_swizzles & BITFIELD_BIT(swizzle);
}

View file

@ -294,7 +294,8 @@ bi_compute_liveness_ra(bi_context *ctx)
#define EVEN_BITS_MASK (0x5555555555555555ull)
static uint64_t
bi_make_affinity(uint64_t clobber, unsigned count, bool split_file)
bi_make_affinity(uint64_t clobber, unsigned count, bool split_file,
unsigned arch)
{
uint64_t clobbered = 0;
@ -308,12 +309,12 @@ bi_make_affinity(uint64_t clobber, unsigned count, bool split_file)
clobbered |= mask << (64 - excess);
if (split_file)
clobbered |= mask << (16 - excess);
clobbered |= mask << (((arch >= 15) ? 32 : 16) - excess);
}
/* Don't allocate the middle if we split out the middle */
if (split_file)
clobbered |= BITFIELD64_MASK(32) << 16;
clobbered |= BITFIELD64_MASK(32) << ((arch >= 15) ? 32 : 16);
/* We can use a register iff it's not clobbered */
return ~clobbered;
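
A self-contained check (not from the diff) of what these masks work out to: with split_file set, the usable registers are r0..r15 plus r48..r63 before v15, and r0..r31 on v15 onwards, which matches the default_affinity change further down.

    #include <assert.h>
    #include <stdint.h>

    #define BITFIELD64_MASK(n) (((n) >= 64) ? ~0ull : (1ull << (n)) - 1)

    int main(void)
    {
       /* Pre-v15: clobber the middle 32 registers starting at r16. */
       uint64_t pre_v15 = ~(BITFIELD64_MASK(32) << 16);
       /* v15+: clobber the middle 32 registers starting at r32. */
       uint64_t v15 = ~(BITFIELD64_MASK(32) << 32);

       assert(pre_v15 == (BITFIELD64_MASK(16) | (BITFIELD64_MASK(16) << 48)));
       assert(v15 == BITFIELD64_MASK(32));
       return 0;
    }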
@ -341,7 +342,7 @@ bi_mark_interference(bi_block *block, struct lcra_state *l, uint8_t *live,
unsigned count = bi_count_write_registers(ins, d);
unsigned offset = ins->dest[d].offset;
uint64_t affinity =
bi_make_affinity(preload_live, count, split_file) >> offset;
bi_make_affinity(preload_live, count, split_file, arch) >> offset;
/* Valhall needs >= 64-bit staging writes to be pair-aligned */
if (aligned_sr && (count >= 2 || offset))
affinity &= EVEN_BITS_MASK;
@ -381,8 +382,8 @@ bi_mark_interference(bi_block *block, struct lcra_state *l, uint8_t *live,
bi_foreach_ssa_src(ins, s) {
if (bi_count_read_registers(ins, s) >= 2)
l->affinity[ins->src[s].value] &= EVEN_BITS_MASK;
else if (s < valhall_opcodes[ins->op].nr_srcs &&
va_src_info(ins->op, s).size > VA_SIZE_32)
else if (s < get_valhall_opcode(ins->op, arch).nr_srcs &&
va_src_info(ins->op, s, arch).size > VA_SIZE_32)
l->affinity[ins->src[s].value] &= EVEN_BITS_MASK;
}
}
@ -435,7 +436,8 @@ bi_allocate_registers(bi_context *ctx, bool *success, bool full_regs)
uint64_t default_affinity =
ctx->inputs->is_blend ? BITFIELD64_MASK(16)
: full_regs ? BITFIELD64_MASK(64)
: (BITFIELD64_MASK(16) | (BITFIELD64_MASK(16) << 48));
: (ctx->arch >= 15) ? BITFIELD64_MASK(32)
: (BITFIELD64_MASK(16) | (BITFIELD64_MASK(16) << 48));
/* To test spilling, mimic a small register file */
if (bifrost_debug & BIFROST_DBG_SPILL && !ctx->inputs->is_blend && (bifrost_debug & BIFROST_DBG_NOSSARA))

View file

@ -703,8 +703,10 @@ bi_emit_load_var_buf(bi_builder *b, nir_intrinsic_instr *intr)
assert(intr->intrinsic == nir_intrinsic_load_var_buf_pan ||
intr->intrinsic == nir_intrinsic_load_var_buf_flat_pan);
const unsigned arch = b->shader->arch;
/* These are only available on Valhall+ */
assert(b->shader->arch >= 9);
assert(arch >= 9);
const bool flat = intr->intrinsic == nir_intrinsic_load_var_buf_flat_pan;
const nir_alu_type src_type = nir_intrinsic_src_type(intr);
@ -757,19 +759,36 @@ bi_emit_load_var_buf(bi_builder *b, nir_intrinsic_instr *intr)
bool use_imm_form = false;
if (nir_src_is_const(intr->src[0])) {
imm_offset = nir_src_as_uint(intr->src[0]);
assert(imm_offset < pan_ld_var_buf_off_size(b->shader->arch));
assert(imm_offset < pan_ld_var_buf_off_size(arch));
use_imm_form = true;
}
/* On v14+, flat source formats are removed from LD_VAR_BUF/LD_VAR_BUF_IMM,
* so flat buffer varyings must use the dedicated LD_VAR_BUF_FLAT*.
*/
if (use_imm_form) {
bi_ld_var_buf_imm_to(b, sz, dest, src0, regfmt, sample, source_format,
if (arch >= 14 && flat) {
bi_ld_var_buf_flat_imm_to(b, dest, regfmt, vecsize, imm_offset);
} else {
bi_ld_var_buf_imm_to(b, sz, dest, src0, regfmt, sample, source_format,
BI_UPDATE_STORE, vecsize, imm_offset);
}
} else {
bi_index offset = bi_src_index(&intr->src[0]);
bi_ld_var_buf_to(b, sz, dest, src0, offset, regfmt, sample,
source_format, BI_UPDATE_STORE, vecsize);
if (arch >= 14 && flat) {
bi_ld_var_buf_flat_to(b, dest, offset, regfmt, vecsize);
} else {
bi_ld_var_buf_to(b, sz, dest, src0, offset, regfmt, sample,
source_format, BI_UPDATE_STORE, vecsize);
}
}
/* LD_VAR_BUF_FLAT* only support register formats F16 and F32. */
assert(
arch < 14 || !flat ||
(regfmt == BI_REGISTER_FORMAT_F16 || regfmt == BI_REGISTER_FORMAT_F32));
bi_split_def(b, &intr->def);
}
@ -4146,13 +4165,13 @@ va_count_stats(bi_context *ctx, unsigned nr_ins, unsigned size,
}
static unsigned
va_gather_stats_block(bi_block *block, struct va_stats *counts)
va_gather_stats_block(bi_block *block, unsigned arch, struct va_stats *counts)
{
unsigned nr_ins = 0;
bi_foreach_instr_in_block(block, I) {
nr_ins++;
va_count_instr_stats(I, counts);
va_count_instr_stats(I, arch, counts);
}
return nr_ins;
}
@ -4161,7 +4180,8 @@ va_gather_stats_block(bi_block *block, struct va_stats *counts)
* Gather stats for a minimum length path through the shader.
*/
static unsigned
va_gather_min_path_stats(bi_block *block, struct va_stats *counts)
va_gather_min_path_stats(bi_block *block, unsigned arch,
struct va_stats *counts)
{
struct va_stats min_counts;
struct va_stats save_counts = *counts;
@ -4173,7 +4193,7 @@ va_gather_min_path_stats(bi_block *block, struct va_stats *counts)
if (bi_block_dominates(next, block)) {
continue;
}
nr_ins = va_gather_min_path_stats(next, counts);
nr_ins = va_gather_min_path_stats(next, arch, counts);
if (min_ins == 0 || nr_ins < min_ins) {
min_ins = nr_ins;
min_counts = *counts;
@ -4183,7 +4203,7 @@ va_gather_min_path_stats(bi_block *block, struct va_stats *counts)
if (min_ins != 0) {
*counts = min_counts;
}
nr_ins = min_ins + va_gather_stats_block(block, counts);
nr_ins = min_ins + va_gather_stats_block(block, arch, counts);
return nr_ins;
}
@ -4194,7 +4214,8 @@ va_gather_min_path_stats(bi_block *block, struct va_stats *counts)
* bail out.
*/
static unsigned
va_gather_max_path_stats(bi_block *block, struct va_stats *counts, BITSET_WORD *visited)
va_gather_max_path_stats(bi_block *block, unsigned arch,
struct va_stats *counts, BITSET_WORD *visited)
{
struct va_stats max_counts;
struct va_stats save_counts = *counts;
@ -4207,7 +4228,7 @@ va_gather_max_path_stats(bi_block *block, struct va_stats *counts, BITSET_WORD *
if (BITSET_TEST(visited, next->index)) {
continue;
}
nr_ins = va_gather_max_path_stats(next, counts, visited);
nr_ins = va_gather_max_path_stats(next, arch, counts, visited);
if (nr_ins > max_ins) {
max_ins = nr_ins;
max_counts = *counts;
@ -4217,7 +4238,7 @@ va_gather_max_path_stats(bi_block *block, struct va_stats *counts, BITSET_WORD *
if (max_ins != 0) {
*counts = max_counts;
}
nr_ins = max_ins + va_gather_stats_block(block, counts);
nr_ins = max_ins + va_gather_stats_block(block, arch, counts);
return nr_ins;
}
@ -4241,15 +4262,16 @@ va_gather_stats(bi_context *ctx, unsigned size, struct valhall_stats *out,
case GATHER_STATS_FULL:
bi_foreach_instr_global(ctx, I) {
nr_ins++;
va_count_instr_stats(I, &counts);
va_count_instr_stats(I, ctx->arch, &counts);
}
break;
case GATHER_STATS_MIN:
nr_ins = va_gather_min_path_stats(first_block, &counts);
nr_ins = va_gather_min_path_stats(first_block, ctx->arch, &counts);
break;
case GATHER_STATS_MAX:
visited = BITSET_RZALLOC(NULL, ctx->num_blocks);
nr_ins = va_gather_max_path_stats(first_block, &counts, visited);
nr_ins =
va_gather_max_path_stats(first_block, ctx->arch, &counts, visited);
ralloc_free(visited);
break;
}
@ -4509,7 +4531,7 @@ bi_compile_variant_nir(nir_shader *nir,
va_lower_constants(ctx, I, const_hist, min_count_for_fau);
bi_builder b = bi_init_builder(ctx, bi_before_instr(I));
va_repair_fau(&b, I);
va_repair_fau(&b, I, ctx->arch);
}
_mesa_hash_table_u64_destroy(const_hist);
@ -4611,7 +4633,7 @@ bi_compile_variant_nir(nir_shader *nir,
bifrost_debug & BIFROST_DBG_VERBOSE);
} else {
disassemble_valhall(stderr, binary->data + offset,
binary->size - offset,
binary->size - offset, ctx->arch,
bifrost_debug & BIFROST_DBG_VERBOSE);
}
@ -4679,7 +4701,7 @@ bi_compile_variant(nir_shader *nir,
uint64_t preload = first_block->reg_live_in;
/* If multisampling is used with a blend shader, the blend shader needs
* to access the sample coverage mask in r60 and the sample ID in r61.
* to access the sample coverage mask and the sample ID.
* Blend shaders run in the same context as fragment shaders, so if a
* blend shader could run, we need to preload these registers
* conservatively. There is believed to be little cost to doing so, so
@ -4690,7 +4712,10 @@ bi_compile_variant(nir_shader *nir,
* driver. We could unify the paths if the cost is acceptable.
*/
if (nir->info.stage == MESA_SHADER_FRAGMENT && ctx->arch >= 9)
preload |= BITFIELD64_BIT(60) | BITFIELD64_BIT(61);
preload |=
BITFIELD64_BIT(
bi_preload_reg(BI_PRELOAD_CUMULATIVE_COVERAGE, ctx->arch)) |
BITFIELD64_BIT(bi_preload_reg(BI_PRELOAD_SAMPLE_ID, ctx->arch));
info->ubo_mask |= ctx->ubo_mask;
info->tls_size = MAX2(info->tls_size, ctx->info.tls_size);

View file

@ -48,7 +48,8 @@ disassemble(const char *filename)
}
if (pan_arch(gpu_id) >= 9)
disassemble_valhall(stdout, entrypoint, filesize, verbose);
disassemble_valhall(stdout, entrypoint, filesize, pan_arch(gpu_id),
verbose);
else
disassemble_bifrost(stdout, entrypoint, filesize, verbose);

View file

@ -1162,25 +1162,25 @@ bi_preload_reg(enum bi_preload val, unsigned arch)
/* Compute */
case BI_PRELOAD_LOCAL_ID_0:
/* Bits [15;0] */
return 55;
return (arch >= 15) ? 4 : 55;
case BI_PRELOAD_LOCAL_ID_1:
/* Bits [31;16] */
return 55;
return (arch >= 15) ? 4 : 55;
case BI_PRELOAD_LOCAL_ID_2:
/* Bits [15;0] */
return 56;
return (arch >= 15) ? 3 : 56;
case BI_PRELOAD_WORKGROUP_ID_0:
return 57;
return (arch >= 15) ? 5 : 57;
case BI_PRELOAD_WORKGROUP_ID_1:
return 58;
return (arch >= 15) ? 6 : 58;
case BI_PRELOAD_WORKGROUP_ID_2:
return 59;
return (arch >= 15) ? 7 : 59;
case BI_PRELOAD_GLOBAL_ID_0:
return 60;
return (arch >= 15) ? 0 : 60;
case BI_PRELOAD_GLOBAL_ID_1:
return 61;
return (arch >= 15) ? 1 : 61;
case BI_PRELOAD_GLOBAL_ID_2:
return 62;
return (arch >= 15) ? 2 : 62;
/* Vertex */
case BI_PRELOAD_POS_RESULT_PTR_LO:
assert(arch < 9);
@ -1190,58 +1190,58 @@ bi_preload_reg(enum bi_preload val, unsigned arch)
return 59;
case BI_PRELOAD_INTERNAL_ID:
assert(arch >= 9);
return 59;
return (arch >= 15) ? 2 : 59;
case BI_PRELOAD_VERTEX_ID:
return (arch >= 9) ? 60 : 61;
return (arch >= 15) ? 0 : (arch >= 9) ? 60 : 61;
case BI_PRELOAD_INSTANCE_ID:
return (arch >= 9) ? 61 : 62;
return (arch >= 15) ? 1 : (arch >= 9) ? 61 : 62;
case BI_PRELOAD_DRAW_ID:
assert(arch >= 9);
return 62;
return (arch >= 15) ? 3 : 62;
case BI_PRELOAD_VIEW_ID:
assert(arch >= 9);
return 63;
return (arch >= 15) ? 4 : 63;
/* Fragment */
case BI_PRELOAD_PRIMITIVE_ID:
return 57;
return (arch >= 15) ? 6 : 57;
case BI_PRELOAD_PRIMITIVE_FLAGS:
return 58;
return (arch >= 15) ? 3 : 58;
case BI_PRELOAD_POSITION_XY:
return 59;
return (arch >= 15) ? 2 : 59;
case BI_PRELOAD_CUMULATIVE_COVERAGE:
/* Bits [15;0] */
return 60;
return (arch >= 15) ? 0 : 60;
case BI_PRELOAD_RASTERIZER_COVERAGE:
/* Bits [15;0] */
return 61;
return (arch >= 15) ? 1 : 61;
case BI_PRELOAD_SAMPLE_ID:
/* Bits [23;16] */
return 61;
return (arch >= 15) ? 0 : 61;
case BI_PRELOAD_CENTROID_ID:
/* Bits [31;24] */
return 61;
return (arch >= 15) ? 0 : 61;
case BI_PRELOAD_FRAME_ARG:
/* Double reg */
return 62;
return (arch >= 15) ? 4 : 62;
/* Blend */
case BI_PRELOAD_BLEND_SRC0_C0:
return 0;
return (arch >= 15) ? 8 : 0;
case BI_PRELOAD_BLEND_SRC0_C1:
return 1;
return (arch >= 15) ? 9 : 1;
case BI_PRELOAD_BLEND_SRC0_C2:
return 2;
return (arch >= 15) ? 10 : 2;
case BI_PRELOAD_BLEND_SRC0_C3:
return 3;
return (arch >= 15) ? 11 : 3;
case BI_PRELOAD_BLEND_SRC1_C0:
return 4;
return (arch >= 15) ? 12 : 4;
case BI_PRELOAD_BLEND_SRC1_C1:
return 5;
return (arch >= 15) ? 13 : 5;
case BI_PRELOAD_BLEND_SRC1_C2:
return 6;
return (arch >= 15) ? 14 : 6;
case BI_PRELOAD_BLEND_SRC1_C3:
return 7;
return (arch >= 15) ? 15 : 7;
case BI_PRELOAD_BLEND_LINK:
return 48;
return (arch >= 15) ? 7 : 48;
}
UNREACHABLE("Non-handled BI_PRELOAD");
}
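
For illustration (values read straight off the table above): the same logical preload now lands in different hardware registers per architecture; v15 moves most preloads out of the r48..r63 window into low registers and shifts the blend source colors up to r8..r15 to make room.

    #include <assert.h>

    static void check_preload_remap(void)
    {
       assert(bi_preload_reg(BI_PRELOAD_VERTEX_ID, 9) == 60);
       assert(bi_preload_reg(BI_PRELOAD_VERTEX_ID, 15) == 0);
       assert(bi_preload_reg(BI_PRELOAD_SAMPLE_ID, 9) == 61);
       assert(bi_preload_reg(BI_PRELOAD_SAMPLE_ID, 15) == 0);
       assert(bi_preload_reg(BI_PRELOAD_BLEND_SRC0_C0, 9) == 0);
       assert(bi_preload_reg(BI_PRELOAD_BLEND_SRC0_C0, 15) == 8);
    }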

File diff suppressed because it is too large

View file

@ -29,16 +29,20 @@ class FAUState:
die_if(self.page is not None and self.page != page, 'Mismatched pages')
self.page = page
def push(self, source):
if not (source & (1 << 7)):
# Skip registers
def push(self, source, arch):
# Skip registers
if arch >= 15 and not (source & (1 << 8)):
return
elif arch < 15 and not (source & (1 << 7)):
return
self.buffer.add(source)
die_if(len(self.buffer) > 2, "Overflowed FAU buffer")
if (source >> 5) == 0b110:
# Small constants need to check if the buffer overflows, but nothing else
# Small constants need to check if the buffer overflows, but nothing else
if arch >= 15 and (source >> 5) == 0b1110:
return
elif arch < 15 and (source >> 5) == 0b110:
return
slot = (source >> 1)
@ -120,6 +124,50 @@ def encode_source(op, fau):
die('Invalid operand')
def encode_source_v15(op, fau):
# Reg tuple
if op[0] == '[' and op[-1:] == ']':
# Remove brackets and split on ":"
unpacked = op[1:-1].split(":")
die_if(len(unpacked) != 2, 'Invalid tuple')
die_if(unpacked[0][0] != 'r', 'Invalid tuple')
die_if(unpacked[1][0] != 'r', 'Invalid tuple')
if (unpacked[0][-1:] == '^'):
val0 = parse_int(unpacked[0][1:-1], 0, 127)
val1 = parse_int(unpacked[1][1:-1], 0, 127)
die_if(val1 != val0 + 1, 'Invalid tuple value')
return val0 | 0x80
else:
val0 = parse_int(unpacked[0][1:], 0, 127)
val1 = parse_int(unpacked[1][1:], 0, 127)
die_if(val1 != val0 + 1, 'Invalid tuple value')
return val0
elif op[0] == 'r':
if (op[-1:] == '^'):
return parse_int(op[1:-1], 0, 127) | 0x80
return parse_int(op[1:], 0, 127)
elif op[0] == 'u':
val = parse_int(op[1:], 0, 254)
fau.set_page(val >> 6)
return ((val & 0x3F) << 1) | 0x100
elif op[0] == 'i':
return int(op[3:]) | 0x1C0
elif op.startswith('0x'):
try:
val = int(op, base=0)
except ValueError:
die('Expected value')
die_if(val not in immediates, 'Unexpected immediate value')
return immediates.index(val) | 0x1C0
else:
for i in [0, 1, 3]:
if op in enums[f'fau_special_page_{i}'].bare_values:
idx = 32 + (enums[f'fau_special_page_{i}'].bare_values.index(op) << 1)
fau.set_page(i)
return idx | 0x1E0
die('Invalid operand')
def encode_dest(op):
# Reg tuple
@ -156,7 +204,47 @@ def encode_dest(op):
return value | (wrmask << 6)
def parse_asm(line):
def encode_dest_v15(op, dst64):
# Reg tuple
if op[0] == '[' and op[-1:] == ']':
# Remove brackets and split on ":"
unpacked = op[1:-1].split(":")
die_if(len(unpacked) != 2, 'Invalid tuple')
die_if(unpacked[0][0] != 'r', 'Invalid tuple')
die_if(unpacked[1][0] != 'r', 'Invalid tuple')
parts = unpacked[0].split(".")
reg = parts[0]
value = parse_int(reg[1:], 0, 127)
parts1 = unpacked[1].split(".")
reg1 = parts1[0]
val1 = parse_int(reg1[1:], 0, 127)
die_if(val1 != value + 1, 'Invalid tuple value')
else:
die_if(op[0] != 'r', f"Expected register destination {op}")
parts = op.split(".")
reg = parts[0]
value = parse_int(reg[1:], 0, 127)
# Default to writing in full
if (dst64):
wrmask = 0x0
die_if(len(parts) > 1, "Must write full")
else:
wrmask = 0x3
if len(parts) > 1:
WMASKS = ["h0", "h1"]
die_if(len(parts) > 2, "Too many modifiers")
mask = parts[1]
die_if(mask not in WMASKS, "Expected a write mask")
wrmask = 1 << WMASKS.index(mask)
return value | (wrmask << 13)
def parse_asm(line, arch):
global LINE
LINE = line # For better errors
encoded = 0
@ -187,7 +275,7 @@ def parse_asm(line):
tail = line[(len(head) + 1):]
operands = [x.strip() for x in tail.split(",") if len(x.strip()) > 0]
expected_op_count = len(ins.srcs) + len(ins.dests) + len(ins.immediates) + len(ins.staging)
expected_op_count = len(ins.srcs) + len(ins.dests) + len((ins.immediates_v15 if arch >= 15 else ins.immediates)) + len(ins.staging)
if len(operands) != expected_op_count:
die(f"Wrong number of operands in {line}, expected {expected_op_count}, got {len(operands)} {operands}")
@ -200,9 +288,9 @@ def parse_asm(line):
parts = []
die_if(any([x[0] != 'r' for x in parts]), f'Expected registers, got {op}')
regs = [parse_int(x[1:], 0, 63) for x in parts]
regs = [parse_int(x[1:], 0, (127 if arch >= 15 else 63)) for x in parts]
extended_write = "staging_register_write_count" in [x.name for x in ins.modifiers] and sr.write
extended_write = "staging_register_write_count" in [x.name for x in (ins.modifiers_v15 if arch >= 15 else ins.modifiers)] and sr.write
max_sr_count = 8 if extended_write else 7
sr_count = len(regs)
@ -215,22 +303,31 @@ def parse_asm(line):
'Consecutive staging registers must be aligned to a register pair')
if sr.count == 0:
if "staging_register_write_count" in [x.name for x in ins.modifiers] and sr.write:
if "staging_register_write_count" in [x.name for x in (ins.modifiers_v15 if arch >= 15 else ins.modifiers)] and sr.write:
modifier_map["staging_register_write_count"] = sr_count - 1
else:
assert "staging_register_count" in [x.name for x in ins.modifiers]
assert "staging_register_count" in [x.name for x in (ins.modifiers_v15 if arch >= 15 else ins.modifiers)]
modifier_map["staging_register_count"] = sr_count
else:
die_if(sr_count != sr.count, f"Expected {sr.count} staging registers, got {sr_count}")
encoded |= ((sr.encoded_flags | base) << sr.start)
encoded |= base << sr.start
if arch >= 15:
encoded |= sr.encoded_flags_v15 << sr.offset['flags_v15']
else:
encoded |= sr.encoded_flags << sr.offset['flags']
# On v15, some instructions require special sr_control values
if arch >= 15 and ins.name == "BARRIER":
encoded |= 0b10 << 38
operands = operands[len(ins.staging):]
for op, dest in zip(operands, ins.dests):
encoded |= encode_dest(op) << 40
encoded |= (encode_dest_v15(op, dest.size >= 64) if arch >= 15 else encode_dest(op)) << 40
operands = operands[len(ins.dests):]
if len(ins.dests) == 0 and len(ins.staging) == 0:
if arch < 15 and len(ins.dests) == 0 and len(ins.staging) == 0:
# Set a placeholder writemask to prevent encoding faults
encoded |= (0xC0 << 40)
@ -238,12 +335,18 @@ def parse_asm(line):
for i, (op, src) in enumerate(zip(operands, ins.srcs)):
parts = op.split('.')
encoded_src = encode_source(parts[0], fau)
# Require a word selection for special FAU values
may_have_word_select = ((encoded_src >> 5) == 0b111)
# or for regular FAU values
may_have_word_select |= ((encoded_src >> 6) == 0b10)
if (arch >= 15):
encoded_src = encode_source_v15(parts[0], fau)
# Require a word selection for special FAU values
may_have_word_select = ((encoded_src >> 5) == 0b1111)
# or for regular FAU values
may_have_word_select |= ((encoded_src >> 7) == 0b10)
else:
encoded_src = encode_source(parts[0], fau)
# Require a word selection for special FAU values
may_have_word_select = ((encoded_src >> 5) == 0b111)
# or for regular FAU values
may_have_word_select |= ((encoded_src >> 6) == 0b10)
# Has a swizzle been applied yet?
swizzled = False
@ -251,7 +354,11 @@ def parse_asm(line):
for mod in parts[1:]:
# Encode the modifier
if mod in src.offset and src.mask[mod] == 0x1:
encoded |= (1 << src.offset[mod])
# On v15, FMA_RSCALE has a different offset src2.neg
if arch >= 15 and ins.name[:10] == "FMA_RSCALE" and mod == "neg" and i == 2:
encoded |= (1 << (src.offset[mod] + 1))
else:
encoded |= (1 << src.offset[mod])
elif src.halfswizzle and mod in enums[f'half_swizzles_{src.size}_bit'].bare_values:
die_if(swizzled, "Multiple swizzles specified")
swizzled = True
@ -318,12 +425,15 @@ def parse_asm(line):
val = enums['swizzles_16_bit'].bare_values.index(mod)
encoded |= (val << src.offset['widen'])
encoded |= encoded_src << src.start
fau.push(encoded_src)
if arch >= 15:
encoded |= ((encoded_src & 0x100) << (src.offset['high1_v15'] - 8)) | ((encoded_src & 0xFF) << src.start)
else:
encoded |= encoded_src << src.start
fau.push(encoded_src, arch)
operands = operands[len(ins.srcs):]
for i, (op, imm) in enumerate(zip(operands, ins.immediates)):
for i, (op, imm) in enumerate(zip(operands, (ins.immediates_v15 if arch >= 15 else ins.immediates))):
if op[0] == '#':
die_if(imm.name != 'constant', "Wrong syntax for immediate")
parts = [imm.name, op[1:]]
@ -347,15 +457,15 @@ def parse_asm(line):
encoded |= (val << imm.start)
operands = operands[len(ins.immediates):]
operands = operands[len((ins.immediates_v15 if arch >= 15 else ins.immediates)):]
# Encode the operation itself
for subcode in ins.opcode:
for subcode in (ins.opcode_v15 if arch >= 15 else ins.opcode):
encoded |= (subcode.value << subcode.start)
# Encode FAU page
if fau.page:
encoded |= (fau.page << ins.offset['fau_page'])
encoded |= (fau.page << (ins.offset['fau_page_v15'] if arch >= 15 else ins.offset['fau_page']))
# Encode modifiers
has_flow = False
@ -366,9 +476,10 @@ def parse_asm(line):
if mod in enums['flow'].bare_values:
die_if(has_flow, "Multiple flow control modifiers specified")
has_flow = True
encoded |= (enums['flow'].bare_values.index(mod) << ins.offset['flow'])
encoded |= (enums['flow'].bare_values.index(mod) << (ins.offset['flow_v15'] if arch >= 15 else
ins.offset['flow']))
else:
candidates = [c for c in ins.modifiers if mod in c.bare_values]
candidates = [c for c in (ins.modifiers_v15 if arch >= 15 else ins.modifiers) if mod in c.bare_values]
die_if(len(candidates) == 0, f"Invalid modifier {mod} used")
assert(len(candidates) == 1) # No ambiguous modifiers
@ -380,13 +491,20 @@ def parse_asm(line):
die_if(opts.name in modifier_map, f"{opts.name} specified twice")
modifier_map[opts.name] = value
for mod in ins.modifiers:
for mod in (ins.modifiers_v15 if arch >= 15 else ins.modifiers):
value = modifier_map.get(mod.name, mod.default)
die_if(value is None, f"Missing required modifier {mod.name}")
assert(value < (1 << mod.size))
encoded |= (value << mod.start)
# On v15, some instructions require an encoded null src.
requires_nullsrc = ['BARRIER', 'NOP', 'LD_GCLK_U64', 'LD_VAR_FLAT_IMM', 'LD_VAR_BUF_FLAT_IMM']
if arch >= 15 and ins.name in requires_nullsrc:
enc_src = 0x1C0
encoded |= ((enc_src >> 8) & 0x1) << 48 | (enc_src & 0xFF)
return encoded
if __name__ == "__main__":

View file

@ -28,6 +28,10 @@ template = """
#define VA_SRC_UNIFORM_TYPE 0x2
#define VA_SRC_IMM_TYPE 0x3
#define VA_SRC_V15_MODE1 BIT(8)
#define VA_SRC_V15_MODE2 BIT(7)
#define VA_SRC_V15_MODE4 BIT(5)
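
A standalone sketch (not from the diff) of the 9-bit source decode these mode bits imply, mirroring va_print_src_v15 below; it also matches what encode_source_v15 in the assembler emits (0x100 | ... for uniforms, 0x1C0 | ... for immediates, 0x1E0 | ... for FAU specials). FAU page handling and name lookup are omitted.

    #include <stdio.h>

    #define BIT(n) (1u << (n))
    #define MASK(n) (BIT(n) - 1)

    static void decode_v15_src(unsigned src) /* src is the 9-bit operand */
    {
       if (!(src & BIT(8)))         /* VA_SRC_V15_MODE1 clear: register */
          printf("r%u%s\n", src & MASK(7), (src & BIT(7)) ? "^" : "");
       else if (!(src & BIT(7)))    /* VA_SRC_V15_MODE2 clear: uniform */
          printf("u%u.w%u\n", (src & MASK(7)) >> 1, src & 1);
       else if (src & BIT(5))       /* VA_SRC_V15_MODE4 set: FAU special */
          printf("fau_special[%u].w%u\n", (src & MASK(5)) >> 1, src & 1);
       else                         /* immediate LUT entry */
          printf("imm[%u]\n", src & MASK(5));
    }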
% for name, en in ENUMS.items():
UNUSED static const char *valhall_${name}[] = {
% for v in en.values:
@ -91,22 +95,84 @@ va_print_float_src(FILE *fp, unsigned type, unsigned value, unsigned size, unsig
fprintf(fp, ".abs");
}
static inline void
va_print_src_v15(FILE *fp, unsigned high1, unsigned low8, unsigned size, unsigned fau_page)
{
unsigned src = (high1 << 8) | low8;
/* Not reg */
if (src & VA_SRC_V15_MODE1) {
/* Not uniform */
if (src & VA_SRC_V15_MODE2) {
/* FAU special */
if (src & VA_SRC_V15_MODE4) {
unsigned value = src & MASK(5);
if (fau_page == 0)
fputs(valhall_fau_special_page_0[value >> 1] + 1, fp);
else if (fau_page == 1)
fputs(valhall_fau_special_page_1[value >> 1] + 1, fp);
else if (fau_page == 3)
fputs(valhall_fau_special_page_3[value >> 1] + 1, fp);
else
fprintf(fp, "reserved_page2");
fprintf(fp, ".w%u", value & 1);
}
/* Imm */
else {
unsigned value = src & MASK(5);
assert(value < 32 && "overflow in LUT");
fprintf(fp, "0x%X", va_immediates[value]);
}
}
/* Uniform */
else {
unsigned value = src & MASK(7);
fprintf(fp, "u%u", value >> 1 | (fau_page << 6));
if (size <= 32)
fprintf(fp, ".w%u", value & 1);
}
}
/* Reg */
else {
unsigned value = src & MASK(7);
bool discard = (src & BIT(7));
char *dmark = discard ? "^" : "";
if (size > 32)
fprintf(fp, "[r%u%s:r%u%s]", value, dmark, value + 1, dmark);
else
fprintf(fp, "r%u%s", value, dmark);
}
}
static inline void
va_print_float_src_v15(FILE *fp, unsigned high1, unsigned low8, unsigned size, unsigned fau_page, bool neg, bool abs)
{
va_print_src_v15(fp, high1, low8, size, fau_page);
if (neg)
fprintf(fp, ".neg");
if (abs)
fprintf(fp, ".abs");
}
static inline void
va_print_dest(FILE *fp, unsigned mask, unsigned value, unsigned size)
{
if (size > 32)
fprintf(fp, "[r%u:r%u]", value, value + 1);
else
else {
fprintf(fp, "r%u", value);
if (mask != 0x3)
fprintf(fp, ".h%u", (mask == 1) ? 0 : 1);
if (mask != 0x3)
fprintf(fp, ".h%u", (mask == 1) ? 0 : 1);
}
}
<%def name="print_instr(op)">
<%def name="print_instr(op, v15)">
<% no_comma = True %>
fputs("${op.name}", fp);
% for mod in op.modifiers:
% for mod in (op.modifiers_v15 if v15 else op.modifiers):
% if mod.name not in ["staging_register_count", "staging_register_write_count"]:
% if mod.is_enum:
fputs(valhall_${safe_name(mod.enum)}[(instr >> ${mod.start}) & ${hex((1 << mod.size) - 1)}], fp);
@ -115,10 +181,18 @@ va_print_dest(FILE *fp, unsigned mask, unsigned value, unsigned size)
% endif
% endif
% endfor
% if v15:
fprintf(fp, "%s ", valhall_flow[(instr >> ${op.offset['flow_v15']}) & ${hex(op.mask['flow_v15'])}]);
% else:
fprintf(fp, "%s ", valhall_flow[(instr >> ${op.offset['flow']}) & ${hex(op.mask['flow'])}]);
% endif
% for i, dest in enumerate(op.dests):
<% no_comma = False %>
% if v15:
va_print_dest(fp, (instr >> ${dest.offset['mode_v15']}) & ${hex(dest.mask['mode_v15'])}, (instr >> ${dest.offset['value_v15']}) & ${hex(dest.mask['value_v15'])}, ${dest.size});
% else:
va_print_dest(fp, (instr >> ${dest.offset['mode']}) & ${hex(dest.mask['mode'])}, (instr >> ${dest.offset['value']}) & ${hex(dest.mask['value'])}, ${dest.size});
% endif
% endfor
% for index, sr in enumerate(op.staging):
% if not no_comma:
@ -130,13 +204,12 @@ va_print_dest(FILE *fp, unsigned mask, unsigned value, unsigned size)
if sr.count != 0:
sr_count = sr.count;
else:
for mod in op.modifiers:
for mod in (op.modifiers_v15 if v15 else op.modifiers):
if mod.name == "staging_register_write_count" and sr.write:
sr_count = f"(((instr >> {mod.start}) & {hex((1 << mod.size) - 1)}) + 1)";
elif mod.name == "staging_register_count":
sr_count = f"((instr >> {mod.start}) & {hex((1 << mod.size) - 1)})";
%>
// assert(((instr >> ${sr.start}) & 0xC0) == ${sr.encoded_flags});
fprintf(fp, "@");
for (unsigned i = 0; i < ${sr_count}; ++i) {
fprintf(fp, "%sr%u", (i == 0) ? "" : ":",
@ -148,6 +221,28 @@ va_print_dest(FILE *fp, unsigned mask, unsigned value, unsigned size)
fputs(", ", fp);
% endif
<% no_comma = False %>
% if v15:
% if src.absneg:
va_print_float_src_v15(fp, (instr >> ${src.offset['high1_v15']}) & ${hex(src.mask['high1_v15'])}, (instr >> ${src.offset['low8_v15']}) & ${hex(src.mask['low8_v15'])},
${src.size}, (instr >> ${op.offset['fau_page_v15']}) & ${hex(op.mask['fau_page_v15'])},
% if op.name[:4] == "FMA." and i == 0:
false,
instr & BIT(${src.offset['abs']}));
% elif op.name[:10] == "FMA_RSCALE" and i == 2:
instr & BIT(${src.offset['neg'] + 1}),
false);
% else:
instr & BIT(${src.offset['neg']}),
instr & BIT(${src.offset['abs']}));
% endif
% elif src.is_float:
va_print_float_src_v15(fp, (instr >> ${src.offset['high1_v15']}) & ${src.mask['high1_v15']}, (instr >> ${src.offset['low8_v15']}) & ${hex(src.mask['low8_v15'])},
${src.size}, (instr >> ${op.offset['fau_page_v15']}) & ${hex(op.mask['fau_page_v15'])}, false, false);
% else:
va_print_src_v15(fp, (instr >> ${src.offset['high1_v15']}) & ${src.mask['high1_v15']}, (instr >> ${src.offset['low8_v15']}) & ${hex(src.mask['low8_v15'])},
${src.size}, (instr >> ${op.offset['fau_page_v15']}) & ${hex(op.mask['fau_page_v15'])});
% endif
% else:
% if src.absneg:
va_print_float_src(fp, (instr >> ${src.offset['mode']}) & ${hex(src.mask['mode'])}, (instr >> ${src.offset['value']}) & ${hex(src.mask['value'])},
${src.size}, (instr >> ${op.offset['fau_page']}) & ${hex(op.mask['fau_page'])},
@ -160,6 +255,7 @@ va_print_dest(FILE *fp, unsigned mask, unsigned value, unsigned size)
va_print_src(fp, (instr >> ${src.offset['mode']}) & ${src.mask['mode']}, (instr >> ${src.offset['value']}) & ${hex(src.mask['value'])},
${src.size}, (instr >> ${op.offset['fau_page']}) & ${hex(op.mask['fau_page'])});
% endif
% endif
% if src.swizzle:
% if src.size == 32:
fputs(valhall_widen[(instr >> ${src.offset['swizzle']}) & ${hex(src.mask['swizzle'])}], fp);
@ -183,7 +279,7 @@ va_print_dest(FILE *fp, unsigned mask, unsigned value, unsigned size)
if (instr & BIT(${src.offset['not']})) fputs(".not", fp);
% endif
% endfor
% for imm in op.immediates:
% for imm in (op.immediates_v15 if v15 else op.immediates):
<%
prefix = "#" if imm.name == "constant" else imm.name + ":"
fmt = "%d" if imm.signed else "0x%X"
@ -192,16 +288,16 @@ va_print_dest(FILE *fp, unsigned mask, unsigned value, unsigned size)
% endfor
</%def>
<%def name="recurse_subcodes(op_bucket)">
<%def name="recurse_subcodes(op_bucket, v15)">
%if op_bucket.instr:
${print_instr(op_bucket.instr)}
${print_instr(op_bucket.instr, v15)}
%else:
opcode = (instr >> ${op_bucket.start}) & ${hex(op_bucket.mask)};
switch (opcode) {
%for op in op_bucket.children:
case ${hex(op)}:
{
${recurse_subcodes(op_bucket.children[op])}
${recurse_subcodes(op_bucket.children[op], v15)}
break;
}
%endfor
@ -215,7 +311,15 @@ va_disasm_instr(FILE *fp, uint64_t instr)
{
unsigned opcode;
${recurse_subcodes(OPCODES)}
${recurse_subcodes(OPCODES, False)}
}
void
va_disasm_instr_v15(FILE *fp, uint64_t instr)
{
unsigned opcode;
${recurse_subcodes(OPCODES_V15, True)}
}
static bool is_branch(uint64_t instr)
@ -229,8 +333,19 @@ static bool is_branch(uint64_t instr)
return false;
}
static bool is_branch_v15(uint64_t instr)
{
<% (exact, mask) = OPCODES_V15.get_exact_mask("BRANCHZ") %>
if ((instr & ${hex(mask)}) == ${hex(exact)})
return true;
<% (exact, mask) = OPCODES_V15.get_exact_mask("BRANCHZI") %>
if ((instr & ${hex(mask)}) == ${hex(exact)})
return true;
return false;
}
void
disassemble_valhall(FILE *fp, const void *code, size_t size, bool verbose)
disassemble_valhall(FILE *fp, const void *code, size_t size, unsigned arch, bool verbose)
{
assert((size & 7) == 0);
@ -256,11 +371,18 @@ disassemble_valhall(FILE *fp, const void *code, size_t size, bool verbose)
fprintf(fp, " ");
}
va_disasm_instr(fp, instr);
bool instr_is_branch;
if (arch >= 15) {
va_disasm_instr_v15(fp, instr);
instr_is_branch = is_branch_v15(instr);
} else {
va_disasm_instr(fp, instr);
instr_is_branch = is_branch(instr);
}
fprintf(fp, "\\n");
/* Separate blocks visually by inserting whitespace after branches */
if (is_branch(instr))
if (instr_is_branch)
fprintf(fp, "\\n");
}
@ -276,6 +398,9 @@ class OpBucket:
self.children = {}
def insert(self, subcodes, ins):
# Need an early return in case of removed instructions
if subcodes is None:
return
if len(subcodes) == 0:
self.instr = ins
else:
@ -305,10 +430,12 @@ class OpBucket:
# Build opcode hierarchy:
OPCODES = OpBucket()
OPCODES_V15 = OpBucket()
for ins in instructions:
OPCODES.insert(ins.opcode, ins)
OPCODES_V15.insert(ins.opcode_v15, ins)
try:
print(Template(template).render(OPCODES = OPCODES, IMMEDIATES = immediates, ENUMS = enums, typesize = typesize, safe_name = safe_name))
print(Template(template).render(OPCODES = OPCODES, OPCODES_V15 = OPCODES_V15, IMMEDIATES = immediates, ENUMS = enums, typesize = typesize, safe_name = safe_name))
except:
print(exceptions.text_error_template().render())

View file

@ -15,6 +15,8 @@
#include <string.h>
void va_disasm_instr(FILE *fp, uint64_t instr);
void disassemble_valhall(FILE *fp, const void *code, size_t size, bool verbose);
void va_disasm_instr_v15(FILE *fp, uint64_t instr);
void disassemble_valhall(FILE *fp, const void *code, size_t size, unsigned arch,
bool verbose);
#endif

View file

@ -44,9 +44,7 @@ libpanfrost_valhall_disasm = static_library(
)
if with_tests
test(
'valhall_disasm',
executable(
valhall_disasm_test_e = executable(
'valhall_disasm_test',
files('test/test-disassembler.c'),
c_args : [c_msvc_compat_args, no_override_init_args],
@ -54,15 +52,33 @@ if with_tests
include_directories : [inc_include, inc_src],
dependencies: [idep_valhall_enums_h],
link_with : [libpanfrost_valhall_disasm],
),
)
test(
'valhall_disasm',
valhall_disasm_test_e,
suite : ['panfrost'],
args : files('test/assembler-cases.txt'),
args : [files('test/assembler-cases.txt'), 'v10'],
)
test(
'valhall_disasm',
valhall_disasm_test_e,
suite : ['panfrost'],
args : [files('test/assembler-cases-v15.txt'), 'v15'],
)
test(
'valhall_asm',
prog_python,
args : files('test-assembly.py', 'test/assembler-cases.txt', 'test/negative-cases.txt'),
args : [files('test-assembly.py', 'test/assembler-cases.txt', 'test/negative-cases.txt'), 'v10'],
suite : ['panfrost'],
)
test(
'valhall_asm',
prog_python,
args : [files('test-assembly.py', 'test/assembler-cases-v15.txt', 'test/negative-cases.txt'), 'v15'],
suite : ['panfrost'],
)
endif

View file

@ -17,19 +17,19 @@ def hex_8(u64):
return ' '.join(as_strings)
# These should not throw exceptions
def positive_test(machine, assembly):
def positive_test(machine, assembly, arch):
try:
expected = parse_hex_8(machine)
val = parse_asm(assembly)
val = parse_asm(assembly, arch)
if val != expected:
return f"{hex_8(val)} Incorrect assembly"
except ParseError as exc:
return f"Unexpected exception: {exc}"
# These should throw exceptions
def negative_test(assembly):
def negative_test(assembly, arch):
try:
parse_asm(assembly)
parse_asm(assembly, arch)
return "Expected exception"
except Exception:
return None
@ -43,24 +43,34 @@ def record_case(case, error):
else:
FAIL.append((case, error))
if len(sys.argv) < 3:
print("Expected positive and negative case lists")
if len(sys.argv) < 4:
print("Expected positive and negative case lists, followed by arch")
sys.exit(1)
if sys.argv[3][0] == 'v':
try:
arch = int(sys.argv[3][1:], base = 0)
except ValueError:
print(f"Expected arch number {sys.argv[3][1:]}")
sys.exit(1)
else:
print(f"Expected arch version {sys.argv[3]}")
sys.exit(1)
with open(sys.argv[1], "r") as f:
cases = f.read().split('\n')
cases = [x for x in cases if len(x) > 0 and x[0] != '#']
for case in cases:
(machine, assembly) = case.split(' ')
record_case(case, positive_test(machine, assembly))
record_case(case, positive_test(machine, assembly, arch))
with open(sys.argv[2], "r") as f:
cases = f.read().split('\n')
cases = [x for x in cases if len(x) > 0]
for case in cases:
record_case(case, negative_test(case))
record_case(case, negative_test(case, arch))
print("Passed {}/{} tests.".format(len(PASS), len(PASS) + len(FAIL)))

View file

@ -0,0 +1,195 @@
02 00 20 00 00 01 60 00 MOV.i32 r1, r2
0a 00 20 00 00 01 61 00 MOV.i32 r1, u5.w0
e3 00 20 00 00 01 61 40 MOV.i32 r1, thread_local_pointer.w1
e6 00 20 00 00 01 61 40 MOV.i32 r1, workgroup_local_pointer.w0
e2 00 20 00 00 01 61 c0 MOV.i32 r1, lane_id.w0
e6 00 20 00 00 01 61 c0 MOV.i32 r1, core_id.w0
01 02 00 00 00 00 f0 00 FADD.f32 r0, r1, r2
01 02 00 00 20 00 f0 00 FADD.f32 r0, r1, r2.abs
01 02 00 00 10 00 f0 00 FADD.f32 r0, r1, r2.neg
01 02 00 00 30 00 f0 00 FADD.f32 r0, r1, r2.neg.abs
01 02 00 80 30 00 f0 00 FADD.f32.clamp_m1_1 r0, r1, r2.neg.abs
81 03 00 00 00 00 b8 2a BRANCHZ.reconverge r1^, offset:3
01 d0 00 00 00 00 f2 00 FADD.f32 r0, r1, 0x3F800000
01 d0 00 00 10 00 f2 00 FADD.f32 r0, r1, 0x3F800000.neg
01 c0 00 00 00 00 f2 00 FADD.f32 r0, r1, 0x0
01 c0 00 00 10 00 f2 00 FADD.f32 r0, r1, 0x0.neg
01 c9 00 00 00 00 e2 00 IADD.u32 r0, r1, 0x7060504
01 00 00 08 00 00 f0 00 FADD.f32 r0, r1, r0.h1
01 00 00 04 00 00 f0 00 FADD.f32 r0, r1, r0.h0
01 00 00 0c 00 00 f4 00 FADD.v2f16 r0, r1.h00, r0.h11
01 00 00 28 00 00 f4 00 FADD.v2f16 r0, r1, r0
01 00 00 24 00 00 f4 00 FADD.v2f16 r0, r1, r0.h10
01 02 00 08 00 00 e0 00 IADD.u32 r0, r1, r2.h0
01 02 00 0c 00 00 e0 00 IADD.u32 r0, r1, r2.h1
01 02 00 0c 70 00 e0 00 IADD.u32 r0, r1.b3, r2.h1
01 c9 00 18 00 00 e2 00 IADD.u32 r0, r1, 0x7060504.b2
01 02 00 08 20 00 e4 00 IADD.v2u16 r0, r1, r2
02 3c 47 20 00 00 91 02 SHADDX.u64 [r0:r1], u1, [r60:r61].w0, shift:0x2
80 00 00 00 19 00 20 07 LOAD.i32.slot0.wait0 @r0, [r0^:r1^], offset:0
00 bc 87 20 00 00 91 02 SHADDX.u64 [r0:r1], u0, [r60^:r61^].w0, shift:0x4
80 00 00 00 9c 04 20 3f STORE.i128.slot0.end @r4:r5:r6:r7, [r0^:r1^], offset:0
c0 00 e0 01 00 00 a1 3e NOP.end
80 c4 c0 1e 02 01 e6 01 ICMP_OR.u32.gt.m1 r1, r0^, 0x1000000.b3, 0x0
82 00 00 00 99 00 20 2b STORE.i32.slot0.reconverge @r0, [r2^:r3^], offset:0
00 c9 8f 12 30 00 e2 00 CLPER.i32.f1 r0, r0, 0x7060504.b00
00 00 4b 00 00 02 60 00 F16_TO_F32 r2, r0.h0
80 00 4b 10 00 03 60 00 F16_TO_F32 r3, r0^.h1
c0 00 e0 01 00 00 a1 22 NOP.wait0126
80 c0 00 28 90 00 f6 24 FADD.v2f16.wait r0, r0^.abs, 0x0.neg
c0 00 00 00 00 36 6d 00 IADD_IMM.i32 r54, 0x0, #0x0
3c d0 ea 00 01 3c d6 37 ATEST.discard @r60, r60, 0x3F800000, atest_datum.w0
80 db 05 04 00 01 e6 00 MKVEC.v2i16 r1, r0^.h0, 0x3C000000.h1
f0 00 3c 33 82 00 1b 3f BLEND.slot0.v4.f16.end @r0:r1, blend_descriptor_0.w0, r60, target:0x0
bb 0d 00 40 02 04 08 07 LEA_BUF_IMM.slot1.wait0 @r4:r5, r59^, table:0xD, index:0x0
00 dd c0 08 14 02 66 01 FMA.f32 r2, r0, 0x44000000.neg.h1, 0x0.neg
81 08 c0 00 04 01 66 01 FMA.f32 r1, r1^, u4.w0, 0x0.neg
80 08 c0 00 04 00 66 09 FMA.f32.wait1 r0, r0^, u4.w0, 0x0.neg
84 00 00 02 93 00 20 3f STORE.i96.estream.slot0.end @r0:r1:r2, [r4^:r5^], offset:0
84 00 00 01 9c 08 20 3f STORE.i128.istream.slot0.end @r8:r9:r10:r11, [r4^:r5^], offset:0
c0 00 00 c0 80 00 3d 27 BARRIER.slot7.wait
00 00 00 00 01 02 21 03 LOAD.i8.slot0 @r2, u0, offset:0
00 00 00 00 09 02 21 03 LOAD.i16.slot0 @r2, u0, offset:0
00 00 00 00 11 02 21 03 LOAD.i24.slot0 @r2, u0, offset:0
00 00 00 00 19 02 21 03 LOAD.i32.slot0 @r2, u0, offset:0
00 00 00 00 02 02 21 03 LOAD.i48.slot0 @r2:r3, u0, offset:0
00 00 00 00 0a 02 21 03 LOAD.i64.slot0 @r2:r3, u0, offset:0
00 00 00 00 13 02 21 03 LOAD.i96.slot0 @r2:r3:r4, u0, offset:0
00 00 00 00 1c 04 21 03 LOAD.i128.slot0 @r4:r5:r6:r7, u0, offset:0
00 00 00 08 01 02 21 03 LOAD.i8.b1.slot0 @r2, u0, offset:0
00 00 00 10 01 02 21 03 LOAD.i8.b2.slot0 @r2, u0, offset:0
00 00 00 18 01 02 21 03 LOAD.i8.b3.slot0 @r2, u0, offset:0
00 00 00 00 09 02 21 03 LOAD.i16.slot0 @r2, u0, offset:0
00 14 00 08 09 02 21 03 LOAD.i16.h1.slot0 @r2, u0, offset:20
82 00 4d 00 42 02 60 00 FROUND.f32.rtn r2, r2^.neg
82 00 4b 00 40 02 60 00 F16_TO_F32 r2, r2^.neg.h0
82 00 4c 00 43 02 60 00 F32_TO_S32.rtz r2, r2^.neg
82 c0 c6 47 48 02 64 00 FADD_IMM.f32 r2, r2^, #0x4847C6C0
82 84 67 ac 70 02 62 00 FADD_IMM.v2f16 r2, r2^, #0x70AC6784
82 14 00 13 00 02 6a 00 IADD_IMM.v2i16 r2, r2^, #0x130014
82 ab 4b 00 00 02 6c 00 IADD_IMM.i32 r2, r2^, #0x4BAB
83 82 c0 c6 12 02 e4 01 ICMP_OR.v2s16.gt.m1 r2, r3^.h10, r2^.h10, 0x0
83 82 c0 52 03 02 e4 01 FCMP_OR.v2f16.gt.m1 r2, r3^.h10, r2^.h00, 0x0
81 03 00 00 00 00 b8 2a BRANCHZ.reconverge r1^, offset:3
00 03 00 00 20 00 b8 2a BRANCHZ.reconverge r0.h0, offset:3
00 03 00 00 40 00 b8 2a BRANCHZ.reconverge r0.h1, offset:3
00 03 00 00 00 00 b8 2a BRANCHZ.reconverge r0, offset:3
c0 00 00 00 00 00 6d 00 IADD_IMM.i32 r0, 0x0, #0x0
c0 01 00 00 00 04 6d 28 IADD_IMM.i32.reconverge r4, 0x0, #0x1
00 00 47 20 00 02 91 02 SHADDX.u64 [r2:r3], u0, [r0:r1].w0, shift:0x2
80 c9 00 10 00 00 e2 00 IADD.u32 r0, r0^, 0x7060504.b0
00 02 c0 02 06 01 e6 01 ICMP_OR.u32.ne.m1 r1, r0, u1.w0, 0x0
04 00 20 00 00 05 60 00 MOV.i32 r5, r4
04 00 20 00 00 06 60 00 MOV.i32 r6, r4
04 00 20 00 00 07 60 04 MOV.i32.wait0 r7, r4
82 00 00 00 9c 04 20 03 STORE.i128.slot0 @r4:r5:r6:r7, [r2^:r3^], offset:0
81 f8 ff ff 07 00 b8 2a BRANCHZ.reconverge r1^, offset:-8
bd c0 00 08 10 3c c6 00 IADD.v2u16 r60.h1, r61^.h10, 0x0
84 00 86 32 8c 00 12 3f ST_CVT.slot0.istream.v4.f32.end @r0:r1:r2:r3, [r4^:r5^], r6^, offset:0x0
84 00 86 34 8c 00 12 3f ST_CVT.slot0.istream.v4.s32.end @r0:r1:r2:r3, [r4^:r5^], r6^, offset:0x0
84 00 86 36 8c 00 12 3f ST_CVT.slot0.istream.v4.u32.end @r0:r1:r2:r3, [r4^:r5^], r6^, offset:0x0
bc c0 12 00 2b 04 86 03 LEA_TEX_IMM.slot0 @r4:r5:r6, r60^, 0x0, table:0x2, index:0x1
bc c0 02 00 2b 04 86 03 LEA_TEX_IMM.slot0 @r4:r5:r6, r60^, 0x0, table:0x2, index:0x0
02 01 00 00 0a 02 8b 03 LD_PKA.i64.slot0 @r2:r3, u1.w0, u0.w1
00 01 00 40 0a 00 8b 03 LD_PKA.i64.slot1 @r0:r1, u0.w0, u0.w1
04 01 00 80 0a 26 8b 03 LD_PKA.i64.slot2 @r38:r39, u2.w0, u0.w1
03 01 00 80 0a 24 8b 03 LD_PKA.i64.slot2 @r36:r37, u1.w1, u0.w1
03 04 00 00 0a 02 8b 03 LD_PKA.i64.slot0 @r2:r3, u1.w1, u2.w0
81 02 00 00 13 02 8a 03 LD_PKA.i96.slot0 @r2:r3:r4, r1^, u1.w0
80 03 00 00 13 06 8a 07 LD_PKA.i96.slot0.wait0 @r6:r7:r8, r0^, u1.w1
80 00 80 01 c0 00 60 20 FRCP.f32.wait0126 r0, r0^.neg.abs
80 84 00 80 00 00 7c 01 MUX.i32.neg r0, r0^, r4^, u0.w0
80 84 00 80 04 00 7c 01 MUX.i32 r0, r0^, r4^, u0.w0
80 84 00 80 08 00 7c 01 MUX.i32.fp_zero r0, r0^, r4^, u0.w0
80 84 00 80 0c 00 7c 01 MUX.i32.bit r0, r0^, r4^, u0.w0
00 00 20 41 00 01 60 34 FREXPM.f32.sqrt.discard r1, r0
01 00 82 01 00 02 60 00 FRSQ.f32 r2, r1
80 00 22 41 00 00 60 00 FREXPE.f32.sqrt r0, r0^
81 82 c0 80 0a 00 64 02 FMA_RSCALE.f32.clamp_m1_1 r0, r1^, r2^, 0x0.neg, r0^
81 82 c0 80 0e 00 64 22 FMA_RSCALE.f32.left.wait0126 r0, r1^, r2^, 0x0.neg, r0^
82 83 04 05 00 01 7c 02 CSEL.u32.eq r1, r2^, r3^, u2.w0, u2.w1
82 83 04 05 08 01 7c 02 CSEL.u32.lt r1, r2^, r3^, u2.w0, u2.w1
82 83 04 05 48 01 7c 02 CSEL.s32.lt r1, r2^, r3^, u2.w0, u2.w1
3d 00 00 12 5a 02 18 07 LD_VAR_SPECIAL.v2.f32.sample.clobber.slot0.wait0 @r2:r3, r61, index:0x0
3d 00 00 3f 0a 02 10 07 LD_VAR_BUF_IMM.f16.slot0.v4.src_f16.center.retrieve.wait0 @r2:r3, r61, index:0x0
3d 00 00 3f 42 00 10 07 LD_VAR_BUF_IMM.f16.slot0.v4.src_f16.sample.store.wait0 @r0:r1, r61, index:0x0
3d 08 00 3f 22 00 10 07 LD_VAR_BUF_IMM.f16.slot0.v4.src_f16.centroid.store.wait0 @r0:r1, r61, index:0x8
bc bd 11 33 02 00 84 03 LD_ATTR_IMM.v4.f16.slot0 @r0:r1, r60^, r61^, index:0x1, table:0x1
80 3c 03 23 02 04 c0 03 LD_TILE.v3.f16.slot0 @r4:r5, r0^, r60, r3
00 c9 00 20 10 01 c6 00 IADD.v2u16 r1.h1, r0.h10, 0x7060504.b11
80 c0 00 08 10 01 a6 00 IADD.v2u16 r1.h0, r0^.h10, 0x0
02 02 00 04 20 02 a4 00 IADD.v2u16 r2.h0, r2, r2.h10
82 c0 05 00 00 02 e6 00 MKVEC.v2i16 r2, r2^.h0, 0x0.h0
b7 c0 05 00 00 02 e6 00 MKVEC.v2i16 r2, r55^.h0, 0x0.h0
b7 c0 05 10 00 02 e6 00 MKVEC.v2i16 r2, r55^.h1, 0x0.h0
c0 b7 05 00 00 02 e5 00 MKVEC.v2i16 r2, 0x0.h0, r55^.h0
c0 b7 05 04 00 02 e5 00 MKVEC.v2i16 r2, 0x0.h0, r55^.h1
b7 00 54 00 00 02 60 00 U16_TO_U32 r2, r55^.h0
b7 00 54 10 00 02 60 00 U16_TO_U32 r2, r55^.h1
b7 00 44 00 00 02 60 00 S16_TO_S32 r2, r55^.h0
b7 00 44 10 00 02 60 00 S16_TO_S32 r2, r55^.h1
c0 b7 01 08 00 02 e9 00 ISUB.s32 r2, 0x0, r55^.h0
c0 b7 01 0c 00 02 e9 00 ISUB.s32 r2, 0x0, r55^.h1
00 c0 c0 c0 c0 07 7e 01 MKVEC.v2i8 r7, r0.b3, 0x0.b0, 0x0
00 c0 c0 c0 80 06 7e 01 MKVEC.v2i8 r6, r0.b2, 0x0.b0, 0x0
00 c0 c0 c0 00 04 7e 01 MKVEC.v2i8 r4, r0.b0, 0x0.b0, 0x0
80 c0 c0 c0 40 05 7e 01 MKVEC.v2i8 r5, r0^.b1, 0x0.b0, 0x0
3d 00 00 ba 44 00 10 37 LD_VAR_BUF_IMM.f32.slot2.v4.src_f32.sample.store.discard @r0:r1:r2:r3, r61, index:0x0
3d 10 00 7a 0c 04 10 03 LD_VAR_BUF_IMM.f32.slot1.v4.src_f32.center.retrieve @r4:r5:r6:r7, r61, index:0x10
c0 00 00 00 00 08 6d 00 IADD_IMM.i32 r8, 0x0, #0x0
c0 00 00 00 00 09 6d 00 IADD_IMM.i32 r9, 0x0, #0x0
3d 00 54 00 00 0a 60 00 U16_TO_U32 r10, r61.h0
3d 09 00 00 30 00 b8 2a BRANCHZ.eq.reconverge r61.h0, offset:9
0a 00 20 00 00 0b 60 28 MOV.i32.reconverge r11, r10
c0 00 e0 01 00 00 a1 26 NOP.wait
01 0b 00 33 02 0e c5 03 LD_TILE.v4.f16.slot0 @r14:r15, u0.w1, r11, u0.w0
0b 00 24 00 00 0c 60 00 CLZ.u32 r12, r11
02 8c c0 10 06 0c 6d 01 RSHIFT_XOR.i32.not_result r12, u1.w0, r12^.b00, 0x0
8b c0 8c 50 00 0b 6a 05 LSHIFT_AND.i32.wait0 r11, r11^, 0x0.b00, r12^
8f 89 00 28 00 09 f4 00 FADD.v2f16 r9, r15^, r9^
8e 88 00 28 00 08 f4 00 FADD.v2f16 r8, r14^, r8^
0b f8 ff ff 07 00 b8 2a BRANCHZ.reconverge r11, offset:-8
8a 00 2c 00 00 3e 60 00 POPCOUNT.i32 r62, r10^
be 00 59 00 00 3e 60 00 U32_TO_F32 r62, r62^
be 00 81 01 00 3e 60 00 FRCP.f16 r62, r62^.h00
89 3e c0 22 44 09 64 19 FMA.v2f16.wait12 r9, r9^, r62.h00, 0x0.neg
87 83 00 00 00 03 f0 00 FADD.f32 r3, r7^, r3^
83 09 00 08 00 03 f0 20 FADD.f32.wait0126 r3, r3^, r9.h1
3c 03 ea 00 01 3c d4 37 ATEST.discard @r60, r60, r3, atest_datum.w0
86 82 00 00 00 02 f0 00 FADD.f32 r2, r6^, r2^
84 80 00 00 00 00 f0 00 FADD.f32 r0, r4^, r0^
88 be c0 22 44 3f 64 01 FMA.v2f16 r63, r8^, r62^.h00, 0x0.neg
85 81 00 00 00 01 f0 00 FADD.f32 r1, r5^, r1^
81 3f 00 08 00 01 f0 00 FADD.f32 r1, r1^, r63.h1
80 bf 00 04 00 00 f0 00 FADD.f32 r0, r0^, r63^.h0
82 89 00 04 00 02 f0 24 FADD.f32.wait r2, r2^, r9^.h0
f0 00 3c 32 84 00 1b 3f BLEND.slot0.v4.f32.end @r0:r1:r2:r3, blend_descriptor_0.w0, r60, target:0x0
c0 00 00 00 00 36 6d 00 IADD_IMM.i32 r54, 0x0, #0x0
c0 f1 0f 80 10 00 b3 06 BRANCHZI.eq.absolute.wait0 0x0, blend_descriptor_0.w1
00 00 00 1f 5a 3c 69 03 TEX_FETCH.slot0.32.2d @r0:r1:r2:r3, @r60:r61, u0
40 00 20 00 00 01 61 00 MOV.i32 r1, u32.w0
41 00 20 00 00 01 61 00 MOV.i32 r1, u32.w1
4a 00 20 00 00 01 61 00 MOV.i32 r1, u37.w0
30 00 37 0f c1 0c 24 07 ATOM_RETURN.i32.slot0.axchg.wait0 @r55, @r12, [r48:r49], offset:0x0
32 00 00 02 81 0c 2c 07 ATOM.i32.slot0.aadd.wait0 @r12, [r50:r51], offset:0x0
32 00 00 00 01 0c 28 07 ATOM1_RETURN.i32.slot0.ainc.wait0 @r12, [r50:r51], offset:0x0
32 00 00 00 01 00 28 07 ATOM1_RETURN.i32.slot0.ainc.wait0 @r0, [r50:r51], offset:0x0
02 00 00 11 da 00 d5 27 VAR_TEX_SINGLE.slot0.skip.sample_store.32.2d.zero.wait @r0:r1:r2:r3, u1, u0.w0
02 20 00 11 da 00 d5 07 VAR_TEX_SINGLE.slot0.skip.sample_store.32.2d.computed.wait0 @r0:r1:r2:r3, u1, u0.w0
02 20 00 11 c2 00 d5 23 VAR_TEX_SINGLE.slot0.skip.sample_store.32.2d.computed.wait0126 @r0, u1, u0.w0
80 c0 c0 02 06 00 e6 09 ICMP_OR.u32.ne.m1.wait1 r0, r0^, 0x0, 0x0
82 83 80 80 02 00 e8 01 ICMP_AND.s32.gt.i1 r0, r2^, r3^, r0^
82 c0 c0 03 06 00 f6 09 ICMP_MULTI.u32.ne.u1.wait1 r0, r2^, 0x0, 0x0
84 86 c0 03 02 02 f4 01 ICMP_MULTI.u32.gt.u1 r2, r4^, r6^, 0x0
85 87 82 02 02 02 f0 01 ICMP_MULTI.u32.gt.m1 r2, r5^, r7^, r2^
83 c0 80 02 06 00 f2 01 ICMP_MULTI.u32.ne.m1 r0, r3^, 0x0, r0^
80 82 c0 03 02 00 f4 01 ICMP_MULTI.u32.gt.u1 r0, r0^, r2^, 0x0
81 83 80 82 02 04 f0 01 ICMP_MULTI.s32.gt.m1 r4, r1^, r3^, r0^
80 c0 c0 6a 07 00 e6 09 FCMP_OR.v2f16.ne.m1.wait1 r0, r0^, 0x0, 0x0
81 81 80 6e 03 00 e8 01 FCMP_AND.v2f16.gt.m1 r0, r1^, r1^.h11, r0^
80 c0 c0 6a 07 00 e6 09 FCMP_OR.v2f16.ne.m1.wait1 r0, r0^, 0x0, 0x0
81 81 80 6e 03 00 e8 01 FCMP_AND.v2f16.gt.m1 r0, r1^, r1^.h11, r0^
c4 c0 80 52 70 00 6b 01 LSHIFT_AND.v4i8 r0, 0x1000000.b3333, 0x0.b00, r0^
80 81 82 80 24 00 78 01 MUX.v4i8 r0, r0^, r1^, r2^
c0 c0 00 00 02 02 8f 03 LEA_PKA.slot0 @r2:r3, 0x0, 0x0
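Each line of this new v15 case list pairs the eight expected instruction bytes with the assembly text that should encode to them; per test-assembly.py above, the hex half goes through parse_hex_8 and the text half through parse_asm before the two are compared.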

View file

@ -126,6 +126,7 @@ c0 01 00 00 00 c4 10 51 IADD_IMM.i32.reconverge r4, 0x0, #0x1
00 00 00 01 00 c1 99 68 FREXPM.f32.sqrt.discard r1, r0
01 00 02 00 00 c2 9c 00 FRSQ.f32 r2, r1
40 00 02 01 00 c0 99 00 FREXPE.f32.sqrt r0, r0^
41 42 c0 40 06 c0 60 01 FMA_RSCALE.f32.clamp_m1_1 r0, r1^, r2^, 0x0.neg, r0^
41 42 c0 40 04 c0 62 41 FMA_RSCALE_LEFT.f32.wait0126 r0, r1^, r2^, 0x0.neg, r0^
42 43 84 85 00 c1 50 01 CSEL.u32.eq r1, r2^, r3^, u2.w0, u2.w1
42 43 84 85 04 c1 50 01 CSEL.u32.lt r1, r2^, r3^, u2.w0, u2.w1
@ -213,17 +214,17 @@ c0 00 00 00 00 c9 10 01 IADD_IMM.i32 r9, 0x0, #0x0
f0 00 3c 32 08 40 7f 78 BLEND.slot0.v4.f32.end @r0:r1:r2:r3, blend_descriptor_0.w0, r60, target:0x0
c0 00 00 00 00 f6 10 01 IADD_IMM.i32 r54, 0x0, #0x0
c0 f1 00 00 10 c1 2f 08 BRANCHZI.eq.absolute.wait0 0x0, blend_descriptor_0.w1
80 00 c0 17 34 7c 25 01 TEX_FETCH.slot0.f.32.2d @r0:r1:r2:r3, @r60:r61, u0
80 00 c0 13 34 7c 25 01 TEX_FETCH.slot0.32.2d @r0:r1:r2:r3, @r60:r61, u0
80 00 00 00 00 c1 91 02 MOV.i32 r1, u32.w0
81 00 00 00 00 c1 91 02 MOV.i32 r1, u32.w1
8a 00 00 00 00 c1 91 02 MOV.i32 r1, u37.w0
30 00 f7 1b 02 cc 20 09 ATOM_RETURN.i32.slot0.axchg.wait0 @r55, @r12, [r48:r49], offset:0x0
32 00 80 18 02 4c 68 08 ATOM.i32.slot0.aadd.wait0 @r12, [r50:r51], offset:0x0
32 00 00 18 02 8c 69 08 ATOM1_RETURN.i32.slot0.ainc.wait0 @r12, [r50:r51], offset:0x0
32 00 00 18 00 80 69 08 ATOM1_RETURN.i32.slot0.ainc.wait0 @, [r50:r51], offset:0x0
82 00 80 15 b4 80 38 49 VAR_TEX_SINGLE.slot0.skip.sample_store.f.32.2d.zero.wait @r0:r1:r2:r3, u1, u0.w0
82 20 80 15 b4 80 38 09 VAR_TEX_SINGLE.slot0.skip.sample_store.f.32.2d.computed.wait0 @r0:r1:r2:r3, u1, u0.w0
82 20 80 1d 84 80 38 41 VAR_TEX_SINGLE.slot0.skip.sample_store.s.32.2d.computed.wait0126 @r0, u1, u0.w0
32 00 00 18 02 80 69 08 ATOM1_RETURN.i32.slot0.ainc.wait0 @r0, [r50:r51], offset:0x0
82 00 80 11 b4 80 38 49 VAR_TEX_SINGLE.slot0.skip.sample_store.32.2d.zero.wait @r0:r1:r2:r3, u1, u0.w0
82 20 80 11 b4 80 38 09 VAR_TEX_SINGLE.slot0.skip.sample_store.32.2d.computed.wait0 @r0:r1:r2:r3, u1, u0.w0
82 20 80 11 84 80 38 41 VAR_TEX_SINGLE.slot0.skip.sample_store.32.2d.computed.wait0126 @r0, u1, u0.w0
40 c0 c0 80 03 c0 f0 10 ICMP_OR.u32.ne.m1.wait1 r0, r0^, 0x0, 0x0
42 43 40 01 01 c0 f8 00 ICMP_AND.s32.gt.i1 r0, r2^, r3^, r0^
42 c0 c0 c2 03 c0 f0 10 ICMP_MULTI.u32.ne.u1.wait1 r0, r2^, 0x0, 0x0

View file

@ -33,8 +33,18 @@ parse_hex(const char *in)
int
main(int argc, const char **argv)
{
if (argc < 2) {
fprintf(stderr, "Expected case list\n");
if (argc < 3) {
fprintf(stderr, "Expected case list and arch version\n");
return 1;
}
if (argv[2][0] != 'v') {
fprintf(stderr, "Invalid arch version: %s\n", argv[2]);
return 1;
}
unsigned arch = atoi(&argv[2][1]);
if (arch < 9 || arch > 15) {
fprintf(stderr, "Non-supported arch version: %d\n", arch);
return 1;
}
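With this parsing in place, the disassembler test takes the case list followed by a 'v'-prefixed arch tag in the 9-15 range; an invocation might look like the following (binary and file names are illustrative, not from this change):
./test-disasm test/disasm-cases.txt v15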
@ -65,7 +75,10 @@ main(int argc, const char **argv)
uint64_t bin = parse_hex(line);
FILE *outputp = open_memstream(&output, &sz);
va_disasm_instr(outputp, bin);
if (arch < 15)
va_disasm_instr(outputp, bin);
else
va_disasm_instr_v15(outputp, bin);
fprintf(outputp, "\n");
fclose(outputp);

View file

@ -12,6 +12,7 @@
static inline void
add_imm(bi_context *ctx)
{
ctx->arch = 10;
struct hash_table_u64 *stats = _mesa_hash_table_u64_create(ctx);
bi_foreach_instr_global(ctx, I) {
va_lower_constants(ctx, I, stats, UINT32_MAX);

View file

@ -26,7 +26,9 @@ strip_discard(bi_context *ctx)
do { \
void *mem_ctx = ralloc_context(NULL); \
bi_builder *A = bit_builder(mem_ctx); \
A->shader->arch = 10; \
bi_builder *B = bit_builder(mem_ctx); \
B->shader->arch = 10; \
{ \
UNUSED bi_builder *b = A; \
test; \

View file

@ -1,5 +1,6 @@
/*
* Copyright (C) 2021 Collabora, Ltd.
* Copyright (C) 2026 Arm Ltd.
* SPDX-License-Identifier: MIT
*/
@ -9,9 +10,9 @@
#include <gtest/gtest.h>
#define CASE(instr, expected) \
#define CASE_ARCH(instr, arch, expected) \
do { \
uint64_t _value = va_pack_instr(instr, 10); \
uint64_t _value = va_pack_instr(instr, arch); \
if (_value != expected) { \
fprintf(stderr, "Got %" PRIx64 ", expected %" PRIx64 "\n", _value, \
(uint64_t)expected); \
@ -45,124 +46,153 @@ class ValhallPacking : public testing::Test {
TEST_F(ValhallPacking, Moves)
{
CASE(bi_mov_i32_to(b, bi_register(1), bi_register(2)),
0x0091c10000000002ULL);
CASE(bi_mov_i32_to(b, bi_register(1),
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 5), false)),
0x0091c1000000008aULL);
bi_instr *I = bi_mov_i32_to(b, bi_register(1), bi_register(2));
CASE_ARCH(I, 10, 0x0091c10000000002ULL);
CASE_ARCH(I, 15, 0x0060010000200002ULL);
I = bi_mov_i32_to(b, bi_register(1),
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 5), false));
CASE_ARCH(I, 10, 0x0091c1000000008aULL);
CASE_ARCH(I, 15, 0x006101000020000aULL);
}
TEST_F(ValhallPacking, Fadd)
{
CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_register(2)),
0x00a4c00000000201ULL);
CASE(
bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_abs(bi_register(2))),
0x00a4c02000000201ULL);
CASE(
bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_neg(bi_register(2))),
0x00a4c01000000201ULL);
bi_instr *I =
bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_register(2));
CASE_ARCH(I, 10, 0x00a4c00000000201ULL);
CASE_ARCH(I, 15, 0x00f0000000000201ULL);
CASE(bi_fadd_v2f16_to(b, bi_register(0),
bi_swz_16(bi_register(1), false, false),
bi_swz_16(bi_register(0), true, true)),
0x00a5c0000c000001ULL);
I =
bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_abs(bi_register(2)));
CASE_ARCH(I, 10, 0x00a4c02000000201ULL);
CASE_ARCH(I, 15, 0x00f0002000000201ULL);
CASE(bi_fadd_v2f16_to(b, bi_register(0), bi_register(1), bi_register(0)),
0x00a5c00028000001ULL);
I =
bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_neg(bi_register(2)));
CASE_ARCH(I, 10, 0x00a4c01000000201ULL);
CASE_ARCH(I, 15, 0x00f0001000000201ULL);
CASE(bi_fadd_v2f16_to(b, bi_register(0), bi_register(1),
bi_swz_16(bi_register(0), true, false)),
0x00a5c00024000001ULL);
I = bi_fadd_v2f16_to(b, bi_register(0),
bi_swz_16(bi_register(1), false, false),
bi_swz_16(bi_register(0), true, true));
CASE_ARCH(I, 10, 0x00a5c0000c000001ULL);
CASE_ARCH(I, 15, 0x00f400000c000001ULL);
CASE(bi_fadd_v2f16_to(b, bi_register(0), bi_discard(bi_abs(bi_register(0))),
bi_neg(zero)),
0x00a5c0902800c040ULL);
I = bi_fadd_v2f16_to(b, bi_register(0), bi_register(1), bi_register(0));
CASE_ARCH(I, 10, 0x00a5c00028000001ULL);
CASE_ARCH(I, 15, 0x00f4000028000001ULL);
CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1), zero),
0x00a4c0000000c001ULL);
I = bi_fadd_v2f16_to(b, bi_register(0), bi_register(1),
bi_swz_16(bi_register(0), true, false));
CASE_ARCH(I, 10, 0x00a5c00024000001ULL);
CASE_ARCH(I, 15, 0x00f4000024000001ULL);
CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_neg(zero)),
0x00a4c0100000c001ULL);
I = bi_fadd_v2f16_to(b, bi_register(0), bi_discard(bi_abs(bi_register(0))),
bi_neg(zero));
CASE_ARCH(I, 10, 0x00a5c0902800c040ULL);
CASE_ARCH(I, 15, 0x00f600902800c080ULL);
CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1),
bi_half(bi_register(0), true)),
0x00a4c00008000001ULL);
I = bi_fadd_f32_to(b, bi_register(0), bi_register(1), zero);
CASE_ARCH(I, 10, 0x00a4c0000000c001ULL);
CASE_ARCH(I, 15, 0x00f200000000c001ULL);
CASE(bi_fadd_f32_to(b, bi_register(0), bi_register(1),
bi_half(bi_register(0), false)),
0x00a4c00004000001ULL);
I = bi_fadd_f32_to(b, bi_register(0), bi_register(1), bi_neg(zero));
CASE_ARCH(I, 10, 0x00a4c0100000c001ULL);
CASE_ARCH(I, 15, 0x00f200100000c001ULL);
I = bi_fadd_f32_to(b, bi_register(0), bi_register(1),
bi_half(bi_register(0), true));
CASE_ARCH(I, 10, 0x00a4c00008000001ULL);
CASE_ARCH(I, 15, 0x00f0000008000001ULL);
I = bi_fadd_f32_to(b, bi_register(0), bi_register(1),
bi_half(bi_register(0), false));
CASE_ARCH(I, 10, 0x00a4c00004000001ULL);
CASE_ARCH(I, 15, 0x00f0000004000001ULL);
}
TEST_F(ValhallPacking, Clper)
{
CASE(bi_clper_i32_to(b, bi_register(0), bi_register(0), bi_byte(n4567, 0),
BI_INACTIVE_RESULT_F1, BI_LANE_OP_NONE,
BI_SUBGROUP_SUBGROUP16),
0x00a0c030128fc900);
bi_instr *I = bi_clper_i32_to(b, bi_register(0), bi_register(0),
bi_byte(n4567, 0), BI_INACTIVE_RESULT_F1,
BI_LANE_OP_NONE, BI_SUBGROUP_SUBGROUP16);
CASE_ARCH(I, 10, 0x00a0c030128fc900);
CASE_ARCH(I, 15, 0x00e20030028fc900);
}
TEST_F(ValhallPacking, Clamps)
{
bi_instr *I = bi_fadd_f32_to(b, bi_register(0), bi_register(1),
bi_neg(bi_abs(bi_register(2))));
CASE(I, 0x00a4c03000000201ULL);
CASE_ARCH(I, 10, 0x00a4c03000000201ULL);
CASE_ARCH(I, 15, 0x00f0003000000201ULL);
I->clamp = BI_CLAMP_CLAMP_M1_1;
CASE(I, 0x00a4c03200000201ULL);
CASE_ARCH(I, 10, 0x00a4c03200000201ULL);
CASE_ARCH(I, 15, 0x00f0003080000201ULL);
}
TEST_F(ValhallPacking, Misc)
{
CASE(bi_fma_f32_to(b, bi_register(1), bi_discard(bi_register(1)),
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 4), false),
bi_neg(zero)),
0x00b2c10400c08841ULL);
bi_instr *I = bi_fma_f32_to(
b, bi_register(1), bi_discard(bi_register(1)),
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 4), false), bi_neg(zero));
CASE_ARCH(I, 10, 0x00b2c10400c08841ULL);
CASE_ARCH(I, 15, 0x0166010400c00881ULL);
CASE(bi_fround_f32_to(b, bi_register(2), bi_discard(bi_neg(bi_register(2))),
BI_ROUND_RTN),
0x0090c240800d0042ULL);
I = bi_fround_f32_to(b, bi_register(2), bi_discard(bi_neg(bi_register(2))),
BI_ROUND_RTN);
CASE_ARCH(I, 10, 0x0090c240800d0042ULL);
CASE_ARCH(I, 15, 0x00600242004d0082ULL);
CASE(bi_fround_v2f16_to(b, bi_half(bi_register(0), false), bi_register(0),
BI_ROUND_RTN),
0x00904000a00f0000ULL);
I = bi_fround_v2f16_to(b, bi_half(bi_register(0), false), bi_register(0),
BI_ROUND_RTN);
CASE_ARCH(I, 10, 0x00904000a00f0000ULL);
/* Removed on v11 */
CASE(
bi_fround_v2f16_to(b, bi_half(bi_register(0), false),
bi_swz_16(bi_register(1), true, false), BI_ROUND_RTN),
0x00904000900f0001ULL);
I = bi_fround_v2f16_to(b, bi_half(bi_register(0), false),
bi_swz_16(bi_register(1), true, false), BI_ROUND_RTN);
CASE_ARCH(I, 10, 0x00904000900f0001ULL);
/* Removed on v11 */
}
TEST_F(ValhallPacking, FaddImm)
{
CASE(bi_fadd_imm_f32_to(b, bi_register(2), bi_discard(bi_register(2)),
0x4847C6C0),
0x0114C24847C6C042ULL);
bi_instr *I = bi_fadd_imm_f32_to(b, bi_register(2),
bi_discard(bi_register(2)), 0x4847C6C0);
CASE_ARCH(I, 10, 0x0114C24847C6C042ULL);
CASE_ARCH(I, 15, 0x0064024847c6c082ULL);
CASE(bi_fadd_imm_v2f16_to(b, bi_register(2), bi_discard(bi_register(2)),
0x70AC6784),
0x0115C270AC678442ULL);
I = bi_fadd_imm_v2f16_to(b, bi_register(2), bi_discard(bi_register(2)),
0x70AC6784);
CASE_ARCH(I, 10, 0x0115C270AC678442ULL);
CASE_ARCH(I, 15, 0x00620270ac678482ULL);
}
TEST_F(ValhallPacking, Comparisons)
{
CASE(bi_icmp_or_v2s16_to(b, bi_register(2),
bi_discard(bi_swz_16(bi_register(3), true, false)),
bi_discard(bi_swz_16(bi_register(2), true, false)),
zero, BI_CMPF_GT, BI_RESULT_TYPE_M1),
0x00f9c21184c04243);
bi_instr *I = bi_icmp_or_v2s16_to(
b, bi_register(2), bi_discard(bi_swz_16(bi_register(3), true, false)),
bi_discard(bi_swz_16(bi_register(2), true, false)), zero, BI_CMPF_GT,
BI_RESULT_TYPE_M1);
CASE_ARCH(I, 10, 0x00f9c21184c04243);
CASE_ARCH(I, 15, 0x01e40212c6c08283);
CASE(bi_fcmp_or_v2f16_to(b, bi_register(2),
bi_discard(bi_swz_16(bi_register(3), true, false)),
bi_discard(bi_swz_16(bi_register(2), false, false)),
zero, BI_CMPF_GT, BI_RESULT_TYPE_M1),
0x00f5c20190c04243);
I = bi_fcmp_or_v2f16_to(b, bi_register(2),
bi_discard(bi_swz_16(bi_register(3), true, false)),
bi_discard(bi_swz_16(bi_register(2), false, false)),
zero, BI_CMPF_GT, BI_RESULT_TYPE_M1);
CASE_ARCH(I, 10, 0x00f5c20190c04243);
CASE_ARCH(I, 15, 0x01e4020352c08283);
}
TEST_F(ValhallPacking, Conversions)
{
CASE(bi_v2s16_to_v2f16_to(b, bi_register(2), bi_discard(bi_register(2))),
0x0090c22000070042);
bi_instr *I =
bi_v2s16_to_v2f16_to(b, bi_register(2), bi_discard(bi_register(2)));
CASE_ARCH(I, 10, 0x0090c22000070042);
/* Removed on v11 */
}
TEST_F(ValhallPacking, BranchzI16)
@ -170,88 +200,105 @@ TEST_F(ValhallPacking, BranchzI16)
bi_instr *I =
bi_branchz_i16(b, bi_half(bi_register(2), false), bi_null(), BI_CMPF_EQ);
I->branch_offset = 1;
CASE(I, 0x001fc03000000102);
CASE_ARCH(I, 10, 0x001fc03000000102);
CASE_ARCH(I, 15, 0x02b8003000000102);
}
TEST_F(ValhallPacking, BranchzI16Backwards)
{
bi_instr *I = bi_branchz_i16(b, zero, bi_null(), BI_CMPF_EQ);
I->branch_offset = -8;
CASE(I, 0x001fc017fffff8c0);
CASE_ARCH(I, 10, 0x001fc017fffff8c0);
CASE_ARCH(I, 15, 0x02b90017fffff8c0);
}
TEST_F(ValhallPacking, Blend)
{
CASE(
bi_instr *I =
bi_blend_to(b, bi_null(), bi_register(0), bi_register(60),
bi_fau(BIR_FAU_BLEND_0, false), bi_fau(BIR_FAU_BLEND_0, true),
bi_null(), BI_REGISTER_FORMAT_F16, 2, 0),
0x007f4004333c00f0);
bi_null(), BI_REGISTER_FORMAT_F16, 2, 0);
CASE_ARCH(I, 10, 0x007f4004333c00f0);
CASE_ARCH(I, 15, 0x031b0082333c00f0);
}
TEST_F(ValhallPacking, Mux)
{
CASE(bi_mux_i32_to(b, bi_register(0), bi_discard(bi_register(0)),
bi_discard(bi_register(4)),
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 0), false),
BI_MUX_BIT),
0x00b8c00300804440ull);
bi_instr *I = bi_mux_i32_to(
b, bi_register(0), bi_discard(bi_register(0)), bi_discard(bi_register(4)),
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 0), false), BI_MUX_BIT);
CASE_ARCH(I, 10, 0x00b8c00300804440ull);
CASE_ARCH(I, 15, 0x017c000c80008480ull);
}
TEST_F(ValhallPacking, AtestFP16)
{
CASE(bi_atest_to(b, bi_register(60), bi_register(60),
bi_half(bi_register(1), true),
bi_fau(BIR_FAU_ATEST_PARAM, false)),
0x007dbc0208ea013c);
bi_instr *I = bi_atest_to(b, bi_register(60), bi_register(60),
bi_half(bi_register(1), true),
bi_fau(BIR_FAU_ATEST_PARAM, false));
CASE_ARCH(I, 10, 0x007dbc0208ea013c);
CASE_ARCH(I, 15, 0x03d43c0108ea013c);
}
TEST_F(ValhallPacking, AtestFP32)
{
CASE(bi_atest_to(b, bi_register(60), bi_register(60), one,
bi_fau(BIR_FAU_ATEST_PARAM, false)),
0x007dbc0200ead03c);
bi_instr *I = bi_atest_to(b, bi_register(60), bi_register(60), one,
bi_fau(BIR_FAU_ATEST_PARAM, false));
CASE_ARCH(I, 10, 0x007dbc0200ead03c);
CASE_ARCH(I, 15, 0x03d63c0100ead03c);
}
TEST_F(ValhallPacking, Transcendentals)
{
CASE(bi_frexpm_f32_to(b, bi_register(1), bi_register(0), false, true),
0x0099c10001000000);
bi_instr *I =
bi_frexpm_f32_to(b, bi_register(1), bi_register(0), false, true);
CASE_ARCH(I, 10, 0x0099c10001000000);
CASE_ARCH(I, 15, 0x0060010041200000);
CASE(bi_frexpe_f32_to(b, bi_register(0), bi_discard(bi_register(0)), false,
true),
0x0099c00001020040);
I = bi_frexpe_f32_to(b, bi_register(0), bi_discard(bi_register(0)), false,
true);
CASE_ARCH(I, 10, 0x0099c00001020040);
CASE_ARCH(I, 15, 0x0060000041220080);
CASE(bi_frsq_f32_to(b, bi_register(2), bi_register(1)), 0x009cc20000020001);
I = bi_frsq_f32_to(b, bi_register(2), bi_register(1));
CASE_ARCH(I, 10, 0x009cc20000020001);
CASE_ARCH(I, 15, 0x0060020001820001);
CASE(bi_fma_rscale_f32_to(b, bi_register(0), bi_discard(bi_register(1)),
bi_discard(bi_register(2)), bi_neg(zero),
bi_discard(bi_register(0)), BI_SPECIAL_LEFT),
0x0162c00440c04241);
I = bi_fma_rscale_f32_to(b, bi_register(0), bi_discard(bi_register(1)),
bi_discard(bi_register(2)), bi_neg(zero),
bi_discard(bi_register(0)), BI_SPECIAL_LEFT);
CASE_ARCH(I, 10, 0x0162c00440c04241);
CASE_ARCH(I, 15, 0x0264000e80c08281);
I = bi_fma_rscale_f32_to(b, bi_register(0), bi_register(1), bi_register(2),
bi_neg(zero), bi_discard(bi_register(0)),
BI_SPECIAL_N);
CASE_ARCH(I, 10, 0x0161c00440c00201);
CASE_ARCH(I, 15, 0x0264000d80c00201);
}
TEST_F(ValhallPacking, Csel)
{
CASE(bi_csel_u32_to(b, bi_register(1), bi_discard(bi_register(2)),
bi_discard(bi_register(3)),
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), false),
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), true),
BI_CMPF_EQ),
0x0150c10085844342);
bi_instr *I = bi_csel_u32_to(
b, bi_register(1), bi_discard(bi_register(2)), bi_discard(bi_register(3)),
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), false),
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), true), BI_CMPF_EQ);
CASE_ARCH(I, 10, 0x0150c10085844342);
CASE_ARCH(I, 15, 0x027c010005048382);
CASE(bi_csel_u32_to(b, bi_register(1), bi_discard(bi_register(2)),
bi_discard(bi_register(3)),
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), false),
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), true),
BI_CMPF_LT),
0x0150c10485844342);
I = bi_csel_u32_to(
b, bi_register(1), bi_discard(bi_register(2)), bi_discard(bi_register(3)),
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), false),
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), true), BI_CMPF_LT);
CASE_ARCH(I, 10, 0x0150c10485844342);
CASE_ARCH(I, 15, 0x027c010805048382);
CASE(bi_csel_s32_to(b, bi_register(1), bi_discard(bi_register(2)),
bi_discard(bi_register(3)),
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), false),
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), true),
BI_CMPF_LT),
0x0158c10485844342);
I = bi_csel_s32_to(
b, bi_register(1), bi_discard(bi_register(2)), bi_discard(bi_register(3)),
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), false),
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), true), BI_CMPF_LT);
CASE_ARCH(I, 10, 0x0158c10485844342);
CASE_ARCH(I, 15, 0x027c014805048382);
}
TEST_F(ValhallPacking, LdAttrImm)
@ -261,34 +308,67 @@ TEST_F(ValhallPacking, LdAttrImm)
bi_discard(bi_register(61)), BI_REGISTER_FORMAT_F16, BI_VECSIZE_V4, 1);
I->table = 1;
CASE(I, 0x0066800433117d7c);
CASE_ARCH(I, 10, 0x0066800433117d7c);
CASE_ARCH(I, 15, 0x038400023311bdbc);
}
TEST_F(ValhallPacking, LdVarBufImmF16)
{
CASE(bi_ld_var_buf_imm_f16_to(b, bi_register(2), bi_register(61),
BI_REGISTER_FORMAT_F16, BI_SAMPLE_CENTER,
BI_SOURCE_FORMAT_F16, BI_UPDATE_RETRIEVE,
BI_VECSIZE_V4, 0),
0x005d82143300003d);
bi_instr *I = bi_ld_var_buf_imm_f16_to(
b, bi_register(2), bi_register(61), BI_REGISTER_FORMAT_F16,
BI_SAMPLE_CENTER, BI_SOURCE_FORMAT_F16, BI_UPDATE_RETRIEVE, BI_VECSIZE_V4,
0);
CASE_ARCH(I, 10, 0x005d82143300003d);
CASE_ARCH(I, 15, 0x0310020a3f00003d);
CASE(bi_ld_var_buf_imm_f16_to(b, bi_register(0), bi_register(61),
BI_REGISTER_FORMAT_F16, BI_SAMPLE_SAMPLE,
BI_SOURCE_FORMAT_F16, BI_UPDATE_STORE,
BI_VECSIZE_V4, 0),
0x005d80843300003d);
I = bi_ld_var_buf_imm_f16_to(b, bi_register(0), bi_register(61),
BI_REGISTER_FORMAT_F16, BI_SAMPLE_SAMPLE,
BI_SOURCE_FORMAT_F16, BI_UPDATE_STORE,
BI_VECSIZE_V4, 0);
CASE_ARCH(I, 10, 0x005d80843300003d);
CASE_ARCH(I, 15, 0x031000423f00003d);
CASE(bi_ld_var_buf_imm_f16_to(b, bi_register(0), bi_register(61),
BI_REGISTER_FORMAT_F16, BI_SAMPLE_CENTROID,
BI_SOURCE_FORMAT_F16, BI_UPDATE_STORE,
BI_VECSIZE_V4, 8),
0x005d80443308003d);
I = bi_ld_var_buf_imm_f16_to(b, bi_register(0), bi_register(61),
BI_REGISTER_FORMAT_F16, BI_SAMPLE_CENTROID,
BI_SOURCE_FORMAT_F16, BI_UPDATE_STORE,
BI_VECSIZE_V4, 8);
CASE_ARCH(I, 10, 0x005d80443308003d);
CASE_ARCH(I, 11, 0x005d80443300083d);
CASE_ARCH(I, 15, 0x031000223f00083d);
}
TEST_F(ValhallPacking, LdVarBufFlatImmFormat)
{
bi_instr *I = bi_ld_var_buf_flat_imm_to(
b, bi_register(0), BI_REGISTER_FORMAT_F32, BI_VECSIZE_V4, 0x12);
CASE_ARCH(I, 14, 0x0040800832001200);
CASE_ARCH(I, 15, 0x033900043a0012c0);
I = bi_ld_var_buf_flat_imm_to(b, bi_register(0), BI_REGISTER_FORMAT_F16,
BI_VECSIZE_V4, 0x12);
CASE_ARCH(I, 14, 0x0040800433001200);
CASE_ARCH(I, 15, 0x033900023b0012c0);
}
TEST_F(ValhallPacking, LdVarBufFlat)
{
bi_instr *I = bi_ld_var_buf_flat_to(b, bi_register(0), bi_register(61),
BI_REGISTER_FORMAT_F32, BI_VECSIZE_V4);
CASE_ARCH(I, 14, 0x005f80083200003d);
CASE_ARCH(I, 15, 0x031400043a00003d);
I = bi_ld_var_buf_flat_to(b, bi_register(0), bi_register(61),
BI_REGISTER_FORMAT_F16, BI_VECSIZE_V4);
CASE_ARCH(I, 14, 0x005f80043300003d);
CASE_ARCH(I, 15, 0x031400023b00003d);
}
TEST_F(ValhallPacking, LeaBufImm)
{
CASE(bi_lea_buf_imm_to(b, bi_register(4), bi_discard(bi_register(59))),
0x005e84040000007b);
bi_instr *I =
bi_lea_buf_imm_to(b, bi_register(4), bi_discard(bi_register(59)));
CASE_ARCH(I, 10, 0x005e84040000007b);
CASE_ARCH(I, 15, 0x03080402000000bb);
}
TEST_F(ValhallPacking, StoreMemoryAccess)
@ -296,61 +376,94 @@ TEST_F(ValhallPacking, StoreMemoryAccess)
bi_instr *I = bi_store_i96(b, bi_register(0), bi_discard(bi_register(4)),
bi_discard(bi_register(5)), BI_SEG_NONE, 0);
I->mem_access = VA_MEMORY_ACCESS_ESTREAM;
CASE(I, 0x0061400632000044);
CASE_ARCH(I, 10, 0x0061400632000044);
CASE_ARCH(I, 15, 0x0320009302000084);
}
TEST_F(ValhallPacking, Convert16To32)
{
CASE(bi_u16_to_u32_to(b, bi_register(2),
bi_discard(bi_half(bi_register(55), false))),
0x0090c20000140077);
bi_instr *I = bi_u16_to_u32_to(b, bi_register(2),
bi_discard(bi_half(bi_register(55), false)));
CASE_ARCH(I, 10, 0x0090c20000140077);
CASE_ARCH(I, 15, 0x00600200005400b7);
CASE(bi_u16_to_u32_to(b, bi_register(2),
bi_discard(bi_half(bi_register(55), true))),
0x0090c20010140077);
I = bi_u16_to_u32_to(b, bi_register(2),
bi_discard(bi_half(bi_register(55), true)));
CASE_ARCH(I, 10, 0x0090c20010140077);
CASE_ARCH(I, 15, 0x00600200105400b7);
CASE(bi_u16_to_f32_to(b, bi_register(2),
bi_discard(bi_half(bi_register(55), false))),
0x0090c20000150077);
I = bi_u16_to_f32_to(b, bi_register(2),
bi_discard(bi_half(bi_register(55), false)));
CASE_ARCH(I, 10, 0x0090c20000150077);
/* Removed on v11 */
CASE(bi_u16_to_f32_to(b, bi_register(2),
bi_discard(bi_half(bi_register(55), true))),
0x0090c20010150077);
I = bi_u16_to_f32_to(b, bi_register(2),
bi_discard(bi_half(bi_register(55), true)));
CASE_ARCH(I, 10, 0x0090c20010150077);
/* Removed on v11 */
CASE(bi_s16_to_s32_to(b, bi_register(2),
bi_discard(bi_half(bi_register(55), false))),
0x0090c20000040077);
I = bi_s16_to_s32_to(b, bi_register(2),
bi_discard(bi_half(bi_register(55), false)));
CASE_ARCH(I, 10, 0x0090c20000040077);
CASE_ARCH(I, 15, 0x00600200004400b7);
CASE(bi_s16_to_s32_to(b, bi_register(2),
bi_discard(bi_half(bi_register(55), true))),
0x0090c20010040077);
I = bi_s16_to_s32_to(b, bi_register(2),
bi_discard(bi_half(bi_register(55), true)));
CASE_ARCH(I, 10, 0x0090c20010040077);
CASE_ARCH(I, 15, 0x00600200104400b7);
}
TEST_F(ValhallPacking, Swizzle8)
{
CASE(bi_icmp_or_v4u8_to(b, bi_register(1), bi_byte(bi_register(0), 0), zero,
zero, BI_CMPF_NE, BI_RESULT_TYPE_I1),
0x00f2c14300c0c000);
bi_instr *I =
bi_icmp_or_v4u8_to(b, bi_register(1), bi_byte(bi_register(0), 0), zero,
zero, BI_CMPF_NE, BI_RESULT_TYPE_I1);
CASE_ARCH(I, 10, 0x00f2c14300c0c000);
/* Removed on v11 */
}
TEST_F(ValhallPacking, FauPage1)
{
CASE(bi_mov_i32_to(b, bi_register(1),
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 32), false)),
0x0291c10000000080ULL);
bi_instr *I = bi_mov_i32_to(
b, bi_register(1), bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 32), false));
CASE_ARCH(I, 10, 0x0291c10000000080ULL);
CASE_ARCH(I, 15, 0x0061010000200040ULL);
}
TEST_F(ValhallPacking, LdTileV3F16)
{
CASE(bi_ld_tile_to(b, bi_register(4), bi_discard(bi_register(0)),
bi_register(60), bi_register(3), BI_REGISTER_FORMAT_F16,
BI_VECSIZE_V3),
0x0078840423033c40);
bi_instr *I = bi_ld_tile_to(b, bi_register(4), bi_discard(bi_register(0)),
bi_register(60), bi_register(3),
BI_REGISTER_FORMAT_F16, BI_VECSIZE_V3);
CASE_ARCH(I, 10, 0x0078840423033c40);
CASE_ARCH(I, 15, 0x03c0040223033c80);
}
TEST_F(ValhallPacking, Rhadd8)
{
CASE(bi_hadd_v4s8_to(b, bi_register(0), bi_discard(bi_register(1)),
bi_discard(bi_register(0)), BI_ROUND_RTP),
0x00aac000400b4041);
bi_instr *I = bi_hadd_v4s8_to(b, bi_register(0), bi_discard(bi_register(1)),
bi_discard(bi_register(0)), BI_ROUND_RTP);
CASE_ARCH(I, 10, 0x00aac000400b4041);
/* Removed on v11 */
}
TEST_F(ValhallPacking, Atomics)
{
bi_instr *I =
bi_atom1_return_i64_to(b, bi_register(0), bi_discard(bi_register(2)),
bi_register(3), BI_ATOM_OPC_AINC, 2);
CASE_ARCH(I, 10, 0x0069800428000042);
CASE_ARCH(I, 15, 0x0328000220000082);
I = bi_atom_return_i32_to(b, bi_register(0), bi_discard(bi_register(1)),
bi_register(2), bi_register(3), BI_ATOM_OPC_AXCHG,
1);
CASE_ARCH(I, 10, 0x0120c1021bc00002);
CASE_ARCH(I, 15, 0x032401c10f000002);
I = bi_atom_return_i64_to(b, bi_register(0), bi_register(2), bi_register(6),
bi_register(7), BI_ATOM_OPC_ACMPXCHG, 2);
CASE_ARCH(I, 10, 0x0120c2182fc00006);
CASE_ARCH(I, 15, 0x032802cc2f000006);
}

View file

@ -9,9 +9,9 @@
#include <gtest/gtest.h>
#define CASE(instr, expected) \
#define CASE_ARCH(instr, arch, expected) \
do { \
if (va_validate_fau(instr) != expected) { \
if (va_validate_fau(instr, arch) != expected) { \
fprintf(stderr, "Incorrect validation for:\n"); \
bi_print_instr(instr, stderr); \
fprintf(stderr, "\n"); \
@ -19,8 +19,8 @@
} \
} while (0)
#define VALID(instr) CASE(instr, true)
#define INVALID(instr) CASE(instr, false)
#define VALID(instr) CASE_ARCH(instr, 10, true)
#define INVALID(instr) CASE_ARCH(instr, 10, false)
class ValidateFau : public testing::Test {
protected:

View file

@ -13,9 +13,9 @@
extern "C" {
#endif
bool va_validate_fau(bi_instr *I);
bool va_validate_fau(bi_instr *I, unsigned arch);
void va_validate(FILE *fp, bi_context *ctx);
void va_repair_fau(bi_builder *b, bi_instr *I);
void va_repair_fau(bi_builder *b, bi_instr *I, unsigned arch);
void va_fuse_add_imm(bi_instr *I);
void va_lower_constants(bi_context *ctx, bi_instr *I, struct hash_table_u64 *counts, uint32_t min_fau_count);
void va_count_constants(bi_context *ctx, bi_instr *I, struct hash_table_u64 *counts);
@ -28,14 +28,15 @@ void va_gather_hsr_info(bi_context *ctx, struct pan_shader_info *info);
uint64_t va_pack_instr(const bi_instr *I, unsigned arch);
static inline unsigned
va_fau_page(enum bir_fau value)
va_fau_page(enum bir_fau value, unsigned arch)
{
/* Uniform slots of FAU have a 7-bit index (8-bit from v15). The top 2 bits
* are the page; the remaining low bits are specified in the source.
*/
if (value & BIR_FAU_UNIFORM) {
unsigned value_shift = arch >= 15 ? 6 : 5;
unsigned slot = value & ~BIR_FAU_UNIFORM;
unsigned page = slot >> 5;
unsigned page = slot >> value_shift;
assert(page <= 3);
return page;
@ -57,11 +58,11 @@ va_fau_page(enum bir_fau value)
}
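As a worked example of the widened index (slot value hypothetical, not from this change), the same uniform slot can land on a different page across generations:
/* Page selection per va_fau_page above. */
unsigned slot = 40;            /* hypothetical uniform slot */
unsigned page_v10 = slot >> 5; /* == 1: v10 keeps a 5-bit index in the source */
unsigned page_v15 = slot >> 6; /* == 0: v15 widens the source index to 6 bits */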
static inline unsigned
va_select_fau_page(const bi_instr *I)
va_select_fau_page(const bi_instr *I, unsigned arch)
{
bi_foreach_src(I, s) {
if (I->src[s].type == BI_INDEX_FAU)
return va_fau_page((enum bir_fau)I->src[s].value);
return va_fau_page((enum bir_fau)I->src[s].value, arch);
}
return 0;
@ -77,7 +78,7 @@ struct va_stats {
unsigned nr_fau_uniforms;
};
void va_count_instr_stats(bi_instr *I, struct va_stats *stats);
void va_count_instr_stats(bi_instr *I, unsigned arch, struct va_stats *stats);
#ifdef __cplusplus
} /* extern C */

View file

@ -77,6 +77,8 @@ walk_bir_shader(bi_context *ctx, struct pan_shader_info *info)
if (instr->sample == BI_SAMPLE_CENTROID)
info->fs.hsr.centroid_interpolation = true;
FALLTHROUGH;
case BI_OPCODE_LD_VAR_BUF_FLAT:
case BI_OPCODE_LD_VAR_BUF_FLAT_IMM:
case BI_OPCODE_LD_VAR_FLAT:
case BI_OPCODE_LD_VAR_FLAT_IMM:
if (!found_atest)

View file

@ -520,7 +520,7 @@ va_assign_slots(bi_context *ctx)
bi_foreach_instr_global(ctx, I) {
if (I->op == BI_OPCODE_BARRIER) {
I->slot = 7;
I->slot = (ctx->arch >= 15) ? VA_SLOT_V15_SLOT7 : VA_SLOT_SLOT7;
} else if (I->op == BI_OPCODE_ZS_EMIT || I->op == BI_OPCODE_ATEST) {
I->slot = 0;
} else if (bi_get_opcode_props(I)->message) {

View file

@ -211,7 +211,7 @@ va_resolve_constant(bi_builder *b, uint32_t value, struct va_src_info info,
static uint32_t
va_resolve_swizzles(bi_context *ctx, bi_instr *I, unsigned s)
{
struct va_src_info info = va_src_info(I->op, s);
struct va_src_info info = va_src_info(I->op, s, ctx->arch);
uint32_t value = I->src[s].value;
enum bi_swizzle swz = I->src[s].swizzle;
@ -257,9 +257,10 @@ va_lower_constants(bi_context *ctx, bi_instr *I, struct hash_table_u64 *counts,
/* abs(#c) is pointless, but -#c occurs in transcendental sequences */
assert(!I->src[s].abs && "redundant .abs modifier");
bool is_signed = valhall_opcodes[I->op].is_signed;
bool staging = (s < valhall_opcodes[I->op].nr_staging_srcs);
struct va_src_info info = va_src_info(I->op, s);
bool is_signed = get_valhall_opcode(I->op, ctx->arch).is_signed;
bool staging =
(s < get_valhall_opcode(I->op, ctx->arch).nr_staging_srcs);
struct va_src_info info = va_src_info(I->op, s, ctx->arch);
const uint32_t value = va_resolve_swizzles(ctx, I, s);
const uint32_t count = (uintptr_t)_mesa_hash_table_u64_search(counts, value);
@ -294,12 +295,13 @@ va_count_constants(bi_context *ctx, bi_instr *I, struct hash_table_u64 *counts)
if (I->src[s].type != BI_INDEX_CONSTANT)
continue;
const bool staging = (s < valhall_opcodes[I->op].nr_staging_srcs);
const bool staging =
(s < get_valhall_opcode(I->op, ctx->arch).nr_staging_srcs);
if (staging)
continue;
bool is_signed = valhall_opcodes[I->op].is_signed;
struct va_src_info info = va_src_info(I->op, s);
bool is_signed = get_valhall_opcode(I->op, ctx->arch).is_signed;
struct va_src_info info = va_src_info(I->op, s, ctx->arch);
uint32_t value = va_resolve_swizzles(ctx, I, s);
bi_index cons = va_lookup_constant(value, info, is_signed);

View file

@ -78,7 +78,7 @@ va_lower_split_64bit(bi_context *ctx)
if (bi_is_null(I->src[s]) || s >= 4)
continue;
struct va_src_info info = va_src_info(I->op, s);
struct va_src_info info = va_src_info(I->op, s, ctx->arch);
/* Only split if the instruction expects 64-bit inputs as two separate
* sources. */

View file

@ -179,7 +179,7 @@ va_mark_last(bi_context *ctx)
break;
/* Only need to unmark split registers. */
if (va_src_info(I->op, s).size == VA_SIZE_64 &&
if (va_src_info(I->op, s, ctx->arch).size == VA_SIZE_64 &&
bi_count_read_registers(I, s) == 1) {
bool both_discard = I->src[s].discard && I->src[s + 1].discard;

View file

@ -286,7 +286,7 @@ va_fuse_cmp(bi_context *ctx, bi_instr **lut, const BITSET_WORD *multiple,
static bool
va_propagate_replicate_wide(bi_context *ctx, bi_instr **lut, bi_instr *I)
{
struct va_opcode_info info = valhall_opcodes[I->op];
struct va_opcode_info info = get_valhall_opcode(I->op, ctx->arch);
bool progress = false;
bi_foreach_ssa_src(I, s) {

View file

@ -74,6 +74,15 @@ va_pack_reg(const bi_instr *I, bi_index idx)
return idx.value;
}
static unsigned
va_pack_reg_v15(const bi_instr *I, bi_index idx)
{
pack_assert(I, idx.type == BI_INDEX_REGISTER);
pack_assert(I, idx.value < 128);
return idx.value;
}
static unsigned
va_pack_fau_special(const bi_instr *I, enum bir_fau fau)
{
@ -124,6 +133,21 @@ va_pack_fau_64(const bi_instr *I, bi_index idx)
return (0x7 << 5) | (va_pack_fau_special(I, idx.value) << 1);
}
static unsigned
va_pack_fau_64_v15(const bi_instr *I, bi_index idx)
{
pack_assert(I, idx.type == BI_INDEX_FAU);
unsigned val = (idx.value & BITFIELD_MASK(6));
if (idx.value & BIR_FAU_IMMEDIATE)
return (0x7 << 6) | (val << 1);
else if (idx.value & BIR_FAU_UNIFORM)
return (0x2 << 7) | (val << 1);
else
return (0xf << 5) | (va_pack_fau_special(I, idx.value) << 1);
}
static unsigned
va_pack_src(const bi_instr *I, unsigned s)
{
@ -142,6 +166,33 @@ va_pack_src(const bi_instr *I, unsigned s)
invalid_instruction(I, "type of source %u", s);
}
static uint64_t
va_pack_src_v15(const bi_instr *I, unsigned s, unsigned loc)
{
bi_index idx = I->src[s];
uint64_t hex = 0;
uint64_t regval = 0;
if (idx.type == BI_INDEX_REGISTER) {
regval = va_pack_reg_v15(I, idx);
if (idx.discard)
regval |= (1 << 7);
} else if (idx.type == BI_INDEX_FAU) {
pack_assert(I, idx.offset <= 1);
regval = va_pack_fau_64_v15(I, idx) | idx.offset;
} else
invalid_instruction(I, "type of source %u", s);
uint64_t low8 = regval & 0xff;
uint64_t high1 = (regval >> 8) & 0x1;
hex |= (low8 << (8 * loc));
hex |= (high1 << (48 + loc));
return hex;
}
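As a minimal sketch of the layout this produces (operand chosen for illustration): a v15 source is a 9-bit value whose low byte lands at bits [8*loc+7:8*loc] and whose ninth bit lands at bit 48+loc. For uniform u0.w0 in source slot loc=1, following va_pack_fau_64_v15 above:
uint64_t regval = (0x2 << 7) | (0 << 1) | 0; /* uniform tag, slot 0, word 0 -> 0x100 */
uint64_t hex = ((regval & 0xff) << (8 * 1)) /* low byte -> bits [15:8], all zero here */
| (((regval >> 8) & 1) << (48 + 1)); /* ninth bit -> bit 49 */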
static unsigned
va_pack_wrmask(const bi_instr *I)
{
@ -211,6 +262,20 @@ va_pack_dest(const bi_instr *I)
return va_pack_reg(I, I->dest[0]) | (va_pack_wrmask(I) << 6);
}
static unsigned
va_pack_dest_v15(const bi_instr *I)
{
assert(I->nr_dests);
switch (I->op) {
case BI_OPCODE_SHADDX_S64:
case BI_OPCODE_SHADDX_U64:
/* 64-bit dest has a 0x0 wrmask */
return va_pack_reg_v15(I, I->dest[0]);
default:
return va_pack_reg_v15(I, I->dest[0]) | (va_pack_wrmask(I) << 13);
}
}
static enum va_widen
va_pack_widen_f32(const bi_instr *I, enum bi_swizzle swz)
{
@ -454,10 +519,22 @@ va_pack_rhadd(const bi_instr *I)
}
}
static uint64_t
va_pack_clamp_special_round_v15(const bi_instr *I)
{
pack_assert(I, I->special < 4);
if (I->special == BI_SPECIAL_N && I->round == BI_ROUND_RTZ)
return 0x4;
else if (I->special)
return 0x4 | I->special;
else
return I->clamp;
}
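Reading these branches together (BI_SPECIAL_N must be the zero "no special" value for the first test to be distinguishable): the packed field carries the plain clamp when no special mode is set, 0x4 for a bare round-to-zero, and 0x4 | special for the FMA_RSCALE special modes.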
static uint64_t
va_pack_alu(const bi_instr *I, unsigned arch)
{
struct va_opcode_info info = valhall_opcodes[I->op];
struct va_opcode_info info = get_valhall_opcode(I->op, arch);
uint64_t hex = 0;
switch (I->op) {
@ -467,25 +544,25 @@ va_pack_alu(const bi_instr *I, unsigned arch)
case BI_OPCODE_FREXPM_F32:
case BI_OPCODE_FREXPM_V2F16:
if (I->sqrt)
hex |= 1ull << 24;
hex |= 1ull << ((arch >= 15) ? 30 : 24);
if (I->log)
hex |= 1ull << 25;
hex |= 1ull << ((arch >= 15) ? 31 : 25);
break;
case BI_OPCODE_FLUSH_F32:
case BI_OPCODE_FLUSH_V2F16:
hex |= I->nan_mode << 8;
hex |= I->nan_mode << ((arch >= 15) ? 30 : 8);
if (I->ftz)
hex |= 1ull << 10;
hex |= 1ull << ((arch >= 15) ? 32 : 10);
if (I->flush_inf)
hex |= 1ull << 11;
hex |= 1ull << ((arch >= 15) ? 33 : 11);
break;
/* Add mux type */
case BI_OPCODE_MUX_I32:
case BI_OPCODE_MUX_V2I16:
case BI_OPCODE_MUX_V4I8:
hex |= (uint64_t)I->mux << 32;
hex |= (uint64_t)I->mux << ((arch >= 15) ? 34 : 32);
break;
/* Add .eq flag */
@ -497,7 +574,7 @@ va_pack_alu(const bi_instr *I, unsigned arch)
hex |= (1ull << 36);
if (I->op == BI_OPCODE_BRANCHZI)
hex |= (0x1ull << 40); /* Absolute */
hex |= (0x1ull << ((arch >= 15) ? 31 : 40)); /* Absolute */
else
hex |= ((uint64_t)I->branch_offset & BITFIELD_MASK(27)) << 8;
@ -513,7 +590,46 @@ va_pack_alu(const bi_instr *I, unsigned arch)
case BI_OPCODE_RSHIFT_XOR_I32:
case BI_OPCODE_RSHIFT_XOR_V2I16:
case BI_OPCODE_RSHIFT_XOR_V4I8:
hex |= (uint64_t)I->arithmetic << 34;
if (arch >= 15) {
/* Rewrite exact to ARSHIFT */
if (I->arithmetic) {
switch (I->op) {
case BI_OPCODE_RSHIFT_AND_I32:
case BI_OPCODE_RSHIFT_AND_V2I16:
case BI_OPCODE_RSHIFT_AND_V4I8: {
uint64_t arshift_and_op = (0xcULL << 30);
/* Check that we can safely overwrite opcode */
pack_assert(I, ((info.exact & (0xfULL << 30)) |
arshift_and_op) == arshift_and_op);
hex |= arshift_and_op;
break;
}
case BI_OPCODE_RSHIFT_OR_I32:
case BI_OPCODE_RSHIFT_OR_V2I16:
case BI_OPCODE_RSHIFT_OR_V4I8: {
uint64_t arshift_or_op = (0xdULL << 30);
/* Check that we can safely overwrite opcode */
pack_assert(I, ((info.exact & (0xfULL << 30)) | arshift_or_op) ==
arshift_or_op);
hex |= arshift_or_op;
break;
}
case BI_OPCODE_RSHIFT_XOR_I32:
case BI_OPCODE_RSHIFT_XOR_V2I16:
case BI_OPCODE_RSHIFT_XOR_V4I8: {
uint64_t arshift_xor_op = (0xbULL << 30);
/* Check that we can safely overwrite opcode */
pack_assert(I, ((info.exact & (0xfULL << 30)) |
arshift_xor_op) == arshift_xor_op);
hex |= arshift_xor_op;
break;
}
default:
UNREACHABLE("RSHIFT->ARSHIFT");
}
}
} else
hex |= (uint64_t)I->arithmetic << 34;
break;
case BI_OPCODE_LEA_BUF_IMM:
@ -564,8 +680,12 @@ va_pack_alu(const bi_instr *I, unsigned arch)
}
hex |= ((uint64_t)va_pack_source_format(I)) << 24;
hex |= ((uint64_t)I->update) << 36;
hex |= ((uint64_t)I->sample) << 38;
hex |= ((uint64_t)I->update) << ((arch >= 15) ? 35 : 36);
hex |= ((uint64_t)I->sample) << ((arch >= 15) ? 37 : 38);
break;
case BI_OPCODE_LD_VAR_BUF_FLAT_IMM:
hex |= ((uint64_t)I->index) << 8;
break;
case BI_OPCODE_LD_ATTR_IMM:
@ -599,20 +719,18 @@ va_pack_alu(const bi_instr *I, unsigned arch)
break;
}
/* FMA_RSCALE.f32 special modes treated as extra opcodes */
if (I->op == BI_OPCODE_FMA_RSCALE_F32) {
pack_assert(I, I->special < 4);
hex |= ((uint64_t)I->special) << 48;
}
/* Add the normal destination or a placeholder. Staging destinations are
* added elsewhere, as they require special handling for control fields.
*/
if (info.has_dest && info.nr_staging_dests == 0) {
hex |= (uint64_t)va_pack_dest(I) << 40;
if (arch >= 15)
hex |= (uint64_t)va_pack_dest_v15(I) << 40;
else
hex |= (uint64_t)va_pack_dest(I) << 40;
} else if (info.nr_staging_dests == 0 && info.nr_staging_srcs == 0) {
pack_assert(I, I->nr_dests == 0);
hex |= 0xC0ull << 40; /* Placeholder */
if (arch < 15)
hex |= 0xC0ull << 40; /* Placeholder */
}
bool swap12 = va_swap_12(I->op);
@ -627,7 +745,10 @@ va_pack_alu(const bi_instr *I, unsigned arch)
enum va_size size = src_info.size;
bi_index src = I->src[logical_i + src_offset];
hex |= (uint64_t)va_pack_src(I, logical_i + src_offset) << (8 * i);
if (arch >= 15)
hex |= va_pack_src_v15(I, logical_i + src_offset, i);
else
hex |= (uint64_t)va_pack_src(I, logical_i + src_offset) << (8 * i);
if (src_info.notted) {
if (src.neg)
@ -636,10 +757,15 @@ va_pack_alu(const bi_instr *I, unsigned arch)
unsigned neg_offs = 32 + 2 + ((2 - i) * 2);
unsigned abs_offs = 33 + 2 + ((2 - i) * 2);
if (src.neg)
hex |= 1ull << neg_offs;
if (src.abs)
hex |= 1ull << abs_offs;
if (arch >= 15 && I->op == BI_OPCODE_FMA_RSCALE_F32 && i == 2) {
if (src.neg)
hex |= 1ull << (neg_offs + 1);
} else {
if (src.neg)
hex |= 1ull << neg_offs;
if (src.abs)
hex |= 1ull << abs_offs;
}
} else {
if (src.neg)
invalid_instruction(I, "negate");
@ -659,8 +785,8 @@ va_pack_alu(const bi_instr *I, unsigned arch)
unsigned offs = (i == 1) ? 26 : 36;
hex |= (uint64_t)va_pack_widen(I, src.swizzle, src_info.size) << offs;
} else if (src_info.lane) {
unsigned offs = (I->op == BI_OPCODE_MKVEC_V2I8) ?
((i == 0) ? 38 : 36) : ((i == 0) ? 28 : 26);
unsigned offs = (I->op == BI_OPCODE_MKVEC_V2I8) ? ((i == 0) ? 38 : 36)
: ((i == 0) ? 28 : 26);
if (src_info.size == VA_SIZE_16) {
hex |= (src.swizzle == BI_SWIZZLE_H1 ? 1 : 0) << offs;
@ -673,7 +799,25 @@ va_pack_alu(const bi_instr *I, unsigned arch)
} else if (src_info.lanes) {
pack_assert(I, src_info.size == VA_SIZE_8);
pack_assert(I, i == 1);
hex |= (uint64_t)va_pack_shift_lanes(I, src.swizzle) << 26;
if (arch >= 15 && I->op == BI_OPCODE_CLPER_I32) {
switch (src.swizzle) {
case BI_SWIZZLE_B00:
hex |= 0x0ULL << 28;
break;
case BI_SWIZZLE_B11:
hex |= 0x1ULL << 28;
break;
case BI_SWIZZLE_B22:
hex |= 0x2ULL << 28;
break;
case BI_SWIZZLE_B33:
hex |= 0x3ULL << 28;
break;
default:
invalid_instruction(I, "lane shift");
}
} else
hex |= (uint64_t)va_pack_shift_lanes(I, src.swizzle) << 26;
} else if (src_info.combine) {
/* Treat as swizzle, subgroup ops not yet supported */
pack_assert(I, src_info.size == VA_SIZE_32);
@ -689,17 +833,33 @@ va_pack_alu(const bi_instr *I, unsigned arch)
}
if (info.saturate)
hex |= (uint64_t)I->saturate << 30;
if (info.rhadd)
hex |= (uint64_t)I->saturate << ((arch >= 15) ? 25 : 30);
if (info.rhadd) {
pack_assert(I, arch < 15);
hex |= va_pack_rhadd(I);
if (info.clamp)
hex |= (uint64_t)I->clamp << 32;
if (info.round_mode)
hex |= (uint64_t)I->round << 30;
}
/* FMA_RSCALE.f32 special modes treated as extra opcodes */
if (I->op == BI_OPCODE_FMA_RSCALE_F32) {
if (arch >= 15) {
hex |= va_pack_clamp_special_round_v15(I) << 32;
} else {
pack_assert(I, I->special < 4);
hex |= ((uint64_t)I->special) << 48;
if (info.clamp)
hex |= (uint64_t)I->clamp << 32;
if (info.round_mode && I->round == BI_ROUND_RTZ)
hex |= (uint64_t)0x1 << 50;
}
} else {
if (info.clamp)
hex |= (uint64_t)I->clamp << ((arch >= 15) ? 30 : 32);
if (info.round_mode)
hex |= (uint64_t)I->round << ((arch >= 15) ? 32 : 30);
}
if (info.condition)
hex |= (uint64_t)I->cmpf << 32;
hex |= (uint64_t)I->cmpf << ((arch >= 15) ? 33 : 32);
if (info.result_type)
hex |= (uint64_t)I->result_type << 30;
hex |= (uint64_t)I->result_type << ((arch >= 15) ? 24 : 30);
return hex;
}
@ -748,7 +908,8 @@ va_pack_load(const bi_instr *I, bool buffer_descriptor)
VA_LOAD_LANE_96_BIT_IDENTITY, VA_LOAD_LANE_128_BIT_IDENTITY,
};
unsigned memory_size = (valhall_opcodes[I->op].exact >> 27) & 0x7;
/* TODO hack */
unsigned memory_size = (get_valhall_opcode(I->op, 10).exact >> 27) & 0x7;
uint64_t hex = (uint64_t)load_lane_identity[memory_size] << 36;
// unsigned
@ -765,6 +926,26 @@ va_pack_load(const bi_instr *I, bool buffer_descriptor)
return hex;
}
static uint64_t
va_pack_load_v15(const bi_instr *I, bool buffer_descriptor)
{
/* This implicitly means identity: VA_LOAD_LANE_8_BIT_B0 for i8 (bits[28;27])
* and VA_LOAD_LANE_16_BIT_H0 for i16 (bit[27]) */
uint64_t hex = 0;
if (!buffer_descriptor)
hex |= va_pack_byte_offset(I);
hex |= va_pack_src_v15(I, 0, 0);
hex |= (uint64_t)I->mem_access << 24;
if (buffer_descriptor)
hex |= va_pack_src_v15(I, 1, 1);
return hex;
}
static uint64_t
va_pack_store(const bi_instr *I)
{
@ -779,6 +960,20 @@ va_pack_store(const bi_instr *I)
return hex;
}
static uint64_t
va_pack_store_v15(const bi_instr *I)
{
uint64_t hex = 0;
va_validate_register_pair(I, 1);
hex |= va_pack_src_v15(I, 1, 0);
hex |= I->mem_access << 24;
hex |= va_pack_byte_offset(I);
return hex;
}
static enum va_lod_mode
va_pack_lod_mode(const bi_instr *I)
{
@ -798,27 +993,6 @@ va_pack_lod_mode(const bi_instr *I)
invalid_instruction(I, "LOD mode");
}
static enum va_register_type
va_pack_register_type(const bi_instr *I)
{
switch (I->register_format) {
case BI_REGISTER_FORMAT_F16:
case BI_REGISTER_FORMAT_F32:
return VA_REGISTER_TYPE_F;
case BI_REGISTER_FORMAT_U16:
case BI_REGISTER_FORMAT_U32:
return VA_REGISTER_TYPE_U;
case BI_REGISTER_FORMAT_S16:
case BI_REGISTER_FORMAT_S32:
return VA_REGISTER_TYPE_S;
default:
invalid_instruction(I, "register type");
}
}
static enum va_register_format
va_pack_register_format(const bi_instr *I)
{
@ -842,13 +1016,45 @@ va_pack_register_format(const bi_instr *I)
}
}
static uint64_t
va_pack_src_null_v15(unsigned loc)
{
uint64_t hex = 0;
uint64_t regval = 0x1c0;
uint64_t low8 = regval & 0xff;
uint64_t high1 = (regval >> 8) & 0x1;
hex |= (low8 << (8 * loc));
hex |= (high1 << (48 + loc));
return hex;
}
static unsigned
va_repack_sr_control_v15(unsigned sr_control)
{
unsigned repacked = 0;
bool read = sr_control & 0x1;
bool write = sr_control & 0x2;
if (read) {
repacked |= 0x2;
if (write)
repacked |= 0x1;
}
return repacked;
}
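Enumerated, the remap from the legacy read (bit 0) and write (bit 1) flags is:
/* Legacy sr_control -> v15 encoding, per va_repack_sr_control_v15 above:
* 0b00 (none)         -> 0b00
* 0b01 (read)         -> 0b10
* 0b10 (write only)   -> 0b00 (write without read is dropped)
* 0b11 (read + write) -> 0b11
*/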
uint64_t
va_pack_instr(const bi_instr *I, unsigned arch)
{
struct va_opcode_info info = valhall_opcodes[I->op];
struct va_opcode_info info = get_valhall_opcode(I->op, arch);
uint64_t hex = info.exact | (((uint64_t)I->flow) << 59);
hex |= ((uint64_t)va_select_fau_page(I)) << 57;
uint64_t hex =
info.exact | (((uint64_t)I->flow) << ((arch >= 15) ? 58 : 59));
hex |= ((uint64_t)va_select_fau_page(I, arch)) << ((arch >= 15) ? 62 : 57);
if (info.slot)
hex |= ((uint64_t)I->slot << 30);
@ -860,14 +1066,60 @@ va_pack_instr(const bi_instr *I, unsigned arch)
unsigned count =
read ? bi_count_read_registers(I, 0) : bi_count_write_registers(I, 0);
hex |= ((uint64_t)count << 33);
hex |= (uint64_t)va_pack_reg(I, sr) << 40;
hex |= ((uint64_t)info.sr_control << 46);
hex |= ((uint64_t)count << ((arch >= 15) ? 32 : 33));
if (arch >= 15) {
hex |= (uint64_t)va_pack_reg_v15(I, sr) << 40;
hex |= ((uint64_t)va_repack_sr_control_v15(info.sr_control) << 38);
} else {
hex |= (uint64_t)va_pack_reg(I, sr) << 40;
hex |= ((uint64_t)info.sr_control << 46);
}
}
/* On v15, some instructions require special sr_control values */
if (arch >= 15) {
switch (I->op) {
case BI_OPCODE_BARRIER: {
unsigned sr_control = va_repack_sr_control_v15(info.sr_control);
pack_assert(I, sr_control == 0x0 || sr_control == 0x2);
hex |= (uint64_t)0x2 << 38;
break;
}
case BI_OPCODE_ATOM1_RETURN_I32:
case BI_OPCODE_ATOM1_RETURN_I64: {
unsigned sr_control = va_repack_sr_control_v15(info.sr_control);
pack_assert(I, sr_control == 0x0);
break;
}
case BI_OPCODE_ATOM_I32:
case BI_OPCODE_ATOM_I64: {
unsigned sr_control = va_repack_sr_control_v15(info.sr_control);
pack_assert(I, sr_control == 0x2);
break;
}
case BI_OPCODE_ATOM_RETURN_I32:
case BI_OPCODE_ATOM_RETURN_I64:
case BI_OPCODE_AXCHG_I32:
case BI_OPCODE_AXCHG_I64:
case BI_OPCODE_ACMPXCHG_I32:
case BI_OPCODE_ACMPXCHG_I64: {
unsigned sr_control = va_repack_sr_control_v15(info.sr_control);
pack_assert(I, sr_control == 0x0 || sr_control == 0x3);
hex |= (uint64_t)0x3 << 38;
break;
}
default:
break;
}
}
if (info.sr_write_count) {
hex |= ((uint64_t)bi_count_write_registers(I, 0) - 1) << 36;
hex |= ((uint64_t)va_pack_reg(I, I->dest[0])) << 16;
hex |= ((uint64_t)bi_count_write_registers(I, 0) - 1)
<< ((arch >= 15) ? 35 : 36);
if (arch >= 15)
hex |= ((uint64_t)va_pack_reg_v15(I, I->dest[0])) << 16;
else
hex |= ((uint64_t)va_pack_reg(I, I->dest[0])) << 16;
}
if (info.vecsize)
@ -885,7 +1137,10 @@ va_pack_instr(const bi_instr *I, unsigned arch)
case BI_OPCODE_LOAD_I64:
case BI_OPCODE_LOAD_I96:
case BI_OPCODE_LOAD_I128:
hex |= va_pack_load(I, false);
if (arch >= 15)
hex |= va_pack_load_v15(I, false);
else
hex |= va_pack_load(I, false);
break;
case BI_OPCODE_LD_PKA_I8:
@ -896,7 +1151,10 @@ va_pack_instr(const bi_instr *I, unsigned arch)
case BI_OPCODE_LD_PKA_I64:
case BI_OPCODE_LD_PKA_I96:
case BI_OPCODE_LD_PKA_I128:
hex |= va_pack_load(I, true);
if (arch >= 15)
hex |= va_pack_load_v15(I, true);
else
hex |= va_pack_load(I, true);
break;
case BI_OPCODE_STORE_I8:
@ -907,20 +1165,26 @@ va_pack_instr(const bi_instr *I, unsigned arch)
case BI_OPCODE_STORE_I64:
case BI_OPCODE_STORE_I96:
case BI_OPCODE_STORE_I128:
hex |= va_pack_store(I);
if (arch >= 15)
hex |= va_pack_store_v15(I);
else
hex |= va_pack_store(I);
break;
case BI_OPCODE_ATOM1_RETURN_I64:
/* Permit omitting the destination for plain ATOM1 */
if (!bi_count_write_registers(I, 0)) {
if (arch < 15 && !bi_count_write_registers(I, 0)) {
hex |= (0x40ull << 40); // fake read
}
/* 64-bit source */
va_validate_register_pair(I, 0);
hex |= (uint64_t)va_pack_src(I, 0) << 0;
if (arch >= 15)
hex |= va_pack_src_v15(I, 0, 0);
else
hex |= (uint64_t)va_pack_src(I, 0) << 0;
hex |= va_pack_byte_offset_8(I);
hex |= ((uint64_t)va_pack_atom_opc_1(I)) << 22;
hex |= ((uint64_t)va_pack_atom_opc_1(I)) << ((arch >= 15) ? 24 : 22);
break;
case BI_OPCODE_ACMPXCHG_I64:
@ -929,29 +1193,43 @@ va_pack_instr(const bi_instr *I, unsigned arch)
case BI_OPCODE_ATOM_RETURN_I64:
/* 64-bit source */
va_validate_register_pair(I, 1);
hex |= (uint64_t)va_pack_src(I, 1) << 0;
if (arch >= 15)
hex |= va_pack_src_v15(I, 1, 0);
else
hex |= (uint64_t)va_pack_src(I, 1) << 0;
hex |= va_pack_byte_offset_8(I);
hex |= ((uint64_t)va_pack_atom_opc(I)) << 22;
hex |= ((uint64_t)va_pack_atom_opc(I)) << ((arch >= 15) ? 24 : 22);
if (I->op == BI_OPCODE_ATOM_RETURN_I64)
hex |= (0xc0ull << 40); // flags
if (arch >= 15) {
if (I->atom_opc == BI_ATOM_OPC_ACMPXCHG) {
/* Change bits [51;50] to be ACMPXCHG */
pack_assert(I, ((hex >> 50) & 0b11) == 0b01);
hex ^= (0b11ull << 50);
}
} else {
if (I->op == BI_OPCODE_ATOM_RETURN_I64)
hex |= (0xc0ull << 40); // flags
if (I->atom_opc == BI_ATOM_OPC_ACMPXCHG)
hex |= (1 << 26); /* .compare */
if (I->atom_opc == BI_ATOM_OPC_ACMPXCHG)
hex |= (1 << 26); /* .compare */
}
break;
case BI_OPCODE_ATOM1_RETURN_I32:
/* Permit omitting the destination for plain ATOM1 */
if (!bi_count_write_registers(I, 0)) {
if (arch < 15 && !bi_count_write_registers(I, 0)) {
hex |= (0x40ull << 40); // fake read
}
/* 64-bit source */
va_validate_register_pair(I, 0);
hex |= (uint64_t)va_pack_src(I, 0) << 0;
if (arch >= 15)
hex |= va_pack_src_v15(I, 0, 0);
else
hex |= (uint64_t)va_pack_src(I, 0) << 0;
hex |= va_pack_byte_offset_8(I);
hex |= ((uint64_t)va_pack_atom_opc_1(I)) << 22;
hex |= ((uint64_t)va_pack_atom_opc_1(I)) << ((arch >= 15) ? 24 : 22);
break;
case BI_OPCODE_ACMPXCHG_I32:
@ -960,41 +1238,67 @@ va_pack_instr(const bi_instr *I, unsigned arch)
case BI_OPCODE_ATOM_RETURN_I32:
/* 64-bit source */
va_validate_register_pair(I, 1);
hex |= (uint64_t)va_pack_src(I, 1) << 0;
if (arch >= 15)
hex |= va_pack_src_v15(I, 1, 0);
else
hex |= (uint64_t)va_pack_src(I, 1) << 0;
hex |= va_pack_byte_offset_8(I);
hex |= ((uint64_t)va_pack_atom_opc(I)) << 22;
hex |= ((uint64_t)va_pack_atom_opc(I)) << ((arch >= 15) ? 24 : 22);
if (I->op == BI_OPCODE_ATOM_RETURN_I32)
hex |= (0xc0ull << 40); // flags
if (arch >= 15) {
if (I->atom_opc == BI_ATOM_OPC_ACMPXCHG) {
/* Change bits [51:50] to be ACMPXCHG */
pack_assert(I, ((hex >> 50) & 0b11) == 0b01);
hex ^= (0b11ull << 50);
}
} else {
if (I->op == BI_OPCODE_ATOM_RETURN_I32)
hex |= (0xc0ull << 40); // flags
if (I->atom_opc == BI_ATOM_OPC_ACMPXCHG)
hex |= (1 << 26); /* .compare */
if (I->atom_opc == BI_ATOM_OPC_ACMPXCHG)
hex |= (1 << 26); /* .compare */
}
break;
case BI_OPCODE_LD_CVT:
hex |= (uint64_t)va_pack_src(I, 0);
if (arch >= 15)
hex |= va_pack_src_v15(I, 0, 0);
else
hex |= (uint64_t)va_pack_src(I, 0);
hex |= va_pack_byte_offset(I);
/* Conversion descriptor */
hex |= (uint64_t)va_pack_src(I, 2) << 16;
hex |= (uint64_t)I->mem_access << 37;
if (arch >= 15)
hex |= va_pack_src_v15(I, 2, 2);
else
hex |= (uint64_t)va_pack_src(I, 2) << 16;
hex |= (uint64_t)I->mem_access << ((arch >= 15) ? 35 : 37);
break;
case BI_OPCODE_ST_CVT:
/* Staging read */
va_validate_register_pair(I, 1);
hex |= (uint64_t)va_pack_src(I, 1) << 0;
if (arch >= 15)
hex |= va_pack_src_v15(I, 1, 0);
else
hex |= (uint64_t)va_pack_src(I, 1) << 0;
hex |= va_pack_byte_offset(I);
/* Conversion descriptor */
hex |= (uint64_t)va_pack_src(I, 3) << 16;
hex |= (uint64_t)I->mem_access << 37;
if (arch >= 15)
hex |= va_pack_src_v15(I, 3, 2);
else
hex |= (uint64_t)va_pack_src(I, 3) << 16;
hex |= (uint64_t)I->mem_access << ((arch >= 15) ? 35 : 37);
break;
case BI_OPCODE_BLEND: {
/* Source 0 - Blend descriptor (64-bit) */
hex |= ((uint64_t)va_pack_src(I, 2)) << 0;
if (arch >= 15)
hex |= va_pack_src_v15(I, 2, 0);
else
hex |= ((uint64_t)va_pack_src(I, 2)) << 0;
va_validate_register_pair(I, 2);
/* Target */
@ -1005,7 +1309,10 @@ va_pack_instr(const bi_instr *I, unsigned arch)
hex |= ((I->branch_offset >> 3) << 8);
/* Source 2 - coverage mask */
hex |= ((uint64_t)va_pack_reg(I, I->src[1])) << 16;
if (arch >= 15)
hex |= va_pack_src_v15(I, 1, 2);
else
hex |= ((uint64_t)va_pack_reg(I, I->src[1])) << 16;
/* Vector size */
unsigned vecsize = 4;
@ -1015,7 +1322,7 @@ va_pack_instr(const bi_instr *I, unsigned arch)
}
case BI_OPCODE_LD_GCLK_U64:
hex |= va_pack_gclk(I);
hex |= va_pack_gclk(I) << ((arch >= 15) ? 8 : 0);
break;
case BI_OPCODE_TEX_GRADIENT:
@ -1023,7 +1330,10 @@ va_pack_instr(const bi_instr *I, unsigned arch)
case BI_OPCODE_TEX_FETCH:
case BI_OPCODE_TEX_GATHER: {
/* Image to read from */
hex |= ((uint64_t)va_pack_src(I, 1)) << 0;
if (arch >= 15)
hex |= va_pack_src_v15(I, 1, 0);
else
hex |= ((uint64_t)va_pack_src(I, 1)) << 0;
if ((I->op == BI_OPCODE_TEX_FETCH || I->op == BI_OPCODE_TEX_GRADIENT) &&
I->shadow)
@ -1040,7 +1350,7 @@ va_pack_instr(const bi_instr *I, unsigned arch)
if (I->skip)
hex |= (1ull << 39);
if (!bi_is_regfmt_16(I->register_format))
hex |= (1ull << 46);
hex |= (1ull << ((arch >= 15) ? 38 : 46));
if (I->op == BI_OPCODE_TEX_GRADIENT) {
if (I->force_delta_enable)
@ -1062,20 +1372,35 @@ va_pack_instr(const bi_instr *I, unsigned arch)
hex |= ((uint64_t)I->fetch_component) << 14;
}
hex |= (I->write_mask << 22);
hex |= (I->write_mask << ((arch >= 15) ? 24 : 22));
hex |= ((uint64_t)I->dimension) << 28;
break;
}
default:
if (!info.exact && I->op != BI_OPCODE_NOP)
if (!info.exact && (arch >= 15 || I->op != BI_OPCODE_NOP))
invalid_instruction(I, "opcode");
hex |= va_pack_alu(I, arch);
break;
}
/* On v15, some instructions require an encoded null src. */
if (arch >= 15) {
switch (I->op) {
case BI_OPCODE_NOP:
case BI_OPCODE_LD_VAR_FLAT_IMM:
case BI_OPCODE_LD_VAR_BUF_FLAT_IMM:
case BI_OPCODE_LD_GCLK_U64:
case BI_OPCODE_BARRIER:
hex |= va_pack_src_null_v15(0);
break;
default:
break;
}
}
return hex;
}
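The (arch >= 15) ? new : old shift selections scattered through va_pack_instr() all follow one pattern: several fields moved by a few bits between the v9-v13 and v15 encodings (atomic opcode 22 to 24, texture write mask 22 to 24, mem_access 37 to 35, the wide register-format flag 46 to 38). A minimal sketch of that pattern; the helper name and the idea of factoring it out are illustrative only:

static inline uint64_t
va_pack_shifted(uint64_t value, unsigned arch, unsigned pre_v15_shift,
                unsigned v15_shift)
{
   /* Hypothetical helper; va_pack_instr() above open-codes this per field. */
   return value << ((arch >= 15) ? v15_shift : pre_v15_shift);
}

/* e.g. hex |= va_pack_shifted(va_pack_atom_opc(I), arch, 22, 24); */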

View file

@ -9,7 +9,7 @@
#include "valhall.h"
void
va_count_instr_stats(bi_instr *I, struct va_stats *stats)
va_count_instr_stats(bi_instr *I, unsigned arch, struct va_stats *stats)
{
/* Adjusted for 64-bit arithmetic */
unsigned words = bi_count_write_registers(I, 0);
@ -35,7 +35,7 @@ va_count_instr_stats(bi_instr *I, struct va_stats *stats)
}
}
}
switch (valhall_opcodes[I->op].unit) {
switch (get_valhall_opcode(I->op, arch).unit) {
/* Arithmetic is 2x slower for 64-bit than 32-bit */
case VA_UNIT_FMA:
stats->fma += words;

View file

@ -93,7 +93,8 @@ fau_state_uniform(struct fau_state *fau, bi_index idx, enum bi_opcode op)
}
static bool
fau_state_special(struct fau_state *fau, bi_index idx, enum bi_opcode op)
fau_state_special(struct fau_state *fau, bi_index idx, enum bi_opcode op,
unsigned arch)
{
for (unsigned i = 0; i < ARRAY_SIZE(fau->buffer); ++i) {
bi_index buf = fau->buffer[i];
@ -106,7 +107,7 @@ fau_state_special(struct fau_state *fau, bi_index idx, enum bi_opcode op)
/* Instructions executed by the messaging unit should not encode WARP_ID or
* anything from special page 3. */
if (can_run_on_message_unit(op) &&
(va_fau_page(idx.value) == 3 || idx.value == BIR_FAU_WARP_ID))
(va_fau_page(idx.value, arch) == 3 || idx.value == BIR_FAU_WARP_ID))
return false;
return fau->uniform_slot == -1 || can_use_two_fau_indices(op);
@ -114,7 +115,7 @@ fau_state_special(struct fau_state *fau, bi_index idx, enum bi_opcode op)
static bool
valid_src(struct fau_state *fau, unsigned fau_page, bi_index src,
enum bi_opcode op)
enum bi_opcode op, unsigned arch)
{
if (src.type != BI_INDEX_FAU)
return true;
@ -128,42 +129,42 @@ valid_src(struct fau_state *fau, unsigned fau_page, bi_index src,
return fau_state_buffer(fau, src);
}
bool valid = (fau_page == va_fau_page(src.value));
bool valid = (fau_page == va_fau_page(src.value, arch));
valid &= fau_state_buffer(fau, src);
if (src.value & BIR_FAU_UNIFORM)
valid &= fau_state_uniform(fau, src, op);
else if (fau_is_special(src.value))
valid &= fau_state_special(fau, src, op);
valid &= fau_state_special(fau, src, op, arch);
return valid;
}
bool
va_validate_fau(bi_instr *I)
va_validate_fau(bi_instr *I, unsigned arch)
{
bool valid = true;
struct fau_state fau = {.uniform_slot = -1};
unsigned fau_page = va_select_fau_page(I);
unsigned fau_page = va_select_fau_page(I, arch);
bi_foreach_src(I, s) {
valid &= valid_src(&fau, fau_page, I->src[s], I->op);
valid &= valid_src(&fau, fau_page, I->src[s], I->op, arch);
}
return valid;
}
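A hedged sketch of how a pass could combine this check with the repair helper defined just below, using builder construction as seen elsewhere in the compiler (the pass itself is hypothetical):

static void
va_lower_fau_sketch(bi_context *ctx)
{
   bi_foreach_instr_global(ctx, I) {
      if (!va_validate_fau(I, ctx->arch)) {
         /* Insert repair moves ahead of the offending instruction. */
         bi_builder b = bi_init_builder(ctx, bi_before_instr(I));
         va_repair_fau(&b, I, ctx->arch);
      }
   }
}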
void
va_repair_fau(bi_builder *b, bi_instr *I)
va_repair_fau(bi_builder *b, bi_instr *I, unsigned arch)
{
struct fau_state fau = {.uniform_slot = -1};
unsigned fau_page = va_select_fau_page(I);
unsigned fau_page = va_select_fau_page(I, arch);
bi_foreach_src(I, s) {
struct fau_state push = fau;
bi_index src = I->src[s];
if (!valid_src(&fau, fau_page, src, I->op)) {
if (!valid_src(&fau, fau_page, src, I->op, arch)) {
bi_replace_src(I, s, bi_mov_i32(b, bi_strip_index(src)));
/* Rollback update. Since the replacement move doesn't affect FAU
@ -180,7 +181,7 @@ va_validate(FILE *fp, bi_context *ctx)
bool errors = false;
bi_foreach_instr_global(ctx, I) {
if (!va_validate_fau(I)) {
if (!va_validate_fau(I, ctx->arch)) {
if (!errors) {
fprintf(fp, "Validation failed, this is a bug. Shader:\n\n");
bi_print_shader(ctx, fp);

View file

@ -97,10 +97,10 @@ valhall_opcodes[BI_NUM_OPCODES] = {
sr_control = 0
if len(op.staging) > 0:
sr_control = op.staging[0].encoded_flags >> 6
sr_control = op.staging[0].encoded_flags
%>
[BI_OPCODE_${name.replace('.', '_').upper()}] = {
.exact = ${hex(exact(op))}ULL,
.exact = ${hex(exact(op.opcode))}ULL,
.srcs = {
% for src in ([sr for sr in op.staging if sr.read] + op.srcs):
{
@ -141,12 +141,84 @@ valhall_opcodes[BI_NUM_OPCODES] = {
% endif
% endfor
};
const struct va_opcode_info
valhall_v15_opcodes[BI_NUM_OPCODES] = {
% for op in instructions:
% if op.name not in skip:
<%
name = op.name
if name == 'BRANCHZ':
name = 'BRANCHZ.i16'
sr_control = 0
if len(op.staging) > 0:
sr_control = op.staging[0].encoded_flags
%>
[BI_OPCODE_${name.replace('.', '_').upper()}] = {
.exact = ${hex(exact(op.opcode_v15))}ULL,
.srcs = {
% for src in ([sr for sr in op.staging if sr.read] + op.srcs):
{
.absneg = ${ibool(src.absneg)},
.swizzle = ${ibool(src.swizzle)},
.notted = ${ibool(src.notted)},
.widen = ${ibool(src.widen)},
.lanes = ${ibool(src.lanes)},
.halfswizzle = ${ibool(src.halfswizzle)},
.lane = ${ibool(src.lane)},
.combine = ${ibool(src.combine)},
% if src.size in [8, 16, 32, 64]:
.size = VA_SIZE_${src.size},
% endif
},
% endfor
},
.type_size = ${typesize(op.name)},
.has_dest = ${ibool(len(op.dests) > 0)},
.is_signed = ${ibool(op.is_signed)},
.unit = VA_UNIT_${op.unit},
.nr_srcs = ${len(op.srcs)},
.nr_staging_srcs = ${sum([sr.read for sr in op.staging])},
.nr_staging_dests = ${sum([sr.write for sr in op.staging])},
.clamp = ${hasmod(x, 'clamp')},
.saturate = ${hasmod(x, 'saturate')},
.rhadd = ${hasmod(x, 'rhadd')},
.round_mode = ${hasmod(x, 'round_mode')},
.condition = ${hasmod(x, 'condition')},
.result_type = ${hasmod(x, 'result_type')},
.vecsize = ${hasmod(x, 'vector_size')},
.register_format = ${hasmod(x, 'register_format')},
.slot = ${hasmod(x, 'slot')},
.sr_count = ${hasmod(x, 'staging_register_count')},
.sr_write_count = ${hasmod(x, 'staging_register_write_count')},
.sr_control = ${sr_control},
},
% endif
% endfor
};
const struct va_opcode_info
get_valhall_opcode(enum bi_opcode op, unsigned arch)
{
assert(arch >= 9);
if (arch < 15)
return valhall_opcodes[op];
else
return valhall_v15_opcodes[op];
}
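/* Usage sketch (assumed caller-side spelling): direct valhall_opcodes[op]
 * indexing becomes an arch-aware lookup, e.g.
 *
 *    const struct va_opcode_info info = get_valhall_opcode(I->op, arch);
 *    uint64_t hex = info.exact;
 *
 * Returning the struct by value lets both tables stay behind one entry
 * point while keeping call sites unchanged apart from the arch argument.
 */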
"""
# Exact value to be ORed into every opcode
def exact_op(op):
def exact_op(opcode):
exact_op = 0
for subcode in op.opcode:
# Need an early return in case of removed instructions
if not opcode:
return exact_op
for subcode in opcode:
exact_op |= (subcode.value << subcode.start)
return exact_op
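As a worked example with hypothetical subcode values, an opcode built from {value 0x90 starting at bit 48} and {value 0x1 starting at bit 40} yields exact == (0x90 << 48) | (0x1 << 40) == 0x90010000000000, while the early return above makes exact 0 for instructions that have no encoding on the selected revision.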

View file

@ -89,7 +89,8 @@ struct va_opcode_info {
unsigned sr_control : 2;
};
extern const struct va_opcode_info valhall_opcodes[BI_NUM_OPCODES];
const struct va_opcode_info get_valhall_opcode(enum bi_opcode op,
unsigned arch);
/* Bifrost specifies the source of bitwise operations as (A, B, shift), but
* Valhall specifies (A, shift, B). We follow Bifrost conventions in the
@ -130,10 +131,10 @@ va_swap_12(enum bi_opcode op)
}
static inline struct va_src_info
va_src_info(enum bi_opcode op, unsigned src)
va_src_info(enum bi_opcode op, unsigned src, unsigned arch)
{
unsigned idx = (va_swap_12(op) && (src == 1 || src == 2)) ? (3 - src) : src;
return valhall_opcodes[op].srcs[idx];
return get_valhall_opcode(op, arch).srcs[idx];
}
static inline bool

View file

@ -14,6 +14,7 @@ import sys
instructions = []
MODIFIERS = {}
MODIFIERS_V15 = {}
enums = {}
immediates = []
@ -102,6 +103,11 @@ class Source:
self.offset['value'] = self.start
self.mask['value'] = bitmask(6)
self.offset['high1_v15'] = (index + 48)
self.mask['high1_v15'] = bitmask(1)
self.offset['low8_v15'] = self.start
self.mask['low8_v15'] = bitmask(8)
if absneg:
self.offset['neg'] = 32 + 2 + ((2 - index) * 2)
self.offset['abs'] = 33 + 2 + ((2 - index) * 2)
@ -137,6 +143,11 @@ class Dest:
self.offset['value'] = self.start
self.mask['value'] = bitmask(6)
self.offset['mode_v15'] = self.start + 13
self.mask['mode_v15'] = bitmask(2)
self.offset['value_v15'] = self.start
self.mask['value_v15'] = bitmask(8)
class Staging:
def __init__(self, read = False, write = False, count = 0, flags = 'true', name = ""):
self.name = name
@ -152,6 +163,14 @@ class Staging:
self.offset['value'] = self.start
self.mask['value'] = bitmask(6)
self.offset['flags'] = self.start + 6
self.mask['flags'] = bitmask(2)
self.offset['value_v15'] = self.start
self.mask['value_v15'] = bitmask(8)
self.offset['flags_v15'] = 38
self.mask['flags_v15'] = bitmask(2)
# For compatibility
self.absneg = False
@ -166,11 +185,14 @@ class Staging:
if not self.flags:
self.encoded_flags = 0
self.encoded_flags_v15 = 0
elif flags == 'rw':
self.encoded_flags = 0xc0
self.encoded_flags = 0b11
self.encoded_flags_v15 = 0b11
else:
assert(flags == 'true')
self.encoded_flags = (0x80 if write else 0) | (0x40 if read else 0)
self.encoded_flags = (0b10 if write else 0) | (0b01 if read else 0)
self.encoded_flags_v15 = (0b10 if read else 0) | (0b01 if read and write else 0)
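The v15 layouts declared above split each source operand into eight low bits at the field's base position plus one high bit at 48 + index. A minimal C sketch under exactly those assumptions (the helper name is invented):

static inline uint64_t
pack_src_v15_sketch(uint64_t value, unsigned start, unsigned index)
{
   uint64_t hex = (value & 0xff) << start;       /* low8_v15  */
   hex |= ((value >> 8) & 0x1) << (48 + index);  /* high1_v15 */
   return hex;
}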
class Immediate:
def __init__(self, name, start, size, signed):
@ -186,13 +208,16 @@ class Opcode:
self.mask = mask
class Instruction:
def __init__(self, name, opcode, srcs = [], dests = [], immediates = [], modifiers = [], staging = None, unit = None):
def __init__(self, name, opcode, opcode_v15, srcs = [], dests = [], immediates = [], immediates_v15 = [], modifiers = [], modifiers_v15 = [], staging = None, unit = None):
self.name = name
self.srcs = srcs
self.dests = dests
self.opcode = opcode
self.opcode_v15 = opcode_v15
self.immediates = immediates
self.immediates_v15 = immediates_v15
self.modifiers = modifiers
self.modifiers_v15 = modifiers_v15
self.staging = staging
self.unit = unit
self.is_signed = len(name.split(".")) > 1 and ('s' in name.split(".")[1])
@ -205,6 +230,11 @@ class Instruction:
self.offset['fau_page'] = 57
self.mask['fau_page'] = bitmask(2)
self.offset['flow_v15'] = 58
self.mask['flow_v15'] = bitmask(4)
self.offset['fau_page_v15'] = 62
self.mask['fau_page_v15'] = bitmask(2)
# Message-passing instruction <===> not ALU instruction
self.message = unit not in ["FMA", "CVT", "SFU"]
@ -273,6 +303,7 @@ def build_instr(el, overrides = {}):
# Get overridables
name = overrides.get('name') or el.attrib.get('name')
opcode = overrides.get('opcode') or build_opcode(el, 'opcode')
opcode_v15 = overrides.get('opcode_v15') or build_opcode(el, 'opcode_v15')
unit = overrides.get('unit') or el.attrib.get('unit')
# Get explicit sources/dests
@ -304,15 +335,25 @@ def build_instr(el, overrides = {}):
# Get immediates
imms = [build_imm(imm) for imm in el.findall('imm')]
imms_v15 = [build_imm(imm) for imm in el.findall('imm_v15_override')]
for imm in imms:
if imm.name not in {imm.name for imm in imms_v15}:
imms_v15.append(imm)
modifiers = []
modifiers_v15 = []
for mod in el:
if (mod.tag in MODIFIERS) and not (mod.attrib.get('pseudo', False)):
modifiers.append(MODIFIERS[mod.tag])
modifiers_v15.append(MODIFIERS_V15[mod.tag])
elif mod.tag =='va_mod':
modifiers.append(build_modifier(mod))
elif mod.tag =='va_mod_v15':
modifiers_v15.append(build_modifier(mod))
instr = Instruction(name, opcode, srcs = sources, dests = dests, immediates = imms, modifiers = modifiers, staging = staging, unit = unit)
instr = Instruction(name, opcode, opcode_v15, srcs = sources, dests = dests, immediates = imms, immediates_v15 = imms_v15,
modifiers = modifiers, modifiers_v15 = modifiers_v15, staging = staging, unit = unit)
instructions.append(instr)
@ -323,6 +364,7 @@ def build_group(el):
build_instr(el, overrides = {
'name': ins.attrib['name'],
'opcode': build_opcode(ins, 'opcode'),
'opcode_v15': build_opcode(ins, 'opcode_v15'),
'unit': ins.attrib.get('unit'),
})
@ -377,6 +419,7 @@ def typesize(name):
# Parse the ISA
def valhall_parse_isa(xmlfile):
global MODIFIERS
global MODIFIERS_V15
global enums
global immediates
global root
@ -404,7 +447,6 @@ def valhall_parse_isa(xmlfile):
"lod_bias_disable": Modifier("lod_mode", 13, 1),
"lod_clamp_disable": Modifier("lod_mode", 14, 1),
"write_mask": Modifier("write_mask", 22, 4),
"register_type": Modifier("register_type", 26, 2),
"dimension": Modifier("dimension", 28, 2),
"skip": Flag("skip", 39),
"register_width": Modifier("register_width", 46, 1, force_enum = "register_width"),
@ -438,6 +480,52 @@ def valhall_parse_isa(xmlfile):
"sample": Modifier("sample_mode", 38, 2),
}
MODIFIERS_V15 = {
# Texture instructions share a common encoding
"wide_indices": Flag("wide_indices", 8),
"array_enable": Flag("array_enable", 10),
"texel_offset": Flag("texel_offset", 11),
"shadow": Flag("shadow", 12),
"integer_coordinates": Flag("integer_coordinates", 13),
"fetch_component": Modifier("fetch_component", 14, 2),
"lod_mode": Modifier("lod_mode", 13, 3),
"lod_bias_disable": Modifier("lod_mode", 13, 1),
"lod_clamp_disable": Modifier("lod_mode", 14, 1),
"write_mask": Modifier("write_mask", 24, 4),
"dimension": Modifier("dimension", 28, 2),
"skip": Flag("skip", 39),
"register_width": Modifier("register_width", 38, 1, force_enum = "register_width"),
"secondary_register_width": Modifier("secondary_register_width", 54, 1, force_enum = "register_width"),
"vartex_register_width": Modifier("varying_texture_register_width", 24, 2),
"atom_opc": Modifier("atomic_operation", 24, 4),
"atom_opc_1": Modifier("atomic_operation_with_1", 24, 3),
"inactive_result": Modifier("inactive_result", 22, 4),
"memory_access": Modifier("memory_access", 24, 2),
"regfmt": Modifier("register_format", 24, 3),
"source_format": Modifier("source_format", 24, 2),
"vecsize": Modifier("vector_size", 28, 2),
"slot": Modifier("slot_v15", 30, 2),
"roundmode": Modifier("round_mode", 32, 2),
"result_type": Modifier("result_type", 24, 2),
"saturate": Flag("saturate", 25),
"not_result": Flag("not_result", 34),
"lane_op": Modifier("lane_operation", 32, 4),
"cmp": Modifier("condition", 33, 3),
"clamp": Modifier("clamp", 30, 2),
"sr_count": Modifier("staging_register_count", 32, 3, implied = True),
"sample_and_update": Modifier("sample_and_update_mode", 32, 3),
"sr_write_count": Modifier("staging_register_write_count", 35, 3, implied = True),
"conservative": Flag("conservative", 35),
"subgroup": Modifier("subgroup_size", 36, 4),
"update": Modifier("update_mode", 35, 2),
"sample": Modifier("sample_mode", 37, 2),
}
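Each relocated placement above surfaces in the packer as an arch-conditional shift; for instance the texture write mask, moved from bit 22 to bit 24, is emitted exactly as in the va_pack_instr() hunk earlier:

   hex |= (I->write_mask << ((arch >= 15) ? 24 : 22));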
for child in root:
if child.tag == 'group':
build_group(child)

View file

@ -52,8 +52,10 @@ pan_get_nir_shader_compiler_options(unsigned arch, bool merge_wg)
case 11:
case 12:
case 13:
return merge_wg ? &bifrost_nir_options_v11_merge_wg :
&bifrost_nir_options_v11;
case 14:
case 15:
return merge_wg ? &bifrost_nir_options_v11_merge_wg
: &bifrost_nir_options_v11;
default:
assert(!"Unsupported arch");
return NULL;
@ -285,7 +287,8 @@ pan_disassemble(FILE *fp, const void *code, size_t size, uint64_t gpu_id,
bool verbose)
{
if (pan_arch(gpu_id) >= 9)
disassemble_valhall(fp, (const uint64_t *)code, size, verbose);
disassemble_valhall(fp, (const uint64_t *)code, size, pan_arch(gpu_id),
verbose);
else if (pan_arch(gpu_id) >= 6)
disassemble_bifrost(fp, code, size, verbose);
else

View file

@ -824,7 +824,11 @@ cs_instr_is_asynchronous(enum mali_cs_opcode opcode, uint16_t wait_mask)
case MALI_CS_OPCODE_STORE_MULTIPLE:
case MALI_CS_OPCODE_RUN_COMPUTE:
case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT:
#if PAN_ARCH >= 14
case MALI_CS_OPCODE_RUN_FRAGMENT2:
#else
case MALI_CS_OPCODE_RUN_FRAGMENT:
#endif
case MALI_CS_OPCODE_RUN_FULLSCREEN:
#if PAN_ARCH >= 12
case MALI_CS_OPCODE_RUN_IDVS2:
@ -1614,6 +1618,22 @@ cs_run_idvs(struct cs_builder *b, uint32_t flags_override, bool malloc_enable,
}
#endif
#if PAN_ARCH >= 14
static inline void
cs_run_fragment2(struct cs_builder *b, bool enable_tem,
enum mali_tile_render_order tile_order)
{
/* Staging regs */
cs_flush_loads(b);
b->req_resource_mask |= CS_FRAG_RES;
cs_emit(b, RUN_FRAGMENT2, I) {
I.enable_tem = enable_tem;
I.tile_order = tile_order;
}
}
#else
static inline void
cs_run_fragment(struct cs_builder *b, bool enable_tem,
enum mali_tile_render_order tile_order)
@ -1628,6 +1648,7 @@ cs_run_fragment(struct cs_builder *b, bool enable_tem,
I.tile_order = tile_order;
}
}
#endif
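A hedged call-site sketch; the tile-order enum value name is assumed from the genxml naming scheme:

   /* Emits RUN_FRAGMENT2 on v14+ builds; pre-v14 builds compile the
    * cs_run_fragment() variant above instead. */
   cs_run_fragment2(b, false /* enable_tem */,
                    MALI_TILE_RENDER_ORDER_Z_ORDER);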
static inline void
cs_run_fullscreen(struct cs_builder *b, uint32_t flags_override,
@ -2469,6 +2490,53 @@ cs_trace_preamble(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
(int16_t)(offsetof(struct cs_##__type##_trace, __field) - \
sizeof(struct cs_##__type##_trace))
#if PAN_ARCH >= 14
#define CS_RUN_FRAGMENT2_SR_COUNT 56
#define CS_RUN_FRAGMENT2_SR_MASK BITFIELD64_RANGE(0, CS_RUN_FRAGMENT2_SR_COUNT)
struct cs_run_fragment2_trace {
uint64_t ip;
uint32_t sr[CS_RUN_FRAGMENT2_SR_COUNT];
} __attribute__((aligned(64)));
static inline void
cs_trace_run_fragment2(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
struct cs_index scratch_regs, bool enable_tem,
enum mali_tile_render_order tile_order)
{
if (likely(!ctx->enabled)) {
cs_run_fragment2(b, enable_tem, tile_order);
return;
}
struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg);
struct cs_index data = cs_reg64(b, scratch_regs.reg + 2);
cs_trace_preamble(b, ctx, scratch_regs,
sizeof(struct cs_run_fragment2_trace));
/* cs_run_xx() must immediately follow cs_load_ip_to() otherwise the IP
* won't point to the right instruction. */
cs_load_ip_to(b, data);
cs_run_fragment2(b, enable_tem, tile_order);
cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_fragment2, ip));
ASSERTED unsigned sr_count = 0;
unsigned sr_offset = cs_trace_field_offset(run_fragment2, sr);
for (unsigned i = 0; i < CS_RUN_FRAGMENT2_SR_COUNT; i += 16) {
unsigned mask = (CS_RUN_FRAGMENT2_SR_MASK >> i) & BITFIELD_MASK(16);
if (!mask)
continue;
cs_store(b, cs_reg_tuple(b, i, util_last_bit(mask)), tracebuf_addr, mask,
sr_offset);
sr_offset += util_bitcount(mask) * sizeof(uint32_t);
sr_count += util_bitcount(mask);
}
assert(sr_count == CS_RUN_FRAGMENT2_SR_COUNT);
cs_flush_stores(b);
}
#else
struct cs_run_fragment_trace {
uint64_t ip;
uint32_t sr[7];
@ -2500,6 +2568,7 @@ cs_trace_run_fragment(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
cs_trace_field_offset(run_fragment, sr));
cs_flush_stores(b);
}
#endif
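Size check for the new trace record: 8 bytes of IP plus 56 x 4 bytes of saved registers is 232 bytes, which the aligned(64) attribute rounds up to 256 bytes per RUN_FRAGMENT2 entry.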
#if PAN_ARCH >= 13
#define CS_RUN_FULLSCREEN_SR_MASK \

View file

@ -152,22 +152,22 @@ pandecode_rt(struct pandecode_context *ctx, unsigned index, uint64_t gpu_va)
}
static void
pandecode_rts(struct pandecode_context *ctx, uint64_t gpu_va,
const struct MALI_FRAMEBUFFER_PARAMETERS *fb)
void
GENX(pandecode_rts)(struct pandecode_context *ctx, uint64_t gpu_va,
uint32_t render_target_count)
{
pandecode_log(ctx, "Color Render Targets @%" PRIx64 ":\n", gpu_va);
ctx->indent++;
for (int i = 0; i < (fb->render_target_count); i++)
for (int i = 0; i < render_target_count; i++)
pandecode_rt(ctx, i, gpu_va);
ctx->indent--;
pandecode_log(ctx, "\n");
}
static void
pandecode_zs_crc_ext(struct pandecode_context *ctx, uint64_t gpu_va)
void
GENX(pandecode_zs_crc_ext)(struct pandecode_context *ctx, uint64_t gpu_va)
{
const struct mali_zs_crc_extension_packed *PANDECODE_PTR_VAR(
ctx, zs_crc_packed, (uint64_t)gpu_va);
@ -223,22 +223,65 @@ pandecode_zs_crc_ext(struct pandecode_context *ctx, uint64_t gpu_va)
#if PAN_ARCH >= 6
static void
pandecode_sample_locations(struct pandecode_context *ctx, const void *fb)
void
GENX(pandecode_frame_shader_dcds)(struct pandecode_context *ctx,
uint64_t dcd_pointer, unsigned pre_frame_0,
unsigned pre_frame_1, unsigned post_frame,
unsigned job_type_param, uint64_t gpu_id)
{
pan_section_unpack(fb, FRAMEBUFFER, PARAMETERS, params);
const unsigned dcd_size = pan_size(DRAW);
const uint16_t *PANDECODE_PTR_VAR(ctx, samples, params.sample_locations);
if (pre_frame_0 != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) {
const struct mali_draw_packed *PANDECODE_PTR_VAR(
ctx, dcd, dcd_pointer + (0 * dcd_size));
pan_unpack(dcd, DRAW, draw)
;
pandecode_log(ctx, "Pre frame 0 @%" PRIx64 " (mode=%d):\n", dcd_pointer,
pre_frame_0);
ctx->indent++;
GENX(pandecode_dcd)(ctx, &draw, job_type_param, gpu_id);
ctx->indent--;
}
pandecode_log(ctx, "Sample locations @%" PRIx64 ":\n",
params.sample_locations);
if (pre_frame_1 != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) {
const struct mali_draw_packed *PANDECODE_PTR_VAR(
ctx, dcd, dcd_pointer + (1 * dcd_size));
pan_unpack(dcd, DRAW, draw)
;
pandecode_log(ctx, "Pre frame 1 @%" PRIx64 ":\n",
dcd_pointer + (1 * dcd_size));
ctx->indent++;
GENX(pandecode_dcd)(ctx, &draw, job_type_param, gpu_id);
ctx->indent--;
}
if (post_frame != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) {
const struct mali_draw_packed *PANDECODE_PTR_VAR(
ctx, dcd, dcd_pointer + (2 * dcd_size));
pan_unpack(dcd, DRAW, draw)
;
pandecode_log(ctx, "Post frame:\n");
ctx->indent++;
GENX(pandecode_dcd)(ctx, &draw, job_type_param, gpu_id);
ctx->indent--;
}
}
void
GENX(pandecode_sample_locations)(struct pandecode_context *ctx,
uint64_t sample_locations)
{
const uint16_t *PANDECODE_PTR_VAR(ctx, samples, sample_locations);
pandecode_log(ctx, "Sample locations @%" PRIx64 ":\n", sample_locations);
for (int i = 0; i < 33; i++) {
pandecode_log(ctx, " (%d, %d),\n", samples[2 * i] - 128,
samples[2 * i + 1] - 128);
}
}
#endif
#endif /* PAN_ARCH >= 6 */
#if PAN_ARCH < 14
struct pandecode_fbd
GENX(pandecode_fbd)(struct pandecode_context *ctx, uint64_t gpu_va,
bool is_fragment, uint64_t gpu_id)
@ -248,46 +291,17 @@ GENX(pandecode_fbd)(struct pandecode_context *ctx, uint64_t gpu_va,
DUMP_UNPACKED(ctx, FRAMEBUFFER_PARAMETERS, params, "Parameters:\n");
#if PAN_ARCH >= 6
pandecode_sample_locations(ctx, fb);
GENX(pandecode_sample_locations)(ctx, params.sample_locations);
unsigned dcd_size = pan_size(DRAW);
unsigned job_type_param = 0;
#if PAN_ARCH <= 9
job_type_param = MALI_JOB_TYPE_FRAGMENT;
#endif
if (params.pre_frame_0 != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) {
const struct mali_draw_packed *PANDECODE_PTR_VAR(
ctx, dcd, params.frame_shader_dcds + (0 * dcd_size));
pan_unpack(dcd, DRAW, draw);
pandecode_log(ctx, "Pre frame 0 @%" PRIx64 " (mode=%d):\n",
params.frame_shader_dcds, params.pre_frame_0);
ctx->indent++;
GENX(pandecode_dcd)(ctx, &draw, job_type_param, gpu_id);
ctx->indent--;
}
if (params.pre_frame_1 != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) {
const struct mali_draw_packed *PANDECODE_PTR_VAR(
ctx, dcd, params.frame_shader_dcds + (1 * dcd_size));
pan_unpack(dcd, DRAW, draw);
pandecode_log(ctx, "Pre frame 1 @%" PRIx64 ":\n",
params.frame_shader_dcds + (1 * dcd_size));
ctx->indent++;
GENX(pandecode_dcd)(ctx, &draw, job_type_param, gpu_id);
ctx->indent--;
}
if (params.post_frame != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) {
const struct mali_draw_packed *PANDECODE_PTR_VAR(
ctx, dcd, params.frame_shader_dcds + (2 * dcd_size));
pan_unpack(dcd, DRAW, draw);
pandecode_log(ctx, "Post frame:\n");
ctx->indent++;
GENX(pandecode_dcd)(ctx, &draw, job_type_param, gpu_id);
ctx->indent--;
}
GENX(pandecode_frame_shader_dcds)
(ctx, params.frame_shader_dcds, params.pre_frame_0, params.pre_frame_1,
params.post_frame, job_type_param, gpu_id);
#else
DUMP_SECTION(ctx, FRAMEBUFFER, LOCAL_STORAGE, fb, "Local Storage:\n");
@ -312,13 +326,13 @@ GENX(pandecode_fbd)(struct pandecode_context *ctx, uint64_t gpu_va,
gpu_va += pan_size(FRAMEBUFFER);
if (params.has_zs_crc_extension) {
pandecode_zs_crc_ext(ctx, gpu_va);
GENX(pandecode_zs_crc_ext)(ctx, gpu_va);
gpu_va += pan_size(ZS_CRC_EXTENSION);
}
if (is_fragment)
pandecode_rts(ctx, gpu_va, &params);
GENX(pandecode_rts)(ctx, gpu_va, params.render_target_count);
return (struct pandecode_fbd){
.rt_count = params.render_target_count,
@ -336,6 +350,7 @@ GENX(pandecode_fbd)(struct pandecode_context *ctx, uint64_t gpu_va,
};
#endif
}
#endif /* PAN_ARCH < 14 */
#if PAN_ARCH >= 5
uint64_t

View file

@ -132,6 +132,20 @@ void pandecode_cs_binary_v13(struct pandecode_context *ctx, uint64_t bin,
void pandecode_cs_trace_v13(struct pandecode_context *ctx, uint64_t trace,
uint32_t trace_size, uint64_t gpu_id);
void pandecode_interpret_cs_v14(struct pandecode_context *ctx, uint64_t queue,
uint32_t size, uint64_t gpu_id, uint32_t *regs);
void pandecode_cs_binary_v14(struct pandecode_context *ctx, uint64_t bin,
uint32_t bin_size);
void pandecode_cs_trace_v14(struct pandecode_context *ctx, uint64_t trace,
uint32_t trace_size, uint64_t gpu_id);
void pandecode_interpret_cs_v15(struct pandecode_context *ctx, uint64_t queue,
uint32_t size, uint64_t gpu_id, uint32_t *regs);
void pandecode_cs_binary_v15(struct pandecode_context *ctx, uint64_t bin,
uint32_t bin_size);
void pandecode_cs_trace_v15(struct pandecode_context *ctx, uint64_t trace,
uint32_t trace_size, uint64_t gpu_id);
/* Logging infrastructure */
static void
pandecode_make_indent(struct pandecode_context *ctx)
@ -275,4 +289,22 @@ void GENX(pandecode_depth_stencil)(struct pandecode_context *ctx,
#endif
#if PAN_ARCH >= 6
void GENX(pandecode_sample_locations)(struct pandecode_context *ctx,
uint64_t sample_locations);
void
GENX(pandecode_frame_shader_dcds)(struct pandecode_context *ctx,
uint64_t dcd_pointer, unsigned pre_frame_0,
unsigned pre_frame_1, unsigned post_frame,
unsigned job_type_param, uint64_t gpu_id);
#endif
#if PAN_ARCH >= 5
void GENX(pandecode_rts)(struct pandecode_context *ctx, uint64_t gpu_va,
uint32_t render_target_count);
void GENX(pandecode_zs_crc_ext)(struct pandecode_context *ctx, uint64_t gpu_va);
#endif
#endif /* __MMAP_TRACE_H__ */

View file

@ -423,6 +423,12 @@ pandecode_interpret_cs(struct pandecode_context *ctx, uint64_t queue_gpu_va,
case 13:
pandecode_interpret_cs_v13(ctx, queue_gpu_va, size, gpu_id, regs);
break;
case 14:
pandecode_interpret_cs_v14(ctx, queue_gpu_va, size, gpu_id, regs);
break;
case 15:
pandecode_interpret_cs_v15(ctx, queue_gpu_va, size, gpu_id, regs);
break;
default:
UNREACHABLE("Unsupported architecture");
}
@ -446,6 +452,12 @@ pandecode_cs_binary(struct pandecode_context *ctx, uint64_t bin_gpu_va,
case 13:
pandecode_cs_binary_v13(ctx, bin_gpu_va, size);
break;
case 14:
pandecode_cs_binary_v14(ctx, bin_gpu_va, size);
break;
case 15:
pandecode_cs_binary_v15(ctx, bin_gpu_va, size);
break;
default:
UNREACHABLE("Unsupported architecture");
}
@ -469,6 +481,12 @@ pandecode_cs_trace(struct pandecode_context *ctx, uint64_t trace_gpu_va,
case 13:
pandecode_cs_trace_v13(ctx, trace_gpu_va, size, gpu_id);
break;
case 14:
pandecode_cs_trace_v14(ctx, trace_gpu_va, size, gpu_id);
break;
case 15:
pandecode_cs_trace_v15(ctx, trace_gpu_va, size, gpu_id);
break;
default:
UNREACHABLE("Unsupported architecture");
}

View file

@ -1,5 +1,6 @@
/*
* Copyright (C) 2022-2023 Collabora, Ltd.
* Copyright (C) 2026 Arm Ltd.
* SPDX-License-Identifier: MIT
*/
@ -117,8 +118,7 @@ print_cs_instr(FILE *fp, const uint64_t *instr)
case MALI_CS_OPCODE_WAIT: {
cs_unpack(instr, CS_WAIT, I);
fprintf(fp, "WAIT%s #%x", I.progress_increment ? ".progress_inc" : "",
I.wait_mask);
fprintf(fp, "WAIT #%x", I.wait_mask);
break;
}
@ -130,15 +130,13 @@ print_cs_instr(FILE *fp, const uint64_t *instr)
* since we'll print them implicitly later.
*/
#if PAN_ARCH >= 12
fprintf(fp, "RUN_COMPUTE%s.%s.srt%d.spd%d.tsd%d.fau%d #%u, #%u",
I.progress_increment ? ".progress_inc" : "", axes[I.task_axis],
I.srt_select, I.spd_select, I.tsd_select, I.fau_select,
I.task_increment, I.ep_limit);
fprintf(fp, "RUN_COMPUTE.%s.srt%d.spd%d.tsd%d.fau%d #%u, #%u",
axes[I.task_axis], I.srt_select, I.spd_select, I.tsd_select,
I.fau_select, I.task_increment, I.ep_limit);
#else
fprintf(fp, "RUN_COMPUTE%s.%s.srt%d.spd%d.tsd%d.fau%d #%u",
I.progress_increment ? ".progress_inc" : "", axes[I.task_axis],
I.srt_select, I.spd_select, I.tsd_select, I.fau_select,
I.task_increment);
fprintf(fp, "RUN_COMPUTE.%s.srt%d.spd%d.tsd%d.fau%d #%u",
axes[I.task_axis], I.srt_select, I.spd_select, I.tsd_select,
I.fau_select, I.task_increment);
#endif
break;
}
@ -146,8 +144,7 @@ print_cs_instr(FILE *fp, const uint64_t *instr)
#if PAN_ARCH == 10
case MALI_CS_OPCODE_RUN_TILING: {
cs_unpack(instr, CS_RUN_TILING, I);
fprintf(fp, "RUN_TILING%s.srt%d.spd%d.tsd%d.fau%d",
I.progress_increment ? ".progress_inc" : "", I.srt_select,
fprintf(fp, "RUN_TILING.srt%d.spd%d.tsd%d.fau%d", I.srt_select,
I.spd_select, I.tsd_select, I.fau_select);
break;
}
@ -158,8 +155,7 @@ print_cs_instr(FILE *fp, const uint64_t *instr)
cs_unpack(instr, CS_RUN_IDVS, I);
fprintf(
fp,
"RUN_IDVS%s%s%s.varying_srt%d.varying_fau%d.varying_tsd%d.frag_srt%d.frag_tsd%d r%u, #%" PRIx64,
I.progress_increment ? ".progress_inc" : "",
"RUN_IDVS%s%s.varying_srt%d.varying_fau%d.varying_tsd%d.frag_srt%d.frag_tsd%d r%u, #%" PRIx64,
I.malloc_enable ? "" : ".no_malloc",
I.draw_id_register_enable ? ".draw_id_enable" : "",
I.varying_srt_select, I.varying_fau_select, I.varying_tsd_select,
@ -178,8 +174,7 @@ print_cs_instr(FILE *fp, const uint64_t *instr)
".INVALID",
};
fprintf(fp, "RUN_IDVS2%s%s%s%s r%u, #%" PRIx64,
I.progress_increment ? ".progress_inc" : "",
fprintf(fp, "RUN_IDVS2%s%s%s r%u, #%" PRIx64,
I.malloc_enable ? "" : ".no_malloc",
I.draw_id_register_enable ? ".draw_id_enable" : "",
vertex_shading_str[I.vertex_shading_mode], I.draw_id,
@ -318,31 +313,36 @@ print_cs_instr(FILE *fp, const uint64_t *instr)
case MALI_CS_OPCODE_SHARED_SB_INC: {
cs_unpack(instr, CS_SHARED_SB_INC, I);
const char *progress_increment_name[] = {
".no_increment",
".increment",
};
fprintf(fp, "SHARED_SB_INC%s%s #%u, #%u",
progress_increment_name[I.progress_increment],
defer_mode_str(I), I.sb_mask, I.shared_entry);
fprintf(fp, "SHARED_SB_INC%s #%u, #%u", defer_mode_str(I), I.sb_mask,
I.shared_entry);
break;
}
case MALI_CS_OPCODE_SHARED_SB_DEC: {
cs_unpack(instr, CS_SHARED_SB_DEC, I);
const char *progress_increment_name[] = {
".no_increment",
".increment",
};
fprintf(fp, "SHARED_SB_DEC%s #%u",
progress_increment_name[I.progress_increment], I.shared_entry);
fprintf(fp, "SHARED_SB_DEC #%u", I.shared_entry);
break;
}
#endif
#if PAN_ARCH >= 14
case MALI_CS_OPCODE_RUN_FRAGMENT2: {
static const char *tile_order[] = {
"zorder", "horizontal", "vertical", "unknown",
"unknown", "rev_horizontal", "rev_vertical", "unknown",
"unknown", "unknown", "unknown", "unknown",
"unknown", "unknown", "unknown", "unknown",
};
cs_unpack(instr, CS_RUN_FRAGMENT2, I);
fprintf(fp, "RUN_FRAGMENT2%s.tile_order=%s",
I.enable_tem ? ".tile_enable_map_enable" : "",
tile_order[I.tile_order]);
break;
}
#else
case MALI_CS_OPCODE_RUN_FRAGMENT: {
static const char *tile_order[] = {
"zorder", "horizontal", "vertical", "unknown",
@ -350,27 +350,25 @@ print_cs_instr(FILE *fp, const uint64_t *instr)
"unknown", "unknown", "unknown", "unknown",
"unknown", "unknown", "unknown", "unknown",
};
cs_unpack(instr, CS_RUN_FRAGMENT, I);
fprintf(fp, "RUN_FRAGMENT%s%s.tile_order=%s",
I.progress_increment ? ".progress_inc" : "",
fprintf(fp, "RUN_FRAGMENT%s.tile_order=%s",
I.enable_tem ? ".tile_enable_map_enable" : "",
tile_order[I.tile_order]);
break;
}
#endif
case MALI_CS_OPCODE_RUN_FULLSCREEN: {
cs_unpack(instr, CS_RUN_FULLSCREEN, I);
fprintf(fp, "RUN_FULLSCREEN%s r%u, #%" PRIx64,
I.progress_increment ? ".progress_inc" : "", I.dcd,
I.flags_override);
fprintf(fp, "RUN_FULLSCREEN r%u, #%" PRIx64, I.dcd, I.flags_override);
break;
}
case MALI_CS_OPCODE_FINISH_TILING: {
cs_unpack(instr, CS_FINISH_TILING, I);
fprintf(fp, "FINISH_TILING%s",
I.progress_increment ? ".progress_inc" : "");
fprintf(fp, "FINISH_TILING");
break;
}
@ -443,12 +441,6 @@ print_cs_instr(FILE *fp, const uint64_t *instr)
break;
}
case MALI_CS_OPCODE_PROGRESS_WAIT: {
cs_unpack(instr, CS_PROGRESS_WAIT, I);
fprintf(fp, "PROGRESS_WAIT d%u, #%u", I.source, I.queue);
break;
}
case MALI_CS_OPCODE_SET_EXCEPTION_HANDLER: {
cs_unpack(instr, CS_SET_EXCEPTION_HANDLER, I);
fprintf(fp, "SET_EXCEPTION_HANDLER d%u, r%u", I.address, I.length);
@ -547,29 +539,16 @@ print_cs_instr(FILE *fp, const uint64_t *instr)
break;
}
case MALI_CS_OPCODE_PROGRESS_STORE: {
cs_unpack(instr, CS_PROGRESS_STORE, I);
fprintf(fp, "PROGRESS_STORE d%u", I.source);
break;
}
case MALI_CS_OPCODE_PROGRESS_LOAD: {
cs_unpack(instr, CS_PROGRESS_LOAD, I);
fprintf(fp, "PROGRESS_LOAD d%u", I.destination);
break;
}
case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT: {
cs_unpack(instr, CS_RUN_COMPUTE_INDIRECT, I);
#if PAN_ARCH >= 12
fprintf(fp, "RUN_COMPUTE_INDIRECT%s.srt%d.spd%d.tsd%d.fau%d #%u, #%u",
I.progress_increment ? ".progress_inc" : "", I.srt_select,
I.spd_select, I.tsd_select, I.fau_select, I.workgroups_per_task,
I.ep_limit);
fprintf(fp, "RUN_COMPUTE_INDIRECT.srt%d.spd%d.tsd%d.fau%d #%u, #%u",
I.srt_select, I.spd_select, I.tsd_select, I.fau_select,
I.workgroups_per_task, I.ep_limit);
#else
fprintf(fp, "RUN_COMPUTE_INDIRECT%s.srt%d.spd%d.tsd%d.fau%d #%u",
I.progress_increment ? ".progress_inc" : "", I.srt_select,
I.spd_select, I.tsd_select, I.fau_select, I.workgroups_per_task);
fprintf(fp, "RUN_COMPUTE_INDIRECT.srt%d.spd%d.tsd%d.fau%d #%u",
I.srt_select, I.spd_select, I.tsd_select, I.fau_select,
I.workgroups_per_task);
#endif
break;
@ -672,8 +651,19 @@ pandecode_run_compute(struct pandecode_context *ctx, FILE *fp,
if (fau)
GENX(pandecode_fau)(ctx, fau & BITFIELD64_MASK(48), fau >> 56, "FAU");
GENX(pandecode_shader)
(ctx, cs_get_u64(qctx, reg_spd), "Shader", qctx->gpu_id);
uint64_t addr = cs_get_u64(qctx, reg_spd);
#if PAN_ARCH >= 15
const struct mali_shader_program_pointer_packed spp_packed = {
.opaque[0] = addr & 0xFFFFFFFF,
.opaque[1] = (addr >> 32) & 0xFFFFFFFF,
};
pan_unpack(&spp_packed, SHADER_PROGRAM_POINTER, spp)
;
DUMP_UNPACKED(ctx, SHADER_PROGRAM_POINTER, spp,
"Shader Program Pointer (%" PRIx64 "):\n", addr);
addr = spp.pointer;
#endif
GENX(pandecode_shader)(ctx, addr, "Shader", qctx->gpu_id);
DUMP_ADDR(ctx, LOCAL_STORAGE, cs_get_u64(qctx, reg_tsd),
"Local Storage @%" PRIx64 ":\n", cs_get_u64(qctx, reg_tsd));
@ -714,8 +704,19 @@ pandecode_run_compute_indirect(struct pandecode_context *ctx, FILE *fp,
if (fau)
GENX(pandecode_fau)(ctx, fau & BITFIELD64_MASK(48), fau >> 56, "FAU");
GENX(pandecode_shader)
(ctx, cs_get_u64(qctx, reg_spd), "Shader", qctx->gpu_id);
uint64_t addr = cs_get_u64(qctx, reg_spd);
#if PAN_ARCH >= 15
const struct mali_shader_program_pointer_packed spp_packed = {
.opaque[0] = addr & 0xFFFFFFFF,
.opaque[1] = (addr >> 32) & 0xFFFFFFFF,
};
pan_unpack(&spp_packed, SHADER_PROGRAM_POINTER, spp)
;
DUMP_UNPACKED(ctx, SHADER_PROGRAM_POINTER, spp,
"Shader Program Pointer (%" PRIx64 "):\n", addr);
addr = spp.pointer;
#endif
GENX(pandecode_shader)(ctx, addr, "Shader", qctx->gpu_id);
DUMP_ADDR(ctx, LOCAL_STORAGE, cs_get_u64(qctx, reg_tsd),
"Local Storage @%" PRIx64 ":\n", cs_get_u64(qctx, reg_tsd));
@ -1097,6 +1098,101 @@ pandecode_run_idvs(struct pandecode_context *ctx, FILE *fp,
}
#endif
#if PAN_ARCH >= 14
static void
pandecode_run_fragment2(struct pandecode_context *ctx, FILE *fp,
struct queue_ctx *qctx, struct MALI_CS_RUN_FRAGMENT2 *I)
{
if (qctx->in_exception_handler)
return;
ctx->indent++;
pandecode_log(ctx, "Iter trace ID0: %" PRIu32 "\n",
cs_get_u32(qctx, MALI_FRAGMENT_SR_ITER_TRACE_ID0));
pandecode_log(ctx, "Iter trace ID1: %" PRIu32 "\n",
cs_get_u32(qctx, MALI_FRAGMENT_SR_ITER_TRACE_ID1));
pandecode_log(ctx, "TEM pointer: %" PRIx64 "\n",
cs_get_u64(qctx, MALI_FRAGMENT_SR_TEM_POINTER));
pandecode_log(ctx, "TEM row stride: %" PRIu32 "\n",
cs_get_u32(qctx, MALI_FRAGMENT_SR_TEM_ROW_STRIDE));
for (unsigned i = 0; i < 11; ++i) {
const unsigned reg = MALI_FRAGMENT_SR_IRD_BUFFER_POINTER_0 + (i * 2);
pandecode_log(ctx, "IRD buffer pointer %u: %" PRIx64 "\n", i,
cs_get_u64(qctx, reg));
}
DUMP_CL(ctx, FRAGMENT_FLAGS_3, &qctx->regs[MALI_FRAGMENT_SR_FLAGS_3],
"Flags 3:\n");
DUMP_CL(ctx, FRAGMENT_BOUNDING_BOX,
&qctx->regs[MALI_FRAGMENT_SR_BOUNDING_BOX], "Bounding Box:\n");
DUMP_CL(ctx, FRAME_SIZE, &qctx->regs[MALI_FRAGMENT_SR_FRAME_SIZE],
"Frame size:\n");
pan_unpack((const struct mali_fragment_flags_0_packed *)&qctx
->regs[MALI_FRAGMENT_SR_FLAGS_0],
FRAGMENT_FLAGS_0, flags0_unpacked)
;
DUMP_UNPACKED(ctx, FRAGMENT_FLAGS_0, flags0_unpacked, "Flags 0:\n");
pan_unpack((const struct mali_fragment_flags_1_packed *)&qctx
->regs[MALI_FRAGMENT_SR_FLAGS_1],
FRAGMENT_FLAGS_1, flags1_unpacked)
;
DUMP_UNPACKED(ctx, FRAGMENT_FLAGS_1, flags1_unpacked, "Flags 1:\n");
DUMP_CL(ctx, FRAGMENT_FLAGS_2, &qctx->regs[MALI_FRAGMENT_SR_FLAGS_2],
"Flags 2:\n");
pandecode_log(ctx, "Z clear: %f\n",
uif(cs_get_u32(qctx, MALI_FRAGMENT_SR_Z_CLEAR)));
const uint64_t tiler_pointer =
cs_get_u64(qctx, MALI_FRAGMENT_SR_TILER_DESCRIPTOR_POINTER);
pandecode_log(ctx, "Tiler descriptor pointer: 0x%" PRIx64 "\n",
tiler_pointer);
const uint64_t rtd_pointer = cs_get_u64(qctx, MALI_FRAGMENT_SR_RTD_POINTER);
pandecode_log(ctx, "RTD pointer: 0x%" PRIx64 "\n", rtd_pointer);
const uint64_t dbd_pointer = cs_get_u64(qctx, MALI_FRAGMENT_SR_DBD_POINTER);
pandecode_log(ctx, "DBD pointer: 0x%" PRIx64 "\n", dbd_pointer);
pandecode_log(ctx, "Frame argument: %" PRIx64 "\n",
cs_get_u64(qctx, MALI_FRAGMENT_SR_FRAME_ARG));
const uint64_t sample_locations =
cs_get_u64(qctx, MALI_FRAGMENT_SR_SAMPLE_POSITION_ARRAY_POINTER);
pandecode_log(ctx, "Sample locations: 0x%" PRIx64 "\n", sample_locations);
const uint64_t dcd_pointer =
cs_get_u64(qctx, MALI_FRAGMENT_SR_FRAME_SHADER_DCD_POINTER);
pandecode_log(ctx, "Frame shader DCD pointer: 0x%" PRIx64 "\n", dcd_pointer);
DUMP_CL(ctx, VRS_IMAGE, &qctx->regs[MALI_FRAGMENT_SR_VRS_IMAGE],
"VRS image:\n");
GENX(pandecode_sample_locations)
(ctx, sample_locations);
const unsigned job_type_param = 0;
GENX(pandecode_frame_shader_dcds)
(ctx, dcd_pointer, flags0_unpacked.pre_frame_0, flags0_unpacked.pre_frame_1,
flags0_unpacked.post_frame, job_type_param, qctx->gpu_id);
if (tiler_pointer)
GENX(pandecode_tiler)(ctx, tiler_pointer);
if (dbd_pointer)
GENX(pandecode_zs_crc_ext)(ctx, dbd_pointer);
if (rtd_pointer)
GENX(pandecode_rts)
(ctx, rtd_pointer, flags1_unpacked.render_target_count);
ctx->indent--;
}
#else
static void
pandecode_run_fragment(struct pandecode_context *ctx, FILE *fp,
struct queue_ctx *qctx, struct MALI_CS_RUN_FRAGMENT *I)
@ -1115,6 +1211,7 @@ pandecode_run_fragment(struct pandecode_context *ctx, FILE *fp,
ctx->indent--;
}
#endif /* PAN_ARCH >= 14 */
static void
pandecode_run_fullscreen(struct pandecode_context *ctx, FILE *fp,
@ -1261,11 +1358,19 @@ interpret_cs_instr(struct pandecode_context *ctx, struct queue_ctx *qctx)
}
#endif
#if PAN_ARCH >= 14
case MALI_CS_OPCODE_RUN_FRAGMENT2: {
cs_unpack(bytes, CS_RUN_FRAGMENT2, I);
pandecode_run_fragment2(ctx, fp, qctx, &I);
break;
}
#else
case MALI_CS_OPCODE_RUN_FRAGMENT: {
cs_unpack(bytes, CS_RUN_FRAGMENT, I);
pandecode_run_fragment(ctx, fp, qctx, &I);
break;
}
#endif
case MALI_CS_OPCODE_RUN_FULLSCREEN: {
cs_unpack(bytes, CS_RUN_FULLSCREEN, I);
@ -2192,18 +2297,6 @@ collect_indirect_branch_targets_recurse(struct cs_code_cfg *cfg,
break;
}
case MALI_CS_OPCODE_PROGRESS_LOAD: {
cs_unpack(instr, CS_PROGRESS_LOAD, I);
for (unsigned i = 0; i < 16; i++) {
if (BITSET_TEST(track_map, I.destination) ||
BITSET_TEST(track_map, I.destination + 1)) {
ibranch->has_unknown_targets = true;
return;
}
}
break;
}
default:
break;
}
@ -2430,7 +2523,12 @@ print_cs_binary(struct pandecode_context *ctx, uint64_t bin,
#else
case MALI_CS_OPCODE_RUN_IDVS:
#endif
#if PAN_ARCH >= 14
case MALI_CS_OPCODE_RUN_FRAGMENT2:
#else
case MALI_CS_OPCODE_RUN_FRAGMENT:
#endif
case MALI_CS_OPCODE_RUN_FULLSCREEN:
case MALI_CS_OPCODE_RUN_COMPUTE:
case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT:
@ -2539,6 +2637,19 @@ GENX(pandecode_cs_trace)(struct pandecode_context *ctx, uint64_t trace,
}
#endif
#if PAN_ARCH >= 14
case MALI_CS_OPCODE_RUN_FRAGMENT2: {
struct cs_run_fragment2_trace *frag_trace = trace_data;
assert(trace_size >= sizeof(*frag_trace));
cs_unpack(instr, CS_RUN_FRAGMENT2, I);
memcpy(&regs[0], frag_trace->sr, sizeof(frag_trace->sr));
pandecode_run_fragment2(ctx, ctx->dump_stream, &qctx, &I);
trace_data = frag_trace + 1;
trace_size -= sizeof(*frag_trace);
break;
}
#else
case MALI_CS_OPCODE_RUN_FRAGMENT: {
struct cs_run_fragment_trace *frag_trace = trace_data;
@ -2550,6 +2661,7 @@ GENX(pandecode_cs_trace)(struct pandecode_context *ctx, uint64_t trace,
trace_size -= sizeof(*frag_trace);
break;
}
#endif
case MALI_CS_OPCODE_RUN_FULLSCREEN: {
struct cs_run_fullscreen_trace *fs_trace = trace_data;

View file

@ -61,6 +61,12 @@
#elif (PAN_ARCH == 13)
#define GENX(X) X##_v13
#include "genxml/v13_pack.h"
#elif (PAN_ARCH == 14)
#define GENX(X) X##_v14
#include "genxml/v14_pack.h"
#elif (PAN_ARCH == 15)
#define GENX(X) X##_v15
#include "genxml/v15_pack.h"
#else
#error "Need to add suffixing macro for this architecture"
#endif
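For instance, a translation unit built with PAN_ARCH defined as 15 resolves GENX(pandecode_cs_trace) to pandecode_cs_trace_v15, matching the per-arch prototypes declared earlier:

   /* With PAN_ARCH == 15: */
   GENX(pandecode_cs_trace)(ctx, trace, trace_size, gpu_id);
   /* compiles as pandecode_cs_trace_v15(ctx, trace, trace_size, gpu_id); */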

View file

@ -83,23 +83,34 @@ def parse_modifier(modifier):
if modifier is None:
return None
for mod in MODIFIERS:
if modifier[0:len(mod)] == mod:
if mod == "log2":
assert(len(mod) == len(modifier))
return [mod]
ret = []
split_modifiers = modifier.split()
if modifier[len(mod)] == '(' and modifier[-1] == ')':
ret = [mod, int(modifier[(len(mod) + 1):-1])]
if ret[0] == 'align':
align = ret[1]
# Make sure the alignment is a power of 2
assert(align > 0 and not(align & (align - 1)));
for mod in split_modifiers:
valid = False
for valid_mod in MODIFIERS:
if mod[0:len(valid_mod)] == valid_mod:
if valid_mod == "log2":
assert(len(valid_mod) == len(modifier))
# Add a number to simplify parsing
ret.extend([valid_mod, 0])
valid = True
break
return ret
if mod[len(valid_mod)] == '(' and mod[-1] == ')':
mod_arg = [valid_mod, int(mod[(len(valid_mod) + 1):-1])]
if mod_arg[0] == 'align':
align = mod_arg[1]
# Make sure the alignment is a power of 2
assert(align > 0 and not(align & (align - 1)));
print("Invalid modifier")
assert(False)
ret.extend(mod_arg)
valid = True
break
assert valid, f"Invalid modifier: {modifier}"
return ret
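As a worked example, the reworked parser maps the whitespace-separated string "minus(1) shr(2)" to the flat list ["minus", 1, "shr", 2], and a bare "log2" to ["log2", 0] thanks to the placeholder number appended above.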
class Aggregate(object):
def __init__(self, parser, name, attrs):
@ -169,7 +180,7 @@ class Field(object):
if self.type in self.parser.enums and self.default is not None:
self.default = safe_name('{}_{}_{}'.format(global_prefix, self.type, self.default)).upper()
self.modifier = parse_modifier(attrs.get("modifier"))
def emit_template_struct(self, dim):
if self.type == 'address':
@ -291,14 +302,22 @@ class Group(object):
if field.modifier is None:
continue
if field.modifier[0] == "shr":
shift = field.modifier[1]
mask = hex((1 << shift) - 1)
print(" assert(((__unpacked)->{} & {}) == 0); \\".format(field.name, mask))
elif field.modifier[0] == "minus":
print(" assert((__unpacked)->{} >= {}); \\".format(field.name, field.modifier[1]))
elif field.modifier[0] == "log2":
print(" assert(IS_POT_NONZERO((__unpacked)->{})); \\".format(field.name))
value = "(__unpacked)->{}".format(field.name)
for mod, mod_val in zip (field.modifier[::2], field.modifier[1::2]):
if mod == "shr":
mask = hex((1 << mod_val) - 1)
print(" assert(({} & {}) == 0); \\".format(value, mask))
value = "({} >> {})".format(value, mod_val)
elif mod == "minus":
print(" assert({} >= {}); \\".format(value, mod_val))
value = "({} - {})".format(value, mod_val)
elif mod == "align":
mask = hex(mod_val - 1)
print(' assert(!({} & {})); \\'.format(value, mask))
value = "(ALIGN_POT({}, {}))".format(value, mod_val)
elif mod == "log2":
print(" assert(IS_POT_NONZERO({})); \\".format(value))
value = "(util_logbase2({}))".format(value)
for index in range(self.length // 4):
# Handle MBZ words
@ -324,14 +343,15 @@ class Group(object):
value = "(__unpacked)->{}".format(contributor.path)
if field.modifier is not None:
if field.modifier[0] == "shr":
value = "{} >> {}".format(value, field.modifier[1])
elif field.modifier[0] == "minus":
value = "{} - {}".format(value, field.modifier[1])
elif field.modifier[0] == "align":
value = "ALIGN_POT({}, {})".format(value, field.modifier[1])
elif field.modifier[0] == "log2":
value = "util_logbase2({})".format(value)
for mod, mod_val in zip(field.modifier[::2], field.modifier[1::2]):
if mod == "shr":
value = "({} >> {})".format(value, mod_val)
elif mod == "minus":
value = "({} - {})".format(value, mod_val)
elif mod == "align":
value = "(ALIGN_POT({}, {}))".format(value, mod_val)
elif mod == "log2":
value = "(util_logbase2({}))".format(value)
if field.type in ["uint", "hex", "uint/float", "address", "Pixel Format", "Component Swizzle"]:
s = "util_bitpack_uint(%s, %d, %d)" % \
@ -435,25 +455,24 @@ class Group(object):
else:
s = "/* unhandled field %s, type %s */\n" % (field.name, field.type)
suffix = ""
prefix = ""
if field.modifier:
if field.modifier[0] == "minus":
suffix = " + {}".format(field.modifier[1])
elif field.modifier[0] == "shr":
suffix = " << {}".format(field.modifier[1])
if field.modifier[0] == "log2":
prefix = "1U << "
print(' {}({}); \\'.format(convert, ', '.join(args)))
if len(prefix) != 0 or len(suffix) != 0:
print(' (__unpacked)->{} = {}(__unpacked)->{}{}; \\'.format(fieldref.path, prefix, fieldref.path, suffix))
value = "(__unpacked)->{}".format(fieldref.path)
if field.modifier is not None:
# Need to reverse ([::-1]) modifier order when unpacking
for mod, mod_val in list(zip(field.modifier[::2], field.modifier[1::2]))[::-1]:
if mod == "shr":
value = "({} << {})".format(value, mod_val)
elif mod == "minus":
value = "({} + {})".format(value, mod_val)
elif mod == "align":
mask = hex(mod_val - 1)
print(' assert(!({} & {})); \\'.format(value, mask))
elif mod == "log2":
value = "(1U << {})".format(value)
if field.modifier and field.modifier[0] == "align":
mask = hex(field.modifier[1] - 1)
print(' assert(!((__unpacked)->{} & {})); \\'.format(fieldref.path, mask))
print(' (__unpacked)->{} = {}; \\'.format(fieldref.path, value))
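To make the chaining concrete: for a hypothetical field with modifier "minus(1) shr(2)", the pack path applies the list in order and emits ((v - 1) >> 2), while the unpack path above walks the list in reverse and rebuilds ((raw << 2) + 1); v = 9 packs to 2 and unpacks back to 9.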
def emit_print_function(self):
for field in self.fields:

View file

@ -3,7 +3,7 @@
# SPDX-License-Identifier: MIT
pan_packers = []
foreach packer : ['common', 'v4', 'v5', 'v6', 'v7', 'v9', 'v10', 'v12', 'v13']
foreach packer : ['common', 'v4', 'v5', 'v6', 'v7', 'v9', 'v10', 'v12', 'v13', 'v14', 'v15']
pan_packers += custom_target(
packer + '_pack.h',
input : ['gen_pack.py', packer + '.xml'],
@ -20,7 +20,7 @@ idep_pan_packers = declare_dependency(
libpanfrost_decode_per_arch = []
foreach ver : ['4', '5', '6', '7', '9', '10', '12', '13']
foreach ver : ['4', '5', '6', '7', '9', '10', '12', '13', '14', '15']
libpanfrost_decode_per_arch += static_library(
'pandecode-arch-v' + ver,
['decode.c', 'decode_jm.c', 'decode_csf.c', pan_packers],

View file

@ -1,5 +1,6 @@
<!--
Copyright (C) 2020 Collabora Ltd.
Copyright (C) 2026 Arm Ltd.
SPDX-License-Identifier: MIT
-->
@ -84,6 +85,7 @@
<enum name="Address Mode">
<value name="Flat" value="0"/>
<value name="Packed" value="1"/>
<value name="Out of bounds" value="8"/>
</enum>
<enum name="Format">
@ -132,6 +134,7 @@
<value name="A2 YUV10" value="41"/>
<value name="YUYAAYVYAA" value="42"/>
<!--- TODO: revisit YUV -->
<value name="Y10U10V10_420" value="43"/>
<value name="YUYV10" value="44"/>
<value name="VYUY10" value="45"/>
<value name="Y10 UV10 422" value="46"/>
@ -1163,6 +1166,13 @@
<enum name="Clump Ordering">
<value name="Tiled U-Interleaved" value="1"/>
<value name="Linear" value="2"/>
<!-- Block-linear interleaved clump orderings are not available on
all v10 architectures. -->
<value name="Block-linear interleaved 16x16" value="3"/>
<value name="Block-linear interleaved 8x16" value="4"/>
<value name="Block-linear interleaved 8x8" value="5"/>
<value name="Interleaved 64k" value="8"/>
</enum>

View file

@ -1,5 +1,6 @@
<!--
Copyright (C) 2025 Collabora Ltd.
Copyright (C) 2026 Arm Ltd.
SPDX-License-Identifier: MIT
-->
@ -84,6 +85,7 @@
<enum name="Address Mode">
<value name="Flat" value="0"/>
<value name="Packed" value="1"/>
<value name="Out of bounds" value="8"/>
</enum>
<enum name="Format">
@ -132,6 +134,7 @@
<value name="A2 YUV10" value="41"/>
<value name="YUYAAYVYAA" value="42"/>
<!--- TODO: revisit YUV -->
<value name="Y10U10V10_420" value="43"/>
<value name="YUYV10" value="44"/>
<value name="VYUY10" value="45"/>
<value name="Y10 UV10 422" value="46"/>
@ -1426,6 +1429,9 @@
<enum name="Clump Ordering">
<value name="Tiled U-Interleaved" value="1"/>
<value name="Linear" value="2"/>
<value name="Block-linear interleaved 16x16" value="3"/>
<value name="Block-linear interleaved 8x16" value="4"/>
<value name="Block-linear interleaved 8x8" value="5"/>
<value name="Interleaved 64k" value="8"/>
</enum>

View file

@ -1,5 +1,6 @@
<!--
Copyright (C) 2025 Collabora Ltd.
Copyright (C) 2026 Arm Ltd.
SPDX-License-Identifier: MIT
-->
@ -84,6 +85,7 @@
<enum name="Address Mode">
<value name="Flat" value="0"/>
<value name="Packed" value="1"/>
<value name="Out of bounds" value="8"/>
</enum>
<enum name="Format">
@ -132,6 +134,7 @@
<value name="A2 YUV10" value="41"/>
<value name="YUYAAYVYAA" value="42"/>
<!--- TODO: revisit YUV -->
<value name="Y10U10V10_420" value="43"/>
<value name="YUYV10" value="44"/>
<value name="VYUY10" value="45"/>
<value name="Y10 UV10 422" value="46"/>
@ -1728,6 +1731,9 @@
<enum name="Clump Ordering">
<value name="Tiled U-Interleaved" value="1"/>
<value name="Linear" value="2"/>
<value name="Block-linear interleaved 16x16" value="3"/>
<value name="Block-linear interleaved 8x16" value="4"/>
<value name="Block-linear interleaved 8x8" value="5"/>
<value name="Interleaved 64k" value="8"/>
</enum>

src/panfrost/genxml/v14.xml (new file, 2753 lines; diff suppressed because it is too large)

src/panfrost/genxml/v15.xml (new file, 2759 lines; diff suppressed because it is too large)

View file

@ -1,5 +1,6 @@
<!--
Copyright (C) 2020 Collabora Ltd.
Copyright (C) 2026 Arm Ltd.
SPDX-License-Identifier: MIT
-->
@ -103,6 +104,7 @@
<enum name="Address Mode">
<value name="Flat" value="0"/>
<value name="Packed" value="1"/>
<value name="Out of bounds" value="8"/>
</enum>
<enum name="Format">

View file

@ -206,6 +206,9 @@ struct pan_kmod_dev_props {
/* Maximum number of threads per workgroup. */
uint32_t max_threads_per_wg;
/* Granularity of number of active threads. */
uint32_t num_threads_active_granularity;
/* Number of registers per core. Can be used to determine the maximum
* number of threads that can be allocated for a specific shader based on
* the number of registers assigned to this shader.

View file

@ -133,13 +133,17 @@ panthor_dev_query_thread_props(struct panthor_kmod_dev *panthor_dev)
props->max_tasks_per_core = panthor_dev->props.gpu.thread_features >> 24;
props->num_registers_per_core =
panthor_dev->props.gpu.thread_features & 0x3fffff;
props->num_threads_active_granularity =
panthor_dev->props.gpu.thread_num_active_granularity;
/* We assume that all thread properties are populated. If we ever have a GPU
* that has one of the THREAD_xxx registers set to zero, we can always add a
* quirk here.
*/
assert(props->max_threads_per_wg && props->max_threads_per_core &&
props->max_tasks_per_core && props->num_registers_per_core);
assert(
(props->max_threads_per_wg || props->num_threads_active_granularity) &&
props->max_threads_per_core && props->max_tasks_per_core &&
props->num_registers_per_core);
/* There is no THREAD_TLS_ALLOC register on v10+, and the maximum number
* of TLS instance per core is assumed to be the maximum number of threads
@ -153,8 +157,12 @@ panthor_dev_query_props(struct panthor_kmod_dev *panthor_dev)
{
struct pan_kmod_dev_props *props = &panthor_dev->base.props;
bool is_gpu_wide = panthor_dev->props.gpu.gpu_id == 0;
assert(!is_gpu_wide || panthor_dev->props.gpu.gpu_wide_id);
*props = (struct pan_kmod_dev_props){
.gpu_id = panthor_dev->props.gpu.gpu_id,
.gpu_id = is_gpu_wide ? panthor_dev->props.gpu.gpu_wide_id
: panthor_dev->props.gpu.gpu_id,
.gpu_variant = panthor_dev->props.gpu.core_features & 0xff,
.shader_present = panthor_dev->props.gpu.shader_present,
.tiler_features = panthor_dev->props.gpu.tiler_features,

View file

@ -4,7 +4,7 @@
subdir('kmod')
pixel_format_versions = ['5', '6', '7', '9', '10', '12', '13']
pixel_format_versions = ['5', '6', '7', '9', '10', '12', '13', '14', '15']
libpanfrost_pixel_format = []
deps_for_libpanfrost = [dep_libdrm, idep_pan_packers, idep_mesautil, libpanfrost_model_dep]
@ -22,7 +22,7 @@ endforeach
libpanfrost_per_arch = []
foreach ver : ['4', '5', '6', '7', '9', '10', '12', '13']
foreach ver : ['4', '5', '6', '7', '9', '10', '12', '13', '14', '15']
libpanfrost_per_arch += static_library(
'pan-arch-v' + ver,
[

View file

@ -3,6 +3,7 @@
* Copyright (C) 2014 Broadcom
* Copyright (C) 2018-2019 Alyssa Rosenzweig
* Copyright (C) 2019-2020 Collabora, Ltd.
* Copyright (C) 2026 Arm Ltd.
* SPDX-License-Identifier: MIT
*/
@ -711,6 +712,32 @@ pan_afbc_compression_mode(enum pan_afbc_mode mode)
case PAN_AFBC_MODE_R16G16B16A16:
return MALI_AFBC_COMPRESSION_MODE_R16G16B16A16;
#endif
#if PAN_ARCH >= 14
case PAN_AFBC_MODE_YUV420_6C8:
return MALI_AFBC_COMPRESSION_MODE_Y8U8V8_420;
case PAN_AFBC_MODE_YUV420_2C8:
return MALI_AFBC_COMPRESSION_MODE_R8G8;
case PAN_AFBC_MODE_YUV420_1C8:
return MALI_AFBC_COMPRESSION_MODE_R8;
case PAN_AFBC_MODE_YUV420_6C10:
return MALI_AFBC_COMPRESSION_MODE_Y10U10V10_420;
case PAN_AFBC_MODE_YUV420_2C10:
return MALI_AFBC_COMPRESSION_MODE_R10G10;
case PAN_AFBC_MODE_YUV420_1C10:
return MALI_AFBC_COMPRESSION_MODE_R10;
case PAN_AFBC_MODE_YUV422_4C8:
return MALI_AFBC_COMPRESSION_MODE_Y8U8Y8V8_422;
case PAN_AFBC_MODE_YUV422_2C8:
return MALI_AFBC_COMPRESSION_MODE_R8G8;
case PAN_AFBC_MODE_YUV422_1C8:
return MALI_AFBC_COMPRESSION_MODE_R8;
case PAN_AFBC_MODE_YUV422_4C10:
return MALI_AFBC_COMPRESSION_MODE_Y10U10Y10V10_422;
case PAN_AFBC_MODE_YUV422_2C10:
return MALI_AFBC_COMPRESSION_MODE_R10G10;
case PAN_AFBC_MODE_YUV422_1C10:
return MALI_AFBC_COMPRESSION_MODE_R10;
#else
case PAN_AFBC_MODE_YUV420_6C8:
return MALI_AFBC_COMPRESSION_MODE_YUV420_6C8;
case PAN_AFBC_MODE_YUV420_2C8:
@ -735,6 +762,7 @@ pan_afbc_compression_mode(enum pan_afbc_mode mode)
return MALI_AFBC_COMPRESSION_MODE_YUV422_2C10;
case PAN_AFBC_MODE_YUV422_1C10:
return MALI_AFBC_COMPRESSION_MODE_YUV422_1C10;
#endif /* PAN_ARCH >= 14 */
#if PAN_ARCH == 9
case PAN_AFBC_MODE_R16:
case PAN_AFBC_MODE_R16G16:

View file

@ -1,5 +1,6 @@
/*
* Copyright (C) 2023 Collabora, Ltd.
* Copyright (C) 2026 Arm Ltd.
* SPDX-License-Identifier: MIT
*/
@ -347,6 +348,25 @@ pan_afrc_format(struct pan_afrc_format_info info, uint64_t modifier,
return (scan ? MALI_AFRC_FORMAT_R10G10B10A10_SCAN
: MALI_AFRC_FORMAT_R10G10B10A10_ROT);
#if PAN_ARCH >= 14
case PAN_AFRC_ICHANGE_FORMAT_YUV444:
case PAN_AFRC_ICHANGE_FORMAT_YUV422:
case PAN_AFRC_ICHANGE_FORMAT_YUV420:
if (info.bpc == 8) {
if (plane == 0 || info.num_planes == 3)
return (scan ? MALI_AFRC_FORMAT_R8_SCAN : MALI_AFRC_FORMAT_R8_ROT);
return (scan ? MALI_AFRC_FORMAT_R8G8_SCAN : MALI_AFRC_FORMAT_R8G8_ROT);
}
if (plane == 0 || info.num_planes == 3)
return (scan ? MALI_AFRC_FORMAT_R10_SCAN : MALI_AFRC_FORMAT_R10_ROT);
assert(info.ichange_fmt == PAN_AFRC_ICHANGE_FORMAT_YUV422 ||
info.ichange_fmt == PAN_AFRC_ICHANGE_FORMAT_YUV420);
return (scan ? MALI_AFRC_FORMAT_R10G10_SCAN
: MALI_AFRC_FORMAT_R10G10_ROT);
#else
case PAN_AFRC_ICHANGE_FORMAT_YUV444:
if (info.bpc == 8) {
if (plane == 0 || info.num_planes == 3)
@ -394,6 +414,7 @@ pan_afrc_format(struct pan_afrc_format_info info, uint64_t modifier,
return (scan ? MALI_AFRC_FORMAT_R10G10_420_SCAN
: MALI_AFRC_FORMAT_R10G10_420_ROT);
#endif /* PAN_ARCH >= 14 */
default:
return MALI_AFRC_FORMAT_INVALID;

View file

@ -1,5 +1,6 @@
/*
* Copyright (C) 2021 Collabora, Ltd.
* Copyright (C) 2026 Arm Ltd.
* SPDX-License-Identifier: MIT
*/
@ -11,6 +12,7 @@
#include "pan_afrc.h"
#include "pan_desc.h"
#include "pan_encoder.h"
#include "pan_fb.h"
#include "pan_props.h"
#include "pan_texture.h"
#include "pan_trace.h"
@ -1172,11 +1174,156 @@ check_fb_attachments(const struct pan_fb_info *fb)
#endif
}
#if PAN_ARCH >= 14
unsigned
GENX(pan_emit_fbd)(const struct pan_fb_info *fb, unsigned layer_idx,
const struct pan_tls_info *tls,
const struct pan_tiler_context *tiler_ctx, void *out)
const struct pan_tiler_context *tiler_ctx,
const struct pan_ptr framebuffer)
{
void *out = framebuffer.cpu;
PAN_TRACE_FUNC(PAN_TRACE_LIB_DESC);
check_fb_attachments(fb);
const int crc_rt = GENX(pan_select_crc_rt)(fb, fb->tile_size);
const bool has_zs_crc_ext = (fb->zs.view.zs || fb->zs.view.s || crc_rt >= 0);
const struct pan_clean_tile clean_tile = pan_get_clean_tile_info(fb);
/* Emit to memory the state that might change per layer. The static
* state is emitted directly to CSF registers by
* cs_emit_static_fragment_state().
*/
struct pan_fbd_layer fbd_data = {0};
fbd_data.tiler = tiler_ctx->valhall.desc;
/* internal_layer_index in flags0 is used to select the right
* primitive list in the tiler context, and frame_arg is the value
* that's passed to the fragment shader through r62-r63, which we use
* to pass gl_Layer. Since the layer_idx only takes 8 bits, we might
* use the extra 56 bits we have in frame_argument to pass other
* information to the fragment shader at some point.
*/
assert(layer_idx >= tiler_ctx->valhall.layer_offset);
fbd_data.frame_argument = layer_idx;
pan_pack(&fbd_data.flags0, FRAGMENT_FLAGS_0, cfg) {
cfg.pre_frame_0 =
pan_fix_frame_shader_mode(fb->bifrost.pre_post.modes[0],
pan_clean_tile_write_any_set(clean_tile));
cfg.pre_frame_1 =
pan_fix_frame_shader_mode(fb->bifrost.pre_post.modes[1],
pan_clean_tile_write_any_set(clean_tile));
cfg.post_frame = fb->bifrost.pre_post.modes[2];
const unsigned zs_bytes_per_pixel = pan_zsbuf_bytes_per_pixel(fb);
/* We can interleave HSR if we have space for two ZS tiles in
* the tile buffer. */
const unsigned max_zs_tile_size_interleave =
fb->z_tile_buf_budget >> util_logbase2_ceil(zs_bytes_per_pixel);
const bool hsr_can_interleave =
fb->tile_size <= max_zs_tile_size_interleave;
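/* Worked example (hypothetical numbers): a 32 KiB ZS tile-buffer budget
 * with 8 B/px of ZS data gives 32768 >> 3 = 4096 pixels, so interleaving
 * is possible for tiles of up to 4096 pixels. */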
/* Enabling prepass without interleave is generally not good for
* performance, so disable HSR in that case. */
cfg.hsr_prepass_enable = fb->allow_hsr_prepass && hsr_can_interleave;
cfg.hsr_prepass_interleaving_enable = hsr_can_interleave;
cfg.hsr_prepass_filter_enable = true;
cfg.hsr_hierarchical_optimizations_enable = true;
cfg.internal_layer_index = layer_idx - tiler_ctx->valhall.layer_offset;
}
fbd_data.dcd_pointer = fb->bifrost.pre_post.dcds.gpu;
pan_pack(&fbd_data.flags2, FRAGMENT_FLAGS_2, cfg) {
cfg.s_clear = fb->zs.clear_value.stencil;
cfg.s_write_enable = (fb->zs.view.s && !fb->zs.discard.s);
/* Default to 24-bit depth if there's no surface. */
cfg.z_internal_format =
fb->zs.view.zs ? pan_get_z_internal_format(fb->zs.view.zs->format)
: MALI_Z_INTERNAL_FORMAT_D24;
cfg.z_write_enable = (fb->zs.view.zs && !fb->zs.discard.z);
if (crc_rt >= 0) {
bool *valid = fb->rts[crc_rt].crc_valid;
bool full = !fb->draw_extent.minx && !fb->draw_extent.miny &&
fb->draw_extent.maxx == (fb->width - 1) &&
fb->draw_extent.maxy == (fb->height - 1);
/* If the CRC was valid it stays valid; if it wasn't, we must
* ensure the render operation covers the full frame and that
* clean tiles are pushed to memory. */
bool new_valid = *valid | (full && pan_clean_tile_write_rt_enabled(
clean_tile, crc_rt));
cfg.crc_read_enable = *valid;
/* If the data is currently invalid, still write CRC
* data if we are doing a full write, so that it is
* valid for next time. */
cfg.crc_write_enable = new_valid;
*valid = new_valid;
}
}
fbd_data.z_clear = util_bitpack_float(fb->zs.clear_value.depth);
{
/* Set the DBD and RTD pointers. Both must be 64-byte aligned. */
uint64_t out_gpu_addr =
framebuffer.gpu + ALIGN_POT(sizeof(struct pan_fbd_layer), 64);
if (has_zs_crc_ext) {
fbd_data.dbd_pointer = out_gpu_addr;
assert(fbd_data.dbd_pointer % 64 == 0);
out_gpu_addr += pan_size(ZS_CRC_EXTENSION);
}
fbd_data.rtd_pointer = out_gpu_addr;
assert(fbd_data.rtd_pointer % 64 == 0);
}
memcpy(out, &fbd_data, sizeof(fbd_data));
out += ALIGN_POT(sizeof(fbd_data), 64);
if (has_zs_crc_ext) {
struct mali_zs_crc_extension_packed *zs_crc_ext = out;
pan_emit_zs_crc_ext(fb, layer_idx, crc_rt, zs_crc_ext, clean_tile);
out += pan_size(ZS_CRC_EXTENSION);
}
const unsigned rt_count = MAX2(fb->rt_count, 1);
unsigned cbuf_offset = 0;
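/* cbuf_offset is each render target's byte offset within the tile
 * buffer, accumulated as tile-buffer bytes-per-pixel times tile size
 * times sample count for every preceding render target. */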
for (unsigned i = 0; i < rt_count; i++) {
pan_emit_rt(fb, layer_idx, i, cbuf_offset, out, clean_tile);
out += pan_size(RENDER_TARGET);
if (!fb->rts[i].view)
continue;
cbuf_offset += pan_bytes_per_pixel_tib(fb->rts[i].view->format) *
fb->tile_size *
pan_image_view_get_nr_samples(fb->rts[i].view);
if (i != crc_rt && fb->rts[i].crc_valid != NULL)
*(fb->rts[i].crc_valid) = false;
}
return 0;
}
#else
unsigned
GENX(pan_emit_fbd)(const struct pan_fb_info *fb, unsigned layer_idx,
const struct pan_tls_info *tls,
const struct pan_tiler_context *tiler_ctx,
const struct pan_ptr framebuffer)
{
void *out = framebuffer.cpu;
PAN_TRACE_FUNC(PAN_TRACE_LIB_DESC);
check_fb_attachments(fb);
@ -1351,6 +1498,7 @@ GENX(pan_emit_fbd)(const struct pan_fb_info *fb, unsigned layer_idx,
}
return tag.opaque[0];
}
#endif /* PAN_ARCH >= 14 */
#else /* PAN_ARCH == 4 */
static enum mali_color_format
pan_sfbd_raw_format(unsigned bits)
@ -1378,8 +1526,11 @@ GENX(pan_select_tile_size)(struct pan_fb_info *fb)
unsigned
GENX(pan_emit_fbd)(const struct pan_fb_info *fb, unsigned layer_idx,
const struct pan_tls_info *tls,
const struct pan_tiler_context *tiler_ctx, void *fbd)
const struct pan_tiler_context *tiler_ctx,
const struct pan_ptr framebuffer)
{
void *fbd = framebuffer.cpu;
PAN_TRACE_FUNC(PAN_TRACE_LIB_DESC);
assert(fb->rt_count <= 1);

View file

@ -196,18 +196,22 @@ pan_wls_adjust_size(unsigned wls_size)
static inline unsigned
pan_calc_workgroups_per_task(const struct pan_compute_dim *shader_local_size,
const struct pan_kmod_dev_props *props)
const struct pan_kmod_dev_props *props,
unsigned work_reg_count)
{
/* Each shader core can run N tasks and a total of M threads at any single
* time, thus each task should ideally have no more than M/N threads. */
unsigned max_threads_per_task =
props->max_threads_per_core / props->max_tasks_per_core;
ASSERTED unsigned max_threads_per_wg =
pan_compute_max_thread_count(props, work_reg_count);
/* To achieve the best utilization, we should aim for as many workgroups
* per tasks as we can fit without exceeding the above thread limit */
unsigned threads_per_wg =
shader_local_size->x * shader_local_size->y * shader_local_size->z;
assert(threads_per_wg > 0 && threads_per_wg <= props->max_threads_per_wg);
assert(threads_per_wg > 0 && threads_per_wg <= max_threads_per_wg);
unsigned wg_per_task = DIV_ROUND_UP(max_threads_per_task, threads_per_wg);
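/* Worked example (hypothetical numbers): 2048 threads and 16 tasks per
 * core give 128 threads per task; a 32-thread workgroup then yields
 * DIV_ROUND_UP(128, 32) = 4 workgroups per task. */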
assert(wg_per_task > 0 && wg_per_task <= max_threads_per_task);
@ -217,14 +221,15 @@ pan_calc_workgroups_per_task(const struct pan_compute_dim *shader_local_size,
static inline unsigned
pan_calc_wls_instances(const struct pan_compute_dim *shader_local_size,
const struct pan_kmod_dev_props *props,
const struct pan_compute_dim *dim)
const struct pan_compute_dim *dim,
unsigned work_reg_count)
{
/* NOTE: If the instance count is lower than the number of workgroups
* being dispatched, the HW will hold back workgroups until instances
* can be reused. */
unsigned instances;
unsigned wg_per_task =
pan_calc_workgroups_per_task(shader_local_size, props);
pan_calc_workgroups_per_task(shader_local_size, props, work_reg_count);
unsigned max_instances_per_core =
util_next_power_of_two(wg_per_task * props->max_tasks_per_core);
@ -341,7 +346,7 @@ void GENX(pan_emit_afrc_color_attachment)(const struct pan_attachment_info *att,
unsigned GENX(pan_emit_fbd)(const struct pan_fb_info *fb, unsigned layer_idx,
const struct pan_tls_info *tls,
const struct pan_tiler_context *tiler_ctx,
void *out);
const struct pan_ptr framebuffer);
#if PAN_ARCH >= 6
unsigned GENX(pan_select_tiler_hierarchy_mask)(uint32_t width, uint32_t height,

View file

@ -1,5 +1,6 @@
/*
* Copyright (C) 2026 Collabora, Ltd.
* Copyright (C) 2026 Arm Ltd.
* SPDX-License-Identifier: MIT
*/
#include "pan_fb.h"
@ -669,9 +670,124 @@ pan_fix_frame_shader_mode(enum mali_pre_post_frame_shader_mode mode,
}
#endif
#if PAN_ARCH >= 14
uint32_t
GENX(pan_emit_fb_desc)(const struct pan_fb_desc_info *info, void *out)
GENX(pan_emit_fb_desc)(const struct pan_fb_desc_info *info,
const struct pan_ptr framebuffer)
{
/* Emit the dynamic framebuffer state, i.e. state that may change per layer. */
void *out = framebuffer.cpu;
const struct pan_fb_layout *fb = info->fb;
const struct pan_fb_load *load = info->load;
const struct pan_fb_store *store = info->store;
const struct pan_fb_clean_tile ct = pan_fb_get_clean_tile(info);
const bool has_zs_crc_ext = pan_fb_has_zs(fb);
struct pan_fbd_layer fbd_data = {0};
fbd_data.tiler = info->tiler_ctx->valhall.desc;
/* layer_index in flags0 is used to select the right primitive list in
* the tiler context, and frame_arg is the value that's passed to the
* fragment shader through r62-r63, which we use to pass gl_Layer. Since
* the layer_idx only takes 8 bits, we might use the extra 56 bits we
* have in frame_argument to pass other information to the fragment
* shader at some point.
*/
assert(info->layer >= info->tiler_ctx->valhall.layer_offset);
fbd_data.frame_argument = info->layer;
pan_pack(&fbd_data.flags0, FRAGMENT_FLAGS_0, cfg) {
cfg.pre_frame_0 = pan_fix_frame_shader_mode(info->frame_shaders.modes[0],
ct.rts || ct.zs || ct.s);
cfg.pre_frame_1 = pan_fix_frame_shader_mode(info->frame_shaders.modes[1],
ct.rts || ct.zs || ct.s);
cfg.post_frame = info->frame_shaders.modes[2];
/* Enabling prepass without pipelining is generally not good for
* performance, so disable HSR in that case.
*/
cfg.hsr_prepass_enable = info->allow_hsr_prepass &&
pan_fb_can_pipeline_zs(fb);
cfg.hsr_prepass_interleaving_enable = pan_fb_can_pipeline_zs(fb);
cfg.hsr_prepass_filter_enable = true;
cfg.hsr_hierarchical_optimizations_enable = true;
cfg.internal_layer_index =
info->layer - info->tiler_ctx->valhall.layer_offset;
}
pan_pack(&fbd_data.flags2, FRAGMENT_FLAGS_2, cfg) {
if (fb->s_format != PIPE_FORMAT_NONE) {
cfg.s_clear = load && target_has_clear(&load->s) ?
load->s.clear.stencil : 0;
cfg.s_write_enable = store && store->s.store;
}
if (fb->z_format != PIPE_FORMAT_NONE) {
cfg.z_internal_format = pan_get_z_internal_format(fb->z_format);
cfg.z_write_enable = store && store->zs.store;
} else {
cfg.z_internal_format = MALI_Z_INTERNAL_FORMAT_D24;
assert(!store || !store->zs.store);
}
}
fbd_data.z_clear =
util_bitpack_float(fb->z_format != PIPE_FORMAT_NONE && load &&
target_has_clear(&load->z)
? load->z.clear.depth
: 0);
fbd_data.dcd_pointer = info->frame_shaders.dcd_pointer;
{
/* Set the DBD and RTD pointers. Both must be 64-byte aligned. */
uint64_t out_gpu_addr =
framebuffer.gpu + ALIGN_POT(sizeof(struct pan_fbd_layer), 64);
if (has_zs_crc_ext) {
fbd_data.dbd_pointer = out_gpu_addr;
assert(fbd_data.dbd_pointer % 64 == 0);
out_gpu_addr += pan_size(ZS_CRC_EXTENSION);
}
fbd_data.rtd_pointer = out_gpu_addr;
assert(fbd_data.rtd_pointer % 64 == 0);
}
memcpy(out, &fbd_data, sizeof(fbd_data));
out += ALIGN_POT(sizeof(fbd_data), 64);
if (has_zs_crc_ext) {
struct mali_zs_crc_extension_packed zs_crc;
emit_zs_crc_desc(info, ct, &zs_crc);
memcpy(out, &zs_crc, sizeof(zs_crc));
out += sizeof(zs_crc);
}
uint32_t tile_rt_offset_B = 0;
for (unsigned rt = 0; rt < fb->rt_count; rt++) {
struct mali_rgb_render_target_packed rgb_rt;
emit_rgb_rt_desc(info, ct, rt, tile_rt_offset_B, &rgb_rt);
memcpy(out, &rgb_rt, sizeof(rgb_rt));
out += sizeof(rgb_rt);
if (fb->rt_formats[rt] != PIPE_FORMAT_NONE) {
tile_rt_offset_B += pan_bytes_per_pixel_tib(fb->rt_formats[rt]) *
fb->tile_size_px * fb->sample_count;
}
}
assert(tile_rt_offset_B <= fb->tile_rt_alloc_B);
return 0;
}
#else /* PAN_ARCH < 14 */
uint32_t
GENX(pan_emit_fb_desc)(const struct pan_fb_desc_info *info,
const struct pan_ptr framebuffer)
{
void *out = framebuffer.cpu;
const struct pan_fb_layout *fb = info->fb;
const struct pan_fb_load *load = info->load;
const struct pan_fb_store *store = info->store;
@ -823,4 +939,5 @@ GENX(pan_emit_fb_desc)(const struct pan_fb_desc_info *info, void *out)
}
return tag.opaque[0];
}
#endif
#endif /* PAN_ARCH >= 14 */
#endif /* PAN_ARCH >= 5 */

View file

@ -1,14 +1,20 @@
/*
* Copyright (C) 2026 Collabora, Ltd.
* Copyright (C) 2026 Arm Ltd.
* SPDX-License-Identifier: MIT
*/
#ifndef __PAN_FB_H
#define __PAN_FB_H
#if PAN_ARCH >= 14
#include "genxml/cs_builder.h"
#endif
#include "compiler/shader_enums.h"
#include "genxml/gen_macros.h"
#include "util/format/u_formats.h"
#include "compiler/shader_enums.h"
#include "pan_pool.h"
struct nir_shader;
struct nir_shader_compiler_options;
@ -481,7 +487,7 @@ void GENX(pan_fill_fb_info)(const struct pan_fb_desc_info *info,
struct pan_fb_info *fbinfo);
uint32_t GENX(pan_emit_fb_desc)(const struct pan_fb_desc_info *info,
void *out);
const struct pan_ptr framebuffer);
#endif
enum ENUM_PACKED pan_fb_shader_op {
@ -620,4 +626,35 @@ GENX(pan_get_fb_shader)(const struct pan_fb_shader_key *key,
const struct nir_shader_compiler_options *nir_options);
#endif
#if PAN_ARCH >= 14
/* Framebuffer per-layer state. Keep this structure 64-byte aligned, since
* we want the adjacent ZS_CRC_EXTENSION and RENDER_TARGET descriptors
* aligned. */
struct pan_fbd_layer {
/** GPU address to the tiler descriptor. */
uint64_t tiler;
/** Frame argument. */
uint64_t frame_argument;
/** An instance of Fragment Flags 0. */
struct mali_fragment_flags_0_packed flags0;
/** An instance of Fragment Flags 2. */
struct mali_fragment_flags_2_packed flags2;
/** Z clear value. */
uint32_t z_clear;
/** GPU address to the draw call descriptors. It may be 0. */
uint64_t dcd_pointer;
/** GPU address to the ZS_CRC_EXTENSION descriptor. It may be 0. */
uint64_t dbd_pointer;
/** GPU address to the RENDER_TARGET descriptors. */
uint64_t rtd_pointer;
} __attribute__((aligned(64)));
#endif /* PAN_ARCH >= 14 */
#endif /* __PAN_FB_H */

View file

@ -1,5 +1,6 @@
/*
* Copyright (C) 2019 Collabora, Ltd.
* Copyright (C) 2026 Arm Ltd.
* SPDX-License-Identifier: MIT
*/
@ -184,7 +185,27 @@ const struct pan_blendable_format
const struct pan_format GENX(pan_pipe_format)[PIPE_FORMAT_COUNT] = {
FMT(NONE, CONSTANT, 0000, L, VTR_IB),
#if PAN_ARCH >= 7
#if PAN_ARCH >= 14
/* Multiplane formats */
FMT_YUV(R8G8_R8B8_UNORM, Y8U8Y8V8_422, UVYA, NO_SWAP, CENTER_422, _T____),
FMT_YUV(G8R8_B8R8_UNORM, U8Y8V8Y8_422, UYVA, SWAP, CENTER_422, _T____),
FMT_YUV(R8B8_R8G8_UNORM, Y8U8Y8V8_422, VYUA, NO_SWAP, CENTER_422, _T____),
FMT_YUV(B8R8_G8R8_UNORM, U8Y8V8Y8_422, VUYA, SWAP, CENTER_422, _T____),
FMT_YUV(R8_G8B8_420_UNORM, Y8U8V8_420, YUVA, NO_SWAP, CENTER, _T____),
FMT_YUV(R8_B8G8_420_UNORM, Y8U8V8_420, YVUA, NO_SWAP, CENTER, _T____),
FMT_YUV(R8_G8_B8_420_UNORM, Y8U8V8_420, YUVA, NO_SWAP, CENTER, _T____),
FMT_YUV(R8_B8_G8_420_UNORM, Y8U8V8_420, YVUA, NO_SWAP, CENTER, _T____),
FMT_YUV(R8_G8B8_422_UNORM, Y8U8Y8V8_422, YUVA, NO_SWAP, CENTER_422, _T____),
FMT_YUV(R8_B8G8_422_UNORM, U8Y8V8Y8_422, YVUA, NO_SWAP, CENTER_422, _T____),
FMT_YUV(R10_G10B10_420_UNORM, YUYAAYVYAA_420, YUVA, NO_SWAP, CENTER, _T____),
FMT_YUV(R10_G10B10_422_UNORM, Y10X6U10X6Y10X6V10X6_422, YUVA, NO_SWAP, CENTER_422, _T____),
/* Special internal formats */
FMT_YUV(R8G8B8_420_UNORM_PACKED, Y8U8V8_420, YUVA, NO_SWAP, CENTER, _T____),
FMT_YUV(R10G10B10_420_UNORM_PACKED, Y10U10V10_420, YUVA, NO_SWAP, CENTER, _T____),
FMT_YUV(X6R10X6G10_X6R10X6B10_422_UNORM, Y10X6U10X6Y10X6V10X6_422, UVYA, NO_SWAP, CENTER_422, _T____),
#elif PAN_ARCH >= 7
/* Multiplane formats */
FMT_YUV(R8G8_R8B8_UNORM, YUYV8, UVYA, NO_SWAP, CENTER_422, _T____),
FMT_YUV(G8R8_B8R8_UNORM, VYUY8, UYVA, SWAP, CENTER_422, _T____),

View file

@ -168,6 +168,10 @@ extern const struct pan_blendable_format
pan_blendable_formats_v12[PIPE_FORMAT_COUNT];
extern const struct pan_blendable_format
pan_blendable_formats_v13[PIPE_FORMAT_COUNT];
extern const struct pan_blendable_format
pan_blendable_formats_v14[PIPE_FORMAT_COUNT];
extern const struct pan_blendable_format
pan_blendable_formats_v15[PIPE_FORMAT_COUNT];
uint8_t pan_raw_format_mask_midgard(enum pipe_format *formats);
@ -184,6 +188,8 @@ pan_blendable_format_table(unsigned arch)
FMT_TABLE(10);
FMT_TABLE(12);
FMT_TABLE(13);
FMT_TABLE(14);
FMT_TABLE(15);
#undef FMT_TABLE
default:
assert(!"Unsupported architecture");
@ -199,6 +205,8 @@ extern const struct pan_format pan_pipe_format_v9[PIPE_FORMAT_COUNT];
extern const struct pan_format pan_pipe_format_v10[PIPE_FORMAT_COUNT];
extern const struct pan_format pan_pipe_format_v12[PIPE_FORMAT_COUNT];
extern const struct pan_format pan_pipe_format_v13[PIPE_FORMAT_COUNT];
extern const struct pan_format pan_pipe_format_v14[PIPE_FORMAT_COUNT];
extern const struct pan_format pan_pipe_format_v15[PIPE_FORMAT_COUNT];
static inline const struct pan_format *
pan_format_table(unsigned arch)
@ -213,6 +221,8 @@ pan_format_table(unsigned arch)
FMT_TABLE(10);
FMT_TABLE(12);
FMT_TABLE(13);
FMT_TABLE(14);
FMT_TABLE(15);
#undef FMT_TABLE
default:
assert(!"Unsupported architecture");

View file

@ -84,6 +84,8 @@ const struct pan_mod_handler *pan_mod_get_handler_v9(uint64_t modifier);
const struct pan_mod_handler *pan_mod_get_handler_v10(uint64_t modifier);
const struct pan_mod_handler *pan_mod_get_handler_v12(uint64_t modifier);
const struct pan_mod_handler *pan_mod_get_handler_v13(uint64_t modifier);
const struct pan_mod_handler *pan_mod_get_handler_v14(uint64_t modifier);
const struct pan_mod_handler *pan_mod_get_handler_v15(uint64_t modifier);
static inline const struct pan_mod_handler *
pan_mod_get_handler(unsigned arch, uint64_t modifier)
@ -105,6 +107,10 @@ pan_mod_get_handler(unsigned arch, uint64_t modifier)
return pan_mod_get_handler_v12(modifier);
case 13:
return pan_mod_get_handler_v13(modifier);
case 14:
return pan_mod_get_handler_v14(modifier);
case 15:
return pan_mod_get_handler_v15(modifier);
default:
UNREACHABLE("Unsupported arch");
}

View file

@ -70,6 +70,15 @@ pan_compute_max_thread_count(const struct pan_kmod_dev_props *props,
aligned_reg_count = work_reg_count <= 32 ? 32 : 64;
}
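/* On v15, max_threads_per_wg may be unpopulated; derive the per-workgroup
 * limit from the per-core register file instead, rounded down to the
 * kernel-reported active-thread granularity. */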
if (pan_arch(props->gpu_id) >= 15) {
assert(props->num_threads_active_granularity);
unsigned max_threads_per_wg =
ROUND_DOWN_TO(props->num_registers_per_core / aligned_reg_count,
props->num_threads_active_granularity);
return MIN2(max_threads_per_wg, props->max_threads_per_core);
}
assert(props->max_threads_per_wg);
return MIN3(props->max_threads_per_wg, props->max_threads_per_core,
props->num_registers_per_core / aligned_reg_count);
}

View file

@ -223,6 +223,25 @@ pan_clump_format(enum pipe_format format)
/* YUV-sampling has special cases */
if (pan_format_is_yuv(format)) {
switch (format) {
#if PAN_ARCH >= 14
case PIPE_FORMAT_R8G8_R8B8_UNORM:
case PIPE_FORMAT_G8R8_B8R8_UNORM:
case PIPE_FORMAT_R8B8_R8G8_UNORM:
case PIPE_FORMAT_B8R8_G8R8_UNORM:
case PIPE_FORMAT_R8_G8B8_422_UNORM:
case PIPE_FORMAT_R8_B8G8_422_UNORM:
case PIPE_FORMAT_R8_G8B8_420_UNORM:
case PIPE_FORMAT_R8_B8G8_420_UNORM:
case PIPE_FORMAT_R8_G8_B8_420_UNORM:
case PIPE_FORMAT_R8_B8_G8_420_UNORM:
case PIPE_FORMAT_R8G8B8_420_UNORM_PACKED:
return MALI_CLUMP_FORMAT_RAW8;
case PIPE_FORMAT_R10_G10B10_420_UNORM:
case PIPE_FORMAT_R10G10B10_420_UNORM_PACKED:
case PIPE_FORMAT_R10_G10B10_422_UNORM:
case PIPE_FORMAT_X6R10X6G10_X6R10X6B10_422_UNORM:
return MALI_CLUMP_FORMAT_R10_PACKED;
#else
case PIPE_FORMAT_R8G8_R8B8_UNORM:
case PIPE_FORMAT_G8R8_B8R8_UNORM:
case PIPE_FORMAT_R8B8_R8G8_UNORM:
@ -242,6 +261,7 @@ pan_clump_format(enum pipe_format format)
case PIPE_FORMAT_R10_G10B10_422_UNORM:
case PIPE_FORMAT_X6R10X6G10_X6R10X6B10_422_UNORM:
return MALI_CLUMP_FORMAT_Y10_UV10_422;
#endif /* PAN_ARCH >= 14 */
default:
UNREACHABLE("unhandled clump format");
}

View file

@ -28,6 +28,10 @@
#include "libpan_v12.h"
#elif (PAN_ARCH == 13)
#include "libpan_v13.h"
#elif (PAN_ARCH == 14)
#include "libpan_v14.h"
#elif (PAN_ARCH == 15)
#include "libpan_v15.h"
#else
#error "Unsupported architecture for libpan"
#endif

View file

@ -26,6 +26,10 @@
#include "libpan_shaders_v12.h"
#elif (PAN_ARCH == 13)
#include "libpan_shaders_v13.h"
#elif (PAN_ARCH == 14)
#include "libpan_shaders_v14.h"
#elif (PAN_ARCH == 15)
#include "libpan_shaders_v15.h"
#else
#error "Unsupported architecture for libpan"
#endif

View file

@ -11,7 +11,7 @@ libpan_shader_files = files(
idep_libpan_per_arch = {}
foreach ver : ['4', '5', '6', '7', '9', '10', '12', '13']
foreach ver : ['4', '5', '6', '7', '9', '10', '12', '13', '14', '15']
libpan_spv = custom_target(
input : libpan_shader_files,
output : 'libpan_v' + ver + '.spv',

View file

@ -95,6 +95,14 @@ const struct pan_model pan_model_list[] = {
MODEL_RATES(4, 8, 128)),
FIFTHGEN_MODEL(PAN_PROD_ID(13, 8, 0), 4, "G725", "TKRx", MODEL_ANISO(ALL), MODEL_TB_SIZES(65536, 65536),
MODEL_RATES(4, 8, 128)),
FIFTHGEN_MODEL(PAN_PROD_ID(14, 8, 3), 1, "G1-Pro", "TDRx", MODEL_ANISO(ALL), MODEL_TB_SIZES(65536, 65536),
MODEL_RATES(4, 8, 64)),
FIFTHGEN_MODEL(PAN_PROD_ID(14, 8, 3), 4, "G1-Pro", "TDRx", MODEL_ANISO(ALL), MODEL_TB_SIZES(65536, 65536),
MODEL_RATES(4, 8, 128)),
FIFTHGEN_MODEL(PAN_PROD_ID(15, 8, 3), 0, "TMAx", "TMAx", MODEL_ANISO(ALL), MODEL_TB_SIZES(65536, 65536),
MODEL_RATES(4, 8, 64)),
FIFTHGEN_MODEL(PAN_PROD_ID(15, 8, 3), 4, "TMAx", "TMAx", MODEL_ANISO(ALL), MODEL_TB_SIZES(65536, 65536),
MODEL_RATES(4, 8, 128)),
};
/* clang-format on */

View file

@ -31,6 +31,15 @@ struct pan_tiler_features {
#define PAN_VERSION_MINOR(x) (((x) & BITFIELD_RANGE(4, 8)) >> 4)
#define PAN_VERSION_STATUS(x) ((x) & BITFIELD_RANGE(0, 4))
#define PAN_ID64_COMPAT 0xFull
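/* An arch-major value of 0xF in the legacy 32-bit GPU_ID indicates that
 * the 64-bit wide ID encoding below is in use instead. */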
#define PAN_ID64_ARCH_MAJOR(x) (((x) & BITFIELD64_RANGE(56, 8)) >> 56)
#define PAN_ID64_ARCH_MINOR(x) (((x) & BITFIELD64_RANGE(48, 8)) >> 48)
#define PAN_ID64_ARCH_REV(x) (((x) & BITFIELD64_RANGE(40, 8)) >> 40)
#define PAN_ID64_PRODUCT_MAJOR(x) (((x) & BITFIELD64_RANGE(32, 8)) >> 32)
#define PAN_ID64_VERSION_MAJOR(x) (((x) & BITFIELD64_RANGE(16, 8)) >> 16)
#define PAN_ID64_VERSION_MINOR(x) (((x) & BITFIELD64_RANGE(8, 8)) >> 8)
#define PAN_ID64_VERSION_STATUS(x) ((x) & BITFIELD64_RANGE(0, 8))
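/* Wide GPU_ID bit layout: [63:56] arch major, [55:48] arch minor,
 * [47:40] arch rev, [39:32] product major, [23:16] version major,
 * [15:8] version minor, [7:0] version status. */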
/* GPU product id for Midgard */
#define MIDGARD_PROD_ID(x) (((x) & BITFIELD_RANGE(16, 16)) >> 16)
@ -108,8 +117,12 @@ pan_arch(uint64_t gpu_id)
case 0x860:
case 0x880:
return 5;
default:
return PAN_ARCH_MAJOR(gpu_id);
default: {
unsigned gpu_arch = PAN_ARCH_MAJOR(gpu_id);
if (gpu_arch == PAN_ID64_COMPAT)
return PAN_ID64_ARCH_MAJOR(gpu_id);
return gpu_arch;
}
}
}
@ -119,14 +132,21 @@ pan_prod_id(uint64_t gpu_id)
unsigned arch = pan_arch(gpu_id);
if (arch < 6)
return MIDGARD_PROD_ID(gpu_id);
return PAN_PROD_ID(PAN_ARCH_MAJOR(gpu_id), PAN_ARCH_MINOR(gpu_id),
PAN_PRODUCT_MAJOR(gpu_id));
else if (arch < PAN_ID64_COMPAT)
return PAN_PROD_ID(PAN_ARCH_MAJOR(gpu_id), PAN_ARCH_MINOR(gpu_id),
PAN_PRODUCT_MAJOR(gpu_id));
return PAN_PROD_ID(PAN_ID64_ARCH_MAJOR(gpu_id), PAN_ID64_ARCH_MINOR(gpu_id),
PAN_ID64_PRODUCT_MAJOR(gpu_id));
}
static inline uint32_t
pan_rev(uint64_t gpu_id)
{
return PAN_REV(PAN_VERSION_MAJOR(gpu_id), PAN_VERSION_MINOR(gpu_id));
unsigned arch = pan_arch(gpu_id);
if (arch < PAN_ID64_COMPAT)
return PAN_REV(PAN_VERSION_MAJOR(gpu_id), PAN_VERSION_MINOR(gpu_id));
return PAN_REV(PAN_ID64_VERSION_MAJOR(gpu_id),
PAN_ID64_VERSION_MINOR(gpu_id));
}
#endif

View file

@ -74,7 +74,11 @@ static inline uint32_t
get_fbd_size(bool has_zs_ext, uint32_t rt_count)
{
assert(rt_count >= 1 && rt_count <= MAX_RTS);
#if PAN_ARCH >= 14
uint32_t fbd_size = ALIGN_POT(sizeof(struct pan_fbd_layer), 64);
#else
uint32_t fbd_size = pan_size(FRAMEBUFFER);
#endif
if (has_zs_ext)
fbd_size += pan_size(ZS_CRC_EXTENSION);
fbd_size += pan_size(RENDER_TARGET) * rt_count;
@ -209,13 +213,27 @@ enum panvk_cs_regs {
PANVK_CS_REG_RUN_IDVS_SR_END = 60,
#endif
#if PAN_ARCH >= 14
/* RUN_FRAGMENT2 staging regs.
* SW ABI:
* - r54:55 contain the pointer to the current FBD layer state.
* - r58:59 contain the pointer to the first tiler descriptor. This is
* needed to gather completed heap chunks after a run_fragment2.
*/
PANVK_CS_REG_RUN_FRAGMENT_SR_START = 0,
PANVK_CS_REG_RUN_FRAGMENT_SR_END = 55,
PANVK_CS_REG_FBD_LAYER_PTR = 54,
PANVK_CS_REG_TILER_DESC_PTR = 58,
#else
/* RUN_FRAGMENT staging regs.
* SW ABI:
* - r38:39 contain the pointer to the first tiler descriptor. This is
* - r58:59 contain the pointer to the first tiler descriptor. This is
* needed to gather completed heap chunks after a run_fragment.
*/
PANVK_CS_REG_RUN_FRAGMENT_SR_START = 38,
PANVK_CS_REG_RUN_FRAGMENT_SR_END = 46,
PANVK_CS_REG_TILER_DESC_PTR = 58,
#endif
/* RUN_COMPUTE staging regs. */
PANVK_CS_REG_RUN_COMPUTE_SR_START = 0,
@ -870,4 +888,31 @@ vk_stages_to_subqueue_mask(VkPipelineStageFlags2 vk_stages,
void panvk_per_arch(emit_barrier)(struct panvk_cmd_buffer *cmdbuf,
struct panvk_cs_deps deps);
#if PAN_ARCH >= 14
static inline void
cs_emit_layer_fragment_state(struct cs_builder *b, struct cs_index fbd_ptr)
{
/* Emit the dynamic fragment state. This state may change per layer. */
cs_load32_to(b, cs_sr_reg32(b, FRAGMENT, FLAGS_0), fbd_ptr,
offsetof(struct pan_fbd_layer, flags0));
cs_load32_to(b, cs_sr_reg32(b, FRAGMENT, FLAGS_2), fbd_ptr,
offsetof(struct pan_fbd_layer, flags2));
cs_load32_to(b, cs_sr_reg32(b, FRAGMENT, Z_CLEAR), fbd_ptr,
offsetof(struct pan_fbd_layer, z_clear));
cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, TILER_DESCRIPTOR_POINTER), fbd_ptr,
offsetof(struct pan_fbd_layer, tiler));
cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, RTD_POINTER), fbd_ptr,
offsetof(struct pan_fbd_layer, rtd_pointer));
cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, DBD_POINTER), fbd_ptr,
offsetof(struct pan_fbd_layer, dbd_pointer));
cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, FRAME_ARG), fbd_ptr,
offsetof(struct pan_fbd_layer, frame_argument));
cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, FRAME_SHADER_DCD_POINTER), fbd_ptr,
offsetof(struct pan_fbd_layer, dcd_pointer));
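/* Flush so all the loads above have landed before the staging registers
 * are consumed by the fragment run. */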
cs_flush_loads(b);
}
#endif /* PAN_ARCH >= 14 */
#endif /* PANVK_CMD_BUFFER_H */

View file

@ -89,8 +89,9 @@ panvk_per_arch(cmd_dispatch_prepare_tls)(
unsigned core_id_range;
pan_query_core_count(&phys_dev->kmod.dev->props, &core_id_range);
tlsinfo.wls.instances = pan_calc_wls_instances(
&cs->cs.local_size, &phys_dev->kmod.dev->props, indirect ? NULL : dim);
tlsinfo.wls.instances =
pan_calc_wls_instances(&cs->cs.local_size, &phys_dev->kmod.dev->props,
indirect ? NULL : dim, cs->info.work_reg_count);
unsigned wls_total_size = pan_calc_total_wls_size(
tlsinfo.wls.size, tlsinfo.wls.instances, core_id_range);
@ -156,7 +157,8 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
unsigned wg_per_task = 0;
if (indirect)
wg_per_task = pan_calc_workgroups_per_task(&cs->cs.local_size,
&phys_dev->kmod.dev->props);
&phys_dev->kmod.dev->props,
cs->info.work_reg_count);
if (compute_state_dirty(cmdbuf, DESC_STATE) ||
compute_state_dirty(cmdbuf, CS)) {
@ -207,9 +209,20 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
cs_move64_to(b, cs_reg64(b, PANVK_COMPUTE_FAU), fau_ptr);
}
if (compute_state_dirty(cmdbuf, CS))
if (compute_state_dirty(cmdbuf, CS)) {
#if PAN_ARCH >= 15
struct mali_shader_program_pointer_packed spp;
pan_pack(&spp, SHADER_PROGRAM_POINTER, ctx) {
ctx.register_count = cs->info.work_reg_count;
ctx.pointer = panvk_priv_mem_dev_addr(cs->spd);
}
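/* Reassemble the two packed 32-bit descriptor words into the 64-bit
 * value loaded into the CS register. */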
uint64_t ptr = ((uint64_t)spp.opaque[1] << 32) | spp.opaque[0];
cs_move64_to(b, cs_reg64(b, PANVK_COMPUTE_SPD), ptr);
#else
cs_move64_to(b, cs_reg64(b, PANVK_COMPUTE_SPD),
panvk_priv_mem_dev_addr(cs->spd));
#endif
}
cs_move64_to(b, cs_reg64(b, PANVK_COMPUTE_TSD), tsd);

View file

@ -51,6 +51,7 @@
#include "vk_render_pass.h"
#include "poly/geometry.h"
#if PAN_ARCH < 14
static enum cs_reg_perm
provoking_vertex_fn_reg_perm_cb(struct cs_builder *b, unsigned reg)
{
@ -202,6 +203,7 @@ panvk_per_arch(device_draw_context_cleanup)(struct panvk_device *dev)
panvk_priv_bo_unref(dev->draw_ctx->fns_bo);
vk_free(&dev->vk.alloc, dev->draw_ctx);
}
#endif /* PAN_ARCH < 14 */
static void
emit_vs_attrib(struct panvk_cmd_buffer *cmdbuf,
@ -1245,8 +1247,13 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
uint32_t fbd_sz = calc_fbd_size(cmdbuf);
uint32_t fbds_sz = enabled_layer_count * fbd_sz;
cmdbuf->state.gfx.render.fbds = panvk_cmd_alloc_dev_mem(
cmdbuf, desc, fbds_sz, pan_alignment(FRAMEBUFFER));
#if PAN_ARCH >= 14
const unsigned fbds_alignment = alignof(struct pan_fbd_layer);
#else
const unsigned fbds_alignment = pan_alignment(FRAMEBUFFER);
#endif
cmdbuf->state.gfx.render.fbds =
panvk_cmd_alloc_dev_mem(cmdbuf, desc, fbds_sz, fbds_alignment);
if (!cmdbuf->state.gfx.render.fbds.gpu)
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
@ -1316,14 +1323,23 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
tiler_ctx = get_tiler_context(cmdbuf, layer_idx);
uint32_t new_fbd_flags =
GENX(pan_emit_fb_desc)(&fbd_info, fbds.cpu + fbd_sz * i);
GENX(pan_emit_fb_desc)(&fbd_info, pan_ptr_offset(fbds, fbd_sz * i));
/* Make sure all FBDs have the same flags. */
assert(i == 0 || new_fbd_flags == fbd_flags);
fbd_flags = new_fbd_flags;
}
#if PAN_ARCH >= 14
/* fbd_flags is unused on v14+. */
assert(!fbd_flags);
#endif
struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
#if PAN_ARCH >= 14
// TODO: Implement IR support for v14.
#else
for (uint32_t ir_pass = 0; ir_pass < PANVK_IR_PASS_COUNT; ir_pass++) {
struct pan_ptr ir_fbds = panvk_cmd_alloc_dev_mem(
cmdbuf, desc, fbds_sz, pan_alignment(FRAMEBUFFER));
@ -1335,7 +1351,6 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
for (uint32_t i = 0; i < enabled_layer_count; i++) {
uint32_t layer_idx = multiview ? u_bit_scan(&ir_view_mask_temp) : i;
void *ir_fbd = (void *)((uint8_t *)ir_fbds.cpu + (i * fbd_sz));
fbd_info.layer = layer_idx;
tiler_ctx = get_tiler_context(cmdbuf, layer_idx);
@ -1353,8 +1368,8 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
if (result != VK_SUCCESS)
return result;
ASSERTED uint32_t new_fbd_flags =
GENX(pan_emit_fb_desc)(&fbd_info, ir_fbd);
ASSERTED uint32_t new_fbd_flags = GENX(pan_emit_fb_desc)(
&fbd_info, pan_ptr_offset(ir_fbds, fbd_sz * i));
/* Make sure all FBDs have the same flags. */
assert(new_fbd_flags == fbd_flags);
@ -1367,16 +1382,18 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
/* Wait for IR info push to complete */
cs_wait_slot(b, SB_ID(LS));
bool unset_provoking_vertex =
cmdbuf->state.gfx.render.first_provoking_vertex == U_TRISTATE_UNSET;
#endif /* PAN_ARCH >= 14 */
if (copy_fbds) {
struct cs_index cur_tiler = cs_reg64(b, 38);
struct cs_index cur_tiler = cs_reg64(b, PANVK_CS_REG_TILER_DESC_PTR);
#if PAN_ARCH >= 14
struct cs_index dst_fbd_ptr = cs_reg64(b, PANVK_CS_REG_FBD_LAYER_PTR);
#else
struct cs_index dst_fbd_ptr = cs_sr_reg64(b, FRAGMENT, FBD_POINTER);
struct cs_index fbd_idx = cs_reg32(b, 47);
struct cs_index src_fbd_ptr = cs_reg64(b, 48);
struct cs_index remaining_layers_in_td = cs_reg32(b, 50);
#endif
struct cs_index fbd_idx = cs_reg32(b, 60);
struct cs_index src_fbd_ptr = cs_reg64(b, 64);
struct cs_index remaining_layers_in_td = cs_reg32(b, 61);
uint32_t td_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count,
MAX_LAYERS_PER_TILER_DESC);
@ -1400,10 +1417,27 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
* framebuffer size is aligned to 64 bytes. */
assert(fbd_sz == ALIGN_POT(fbd_sz, 64));
#if PAN_ARCH >= 14
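/* Copy the per-layer FBD state 64 bytes at a time through scratch
 * registers, patching in the current tiler pointer (the first field of
 * struct pan_fbd_layer) on the first chunk. */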
for (uint32_t fbd_off = 0; fbd_off < fbd_sz; fbd_off += 64) {
cs_load_to(b, cs_scratch_reg_tuple(b, 0, 16), src_fbd_ptr,
BITFIELD_MASK(16), fbd_off);
/* Patch the Tiler pointer. */
if (fbd_off == 0)
cs_add64(b, cs_scratch_reg64(b, 0), cur_tiler, 0);
cs_store(b, cs_scratch_reg_tuple(b, 0, 16), dst_fbd_ptr,
BITFIELD_MASK(16), fbd_off);
}
#else
bool unset_provoking_vertex =
cmdbuf->state.gfx.render.first_provoking_vertex == U_TRISTATE_UNSET;
for (uint32_t fbd_off = 0; fbd_off < fbd_sz; fbd_off += 64) {
if (fbd_off == 0) {
cs_load_to(b, cs_scratch_reg_tuple(b, 0, 14), src_fbd_ptr,
BITFIELD_MASK(14), fbd_off);
/* Patch the Tiler pointer. */
cs_add64(b, cs_scratch_reg64(b, 14), cur_tiler, 0);
/* If we don't know what provoking vertex mode the
@ -1423,6 +1457,7 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
cs_store(b, cs_scratch_reg_tuple(b, 0, 16), dst_fbd_ptr,
BITFIELD_MASK(16), fbd_off);
}
#endif
/* Finish stores to pass_dst_fbd_ptr. */
cs_flush_stores(b);
@ -1456,12 +1491,19 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
-(full_td_count * pan_size(TILER_CONTEXT)));
}
} else {
#if PAN_ARCH >= 14
struct cs_index fbd_pointer = cs_reg64(b, PANVK_CS_REG_FBD_LAYER_PTR);
#else
struct cs_index fbd_pointer = cs_sr_reg64(b, FRAGMENT, FBD_POINTER);
#endif
cs_update_frag_ctx(b) {
cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, FBD_POINTER),
fbds.gpu | fbd_flags);
cs_move64_to(b, cs_reg64(b, 38), cmdbuf->state.gfx.render.tiler);
cs_move64_to(b, fbd_pointer, fbds.gpu | fbd_flags);
cs_move64_to(b, cs_reg64(b, PANVK_CS_REG_TILER_DESC_PTR),
cmdbuf->state.gfx.render.tiler);
}
#if PAN_ARCH < 14
/* If we don't know what provoking vertex mode the application wants yet,
* leave space to patch it later */
if (cmdbuf->state.gfx.render.first_provoking_vertex == U_TRISTATE_UNSET) {
@ -1483,6 +1525,7 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
cs_maybe(b, &cmdbuf->state.gfx.render.maybe_set_fbds_provoking_vertex)
cs_call(b, addr_reg, length_reg);
}
#endif
}
return VK_SUCCESS;
@ -3299,6 +3342,9 @@ calc_tiler_oom_handler_idx(struct panvk_cmd_buffer *cmdbuf)
static void
setup_tiler_oom_ctx(struct panvk_cmd_buffer *cmdbuf)
{
#if PAN_ARCH >= 14
// TODO: Implement IR support for v14.
#else
struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
const struct pan_fb_layout *fb = &cmdbuf->state.gfx.render.fb.layout;
const bool has_zs_ext = pan_fb_has_zs(fb);
@ -3343,6 +3389,7 @@ setup_tiler_oom_ctx(struct panvk_cmd_buffer *cmdbuf)
TILER_OOM_CTX_FIELD_OFFSET(layer_count));
cs_flush_stores(b);
#endif /* PAN_ARCH >= 14 */
}
static uint32_t
@ -3351,17 +3398,98 @@ pack_32_2x16(uint16_t lo, uint16_t hi)
return (((uint32_t)hi) << 16) | (uint32_t)lo;
}
#if PAN_ARCH >= 14
static void
cs_emit_static_fragment_state(struct cs_builder *b,
struct panvk_cmd_buffer *cmdbuf)
{
/* Emit the static fragment staging registers. These don't change per layer. */
const struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
const struct panvk_rendering_state *render = &cmdbuf->state.gfx.render;
const struct pan_fb_layout *fb = &render->fb.layout;
const uint8_t sample_count = render->fb.layout.sample_count;
const struct pan_fb_bbox fb_area_px =
pan_fb_bbox_from_xywh(0, 0, fb->width_px, fb->height_px);
const struct pan_fb_bbox bbox_px =
pan_fb_bbox_clamp(fb->tiling_area_px, fb_area_px);
assert(pan_fb_bbox_is_valid(fb->tiling_area_px));
struct mali_fragment_bounding_box_packed bbox;
pan_pack(&bbox, FRAGMENT_BOUNDING_BOX, cfg) {
cfg.bound_min_x = bbox_px.min_x;
cfg.bound_min_y = bbox_px.min_y;
cfg.bound_max_x = bbox_px.max_x;
cfg.bound_max_y = bbox_px.max_y;
}
struct mali_frame_size_packed frame_size;
pan_pack(&frame_size, FRAME_SIZE, cfg) {
cfg.width = fb->width_px;
cfg.height = fb->height_px;
}
cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, BOUNDING_BOX),
bbox.opaque[0] | (uint64_t)bbox.opaque[1] << 32);
cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, FRAME_SIZE), frame_size.opaque[0]);
cs_move64_to(
b, cs_sr_reg64(b, FRAGMENT, SAMPLE_POSITION_ARRAY_POINTER),
dev->sample_positions->addr.dev +
pan_sample_positions_offset(pan_sample_pattern(sample_count)));
/* Flags 1 */
struct mali_fragment_flags_1_packed flags1;
pan_pack(&flags1, FRAGMENT_FLAGS_1, cfg) {
cfg.sample_count = fb->sample_count;
cfg.sample_pattern = pan_sample_pattern(fb->sample_count);
cfg.effective_tile_size = fb->tile_size_px;
cfg.point_sprite_coord_origin_max_y = false;
cfg.first_provoking_vertex = get_first_provoking_vertex(cmdbuf);
assert(fb->rt_count > 0);
cfg.render_target_count = fb->rt_count;
cfg.color_buffer_allocation = fb->tile_rt_alloc_B;
}
cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, FLAGS_1), flags1.opaque[0]);
/* If we don't know what provoking vertex mode the application wants yet,
* leave space to patch it later */
if (cmdbuf->state.gfx.render.first_provoking_vertex == U_TRISTATE_UNSET) {
cs_maybe(b, &cmdbuf->state.gfx.render.maybe_set_fbds_provoking_vertex)
{
/* provoking_vertex flag is bit 14 of Fragment Flags 1. */
cs_add32(b, cs_sr_reg32(b, FRAGMENT, FLAGS_1),
cs_sr_reg32(b, FRAGMENT, FLAGS_1), -(1 << 14));
}
}
/* Leave the remaining RUN_FRAGMENT2 staging registers as zero. */
}
#endif /* PAN_ARCH >= 14 */
static VkResult
issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
{
#if PAN_ARCH < 14
struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
#endif
const struct cs_tracing_ctx *tracing_ctx =
&cmdbuf->state.cs[PANVK_SUBQUEUE_FRAGMENT].tracing;
const struct pan_fb_layout *fb = &cmdbuf->state.gfx.render.fb.layout;
struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
bool has_oq_chain = cmdbuf->state.gfx.render.oq.chain != 0;
/* Now initialize the fragment bits. */
#if PAN_ARCH >= 14
struct cs_index fbd_pointer = cs_reg64(b, PANVK_CS_REG_FBD_LAYER_PTR);
cs_update_frag_ctx(b) {
cs_emit_static_fragment_state(b, cmdbuf);
cs_emit_layer_fragment_state(b, fbd_pointer);
}
#else
const struct pan_fb_layout *fb = &cmdbuf->state.gfx.render.fb.layout;
cs_update_frag_ctx(b) {
cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, BBOX_MIN),
pack_32_2x16(fb->tiling_area_px.min_x,
@ -3370,6 +3498,7 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
pack_32_2x16(fb->tiling_area_px.max_x,
fb->tiling_area_px.max_y));
}
#endif
bool simul_use =
cmdbuf->flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
@ -3401,6 +3530,9 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
* state for this renderpass, so it's safe to enable. */
struct cs_index addr_reg = cs_scratch_reg64(b, 0);
struct cs_index length_reg = cs_scratch_reg32(b, 2);
#if PAN_ARCH >= 14
// TODO: Implement IR support for v14.
#else
uint32_t handler_idx = calc_tiler_oom_handler_idx(cmdbuf);
uint64_t handler_addr = dev->tiler_oom.handlers_bo->addr.dev +
handler_idx * dev->tiler_oom.handler_stride;
@ -3408,6 +3540,7 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
cs_move32_to(b, length_reg, dev->tiler_oom.handler_stride);
cs_set_exception_handler(b, MALI_CS_EXCEPTION_TYPE_TILER_OOM, addr_reg,
length_reg);
#endif
/* Wait for the tiling to be done before submitting the fragment job. */
wait_finish_tiling(cmdbuf);
@ -3422,8 +3555,12 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
* up. */
cs_move64_to(b, addr_reg, 0);
cs_move32_to(b, length_reg, 0);
#if PAN_ARCH >= 14
// TODO: Implement IR support for v14.
#else
cs_set_exception_handler(b, MALI_CS_EXCEPTION_TYPE_TILER_OOM, addr_reg,
length_reg);
#endif
/* Applications tend to forget to describe subpass dependencies, especially
* when it comes to write -> read dependencies on attachments. The
@ -3439,8 +3576,13 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
}
if (cmdbuf->state.gfx.render.layer_count <= 1) {
#if PAN_ARCH >= 14
cs_trace_run_fragment2(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
false, MALI_TILE_RENDER_ORDER_Z_ORDER);
#else
cs_trace_run_fragment(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
false, MALI_TILE_RENDER_ORDER_Z_ORDER);
#endif
} else {
struct cs_index run_fragment_regs = cs_scratch_reg_tuple(b, 0, 4);
struct cs_index remaining_layers = cs_scratch_reg32(b, 4);
@ -3449,12 +3591,18 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
cs_while(b, MALI_CS_CONDITION_GREATER, remaining_layers) {
cs_add32(b, remaining_layers, remaining_layers, -1);
#if PAN_ARCH >= 14
cs_emit_layer_fragment_state(b, fbd_pointer);
cs_trace_run_fragment2(b, tracing_ctx, run_fragment_regs, false,
MALI_TILE_RENDER_ORDER_Z_ORDER);
#else
cs_trace_run_fragment(b, tracing_ctx, run_fragment_regs, false,
MALI_TILE_RENDER_ORDER_Z_ORDER);
struct cs_index fbd_pointer = cs_sr_reg64(b, FRAGMENT, FBD_POINTER);
#endif
cs_update_frag_ctx(b)
cs_add64(b, cs_sr_reg64(b, FRAGMENT, FBD_POINTER),
cs_sr_reg64(b, FRAGMENT, FBD_POINTER), fbd_sz);
cs_add64(b, fbd_pointer, fbd_pointer, fbd_sz);
}
}
@ -3468,8 +3616,8 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
struct cs_index completed = cs_scratch_reg_tuple(b, 10, 4);
struct cs_index completed_top = cs_scratch_reg64(b, 10);
struct cs_index completed_bottom = cs_scratch_reg64(b, 12);
struct cs_index cur_tiler = cs_reg64(b, 38);
struct cs_index tiler_count = cs_reg32(b, 47);
struct cs_index cur_tiler = cs_reg64(b, PANVK_CS_REG_TILER_DESC_PTR);
struct cs_index tiler_count = cs_reg32(b, 60);
struct cs_index oq_chain = cs_scratch_reg64(b, 10);
struct cs_index oq_chain_lo = cs_scratch_reg32(b, 10);
struct cs_index oq_syncobj = cs_scratch_reg64(b, 12);

View file

@ -82,8 +82,18 @@ panvk_per_arch(dispatch_precomp)(struct panvk_precomp_ctx *ctx,
uint64_t fau_ptr = push_uniforms.gpu | (fau_count << 56);
cs_move64_to(b, cs_reg64(b, PANVK_PRECOMP_FAU), fau_ptr);
#if PAN_ARCH >= 15
struct mali_shader_program_pointer_packed spp;
pan_pack(&spp, SHADER_PROGRAM_POINTER, ctx) {
ctx.register_count = shader->info.work_reg_count;
ctx.pointer = panvk_priv_mem_dev_addr(shader->spd);
}
uint64_t ptr = ((uint64_t)spp.opaque[1] << 32) | spp.opaque[0];
cs_move64_to(b, cs_reg64(b, PANVK_PRECOMP_SPD), ptr);
#else
cs_move64_to(b, cs_reg64(b, PANVK_PRECOMP_SPD),
panvk_priv_mem_dev_addr(shader->spd));
#endif
cs_move64_to(b, cs_reg64(b, PANVK_PRECOMP_TSD), tsd);
@ -155,7 +165,8 @@ panvk_per_arch(dispatch_precomp)(struct panvk_precomp_ctx *ctx,
* increment/axis parameters requires knowledge of job dimensions, but
* this is somewhat offset by run_compute being a native instruction. */
task_increment = pan_calc_workgroups_per_task(
&shader->cs.local_size, &phys_dev->kmod.dev->props);
&shader->cs.local_size, &phys_dev->kmod.dev->props,
shader->info.work_reg_count);
} else {
panvk_per_arch(calculate_task_axis_and_increment)(
shader, phys_dev, &dim, &task_axis, &task_increment);

View file

@ -13,8 +13,13 @@ tiler_oom_reg_perm_cb(struct cs_builder *b, unsigned reg)
{
switch (reg) {
/* The bbox is set up by the fragment subqueue; we should not modify it. */
#if PAN_ARCH >= 14
case 28:
case 29:
#else
case 42:
case 43:
#endif
/* We should only load from the subqueue context. */
case PANVK_CS_REG_SUBQUEUE_CTX_START:
case PANVK_CS_REG_SUBQUEUE_CTX_END:
@ -42,8 +47,14 @@ copy_fbd(struct cs_builder *b, bool has_zs_ext, uint32_t rt_count,
cs_store(b, cs_scratch_reg_tuple(b, 0, 8), dst, BITFIELD_MASK(8),
8 * sizeof(uint32_t));
#if PAN_ARCH >= 14
const size_t fbd_size = ALIGN_POT(sizeof(struct pan_fbd_layer), 64);
#else
const size_t fbd_size = sizeof(struct mali_framebuffer_packed);
#endif
if (has_zs_ext) {
const uint16_t dbd_offset = sizeof(struct mali_framebuffer_packed);
const uint16_t dbd_offset = fbd_size;
/* Copy the whole DBD. */
cs_load_to(b, cs_scratch_reg_tuple(b, 0, 8), src_other,
@ -57,8 +68,7 @@ copy_fbd(struct cs_builder *b, bool has_zs_ext, uint32_t rt_count,
}
const uint16_t rts_offset =
sizeof(struct mali_framebuffer_packed) +
(has_zs_ext ? sizeof(struct mali_zs_crc_extension_packed) : 0);
fbd_size + (has_zs_ext ? sizeof(struct mali_zs_crc_extension_packed) : 0);
for (uint32_t rt = 0; rt < rt_count; rt++) {
const uint16_t rt_offset =
@ -110,12 +120,14 @@ generate_tiler_oom_handler(struct panvk_device *dev,
.tracebuf_addr_offset =
offsetof(struct panvk_cs_subqueue_context, debug.tracebuf.cs),
};
struct mali_framebuffer_pointer_packed fb_tag;
#if PAN_ARCH < 14
struct mali_framebuffer_pointer_packed fb_tag;
pan_pack(&fb_tag, FRAMEBUFFER_POINTER, cfg) {
cfg.zs_crc_extension_present = has_zs_ext;
cfg.render_target_count = rt_count;
}
#endif
cs_function_def(&b, &handler, handler_ctx) {
struct cs_index subqueue_ctx = cs_subqueue_ctx_reg(&b);
@ -140,7 +152,7 @@ generate_tiler_oom_handler(struct panvk_device *dev,
struct cs_index run_fragment_regs = cs_scratch_reg_tuple(&b, 0, 4);
/* The tiler pointer is pre-filled. */
struct cs_index tiler_ptr = cs_reg64(&b, 38);
struct cs_index tiler_ptr = cs_reg64(&b, PANVK_CS_REG_TILER_DESC_PTR);
cs_load64_to(&b, scratch_fbd_ptr_reg, subqueue_ctx,
TILER_OOM_CTX_FIELD_OFFSET(ir_scratch_fbd_ptr));
@ -175,12 +187,22 @@ generate_tiler_oom_handler(struct panvk_device *dev,
/* Flush copies before the RUN_FRAGMENT. */
cs_wait_slot(&b, SB_ID(LS));
#if PAN_ARCH >= 14
/* Set FBD pointer to the scratch fbd */
struct cs_index fbd_pointer = cs_reg64(&b, PANVK_CS_REG_FBD_LAYER_PTR);
cs_add64(&b, fbd_pointer, scratch_fbd_ptr_reg, 0);
cs_emit_layer_fragment_state(&b, fbd_pointer);
cs_trace_run_fragment2(&b, &tracing_ctx, run_fragment_regs, false,
MALI_TILE_RENDER_ORDER_Z_ORDER);
#else
/* Set FBD pointer to the scratch fbd */
cs_add64(&b, cs_sr_reg64(&b, FRAGMENT, FBD_POINTER),
scratch_fbd_ptr_reg, fb_tag.opaque[0]);
cs_trace_run_fragment(&b, &tracing_ctx, run_fragment_regs, false,
MALI_TILE_RENDER_ORDER_Z_ORDER);
#endif
/* Serialize run fragments since we reuse FBD for the runs */
cs_wait_slots(&b, dev->csf.sb.all_iters_mask);

View file

@ -717,7 +717,12 @@ init_tiler(struct panvk_gpu_queue *queue)
tiler_heap->chunk_size = phys_dev->csf.tiler.chunk_size;
alloc_info.size = get_fbd_size(true, MAX_RTS);
alloc_info.alignment = pan_alignment(FRAMEBUFFER);
#if PAN_ARCH >= 14
const unsigned fbds_alignment = alignof(struct pan_fbd_layer);
#else
const unsigned fbds_alignment = pan_alignment(FRAMEBUFFER);
#endif
alloc_info.alignment = fbds_alignment;
tiler_heap->oom_fbd = panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info);
if (!panvk_priv_mem_check_alloc(tiler_heap->oom_fbd)) {
result = panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,

View file

@ -181,7 +181,7 @@ panvk_per_arch(cmd_close_batch)(struct panvk_cmd_buffer *cmdbuf)
fbd_info.layer = layer_id;
fbd_info.frame_shaders = fs;
fbd_info.frame_shaders.dcd_pointer += layer_id * 3 * pan_size(DRAW);
tagged_fbd_ptr |= GENX(pan_emit_fb_desc)(&fbd_info, fbd.cpu);
tagged_fbd_ptr |= GENX(pan_emit_fb_desc)(&fbd_info, fbd);
result = panvk_cmd_prepare_fragment_job(cmdbuf, tagged_fbd_ptr);
if (result != VK_SUCCESS)

View file

@ -51,8 +51,9 @@ panvk_per_arch(cmd_dispatch_prepare_tls)(
unsigned core_id_range;
pan_query_core_count(&phys_dev->kmod.dev->props, &core_id_range);
batch->tlsinfo.wls.instances = pan_calc_wls_instances(
&cs->cs.local_size, &phys_dev->kmod.dev->props, indirect ? NULL : dim);
batch->tlsinfo.wls.instances =
pan_calc_wls_instances(&cs->cs.local_size, &phys_dev->kmod.dev->props,
indirect ? NULL : dim, cs->info.work_reg_count);
batch->wls_total_size = pan_calc_total_wls_size(
batch->tlsinfo.wls.size, batch->tlsinfo.wls.instances, core_id_range);
}

View file

@ -14,6 +14,7 @@ panvk_entrypoints = custom_target(
'--device-prefix', 'panvk_v6', '--device-prefix', 'panvk_v7',
'--device-prefix', 'panvk_v9', '--device-prefix', 'panvk_v10',
'--device-prefix', 'panvk_v12', '--device-prefix', 'panvk_v13',
'--device-prefix', 'panvk_v14', '--device-prefix', 'panvk_v15',
'--beta', with_vulkan_beta.to_string()
],
depend_files : vk_entrypoints_gen_depend_files,
@ -65,7 +66,7 @@ valhall_archs = [9, 10]
valhall_inc_dir = ['valhall']
valhall_files = []
fifthgen_archs = [12, 13]
fifthgen_archs = [12, 13, 14, 15]
fifthgen_inc_dir = ['fifthgen']
fifthgen_files = []
@ -83,7 +84,7 @@ jm_files = [
'jm/panvk_vX_gpu_queue.c',
]
csf_archs = [10, 12, 13]
csf_archs = [10, 12, 13, 14, 15]
csf_inc_dir = ['csf']
csf_files = [
'csf/panvk_vX_bind_queue.c',
@ -126,7 +127,7 @@ common_per_arch_files = [
sha1_h,
]
foreach arch : [6, 7, 10, 12, 13]
foreach arch : [6, 7, 10, 12, 13, 14, 15]
per_arch_files = common_per_arch_files
inc_panvk_per_arch = []

View file

@ -243,7 +243,7 @@ struct panvk_cmd_graphics_state {
} \
} while (0)
#if PAN_ARCH >= 10
#if PAN_ARCH >= 10 && PAN_ARCH < 14
struct panvk_device_draw_context {
struct panvk_priv_bo *fns_bo;
uint64_t fn_set_fbds_provoking_vertex_stride;
@ -376,8 +376,7 @@ cached_fs_required(ASSERTED const struct panvk_cmd_graphics_state *state,
gfx_state_set_dirty(__cmdbuf, FS_PUSH_UNIFORMS); \
} while (0)
#if PAN_ARCH >= 10
#if PAN_ARCH >= 10 && PAN_ARCH < 14
VkResult
panvk_per_arch(device_draw_context_init)(struct panvk_device *dev);

View file

@ -61,6 +61,12 @@ panvk_catch_indirect_alloc_failure(VkResult error)
case 13: \
panvk_arch_name(name, v13)(__VA_ARGS__); \
break; \
case 14: \
panvk_arch_name(name, v14)(__VA_ARGS__); \
break; \
case 15: \
panvk_arch_name(name, v15)(__VA_ARGS__); \
break; \
default: \
UNREACHABLE("Unsupported architecture"); \
} \
@ -84,6 +90,12 @@ panvk_catch_indirect_alloc_failure(VkResult error)
case 13: \
ret = panvk_arch_name(name, v13)(__VA_ARGS__); \
break; \
case 14: \
ret = panvk_arch_name(name, v14)(__VA_ARGS__); \
break; \
case 15: \
ret = panvk_arch_name(name, v15)(__VA_ARGS__); \
break; \
default: \
UNREACHABLE("Unsupported architecture"); \
} \
@ -102,6 +114,10 @@ panvk_catch_indirect_alloc_failure(VkResult error)
#define panvk_per_arch(name) panvk_arch_name(name, v12)
#elif PAN_ARCH == 13
#define panvk_per_arch(name) panvk_arch_name(name, v13)
#elif PAN_ARCH == 14
#define panvk_per_arch(name) panvk_arch_name(name, v14)
#elif PAN_ARCH == 15
#define panvk_per_arch(name) panvk_arch_name(name, v15)
#else
#error "Unsupported arch"
#endif

View file

@ -64,6 +64,8 @@ PER_ARCH_FUNCS(7);
PER_ARCH_FUNCS(10);
PER_ARCH_FUNCS(12);
PER_ARCH_FUNCS(13);
PER_ARCH_FUNCS(14);
PER_ARCH_FUNCS(15);
static VkResult
create_kmod_dev(struct panvk_physical_device *device,
@ -411,6 +413,8 @@ panvk_physical_device_init(struct panvk_physical_device *device,
switch (arch) {
case 6:
case 7:
case 14:
case 15:
if (!os_get_option("PAN_I_WANT_A_BROKEN_VULKAN_DRIVER")) {
result = panvk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
"WARNING: panvk is not well-tested on v%d, "

View file

@ -239,10 +239,15 @@ get_frame_shader(struct panvk_device *dev,
panvk_priv_mem_write_desc(shader->spd, 0, SHADER_PROGRAM, cfg) {
cfg.stage = MALI_SHADER_STAGE_FRAGMENT;
cfg.fragment_coverage_bitmask_type = MALI_COVERAGE_BITMASK_TYPE_GL;
#if PAN_ARCH >= 15
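/* v15 takes an explicit register count and preloads through r0-r15,
 * replacing the register_allocation bucket and r48-r63 preload window
 * used on earlier generations. */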
cfg.register_count = shader->info.work_reg_count;
cfg.preload.r0_r15 = shader->info.preload;
#else
cfg.register_allocation =
pan_register_allocation(shader->info.work_reg_count);
cfg.binary = panvk_priv_mem_dev_addr(shader->code_mem);
cfg.preload.r48_r63 = shader->info.preload >> 48;
#endif
cfg.binary = panvk_priv_mem_dev_addr(shader->code_mem);
}
#endif

View file

@ -550,7 +550,7 @@ panvk_per_arch(create_device)(struct panvk_physical_device *physical_device,
goto err_free_precomp;
}
#if PAN_ARCH >= 10
#if PAN_ARCH >= 10 && PAN_ARCH < 14
result = panvk_per_arch(device_draw_context_init)(device);
if (result != VK_SUCCESS)
goto err_free_mem_cache;
@ -616,7 +616,7 @@ err_finish_queues:
panvk_meta_cleanup(device);
err_free_draw_ctx:
#if PAN_ARCH >= 10
#if PAN_ARCH >= 10 && PAN_ARCH < 14
panvk_per_arch(device_draw_context_cleanup)(device);
err_free_mem_cache:
#endif
@ -679,7 +679,7 @@ panvk_per_arch(destroy_device)(struct panvk_device *device,
}
panvk_precomp_cleanup(device);
#if PAN_ARCH >= 10
#if PAN_ARCH >= 10 && PAN_ARCH < 14
panvk_per_arch(device_draw_context_cleanup)(device);
#endif
panvk_meta_cleanup(device);

View file

@ -732,6 +732,18 @@ get_conformance_version()
return (VkConformanceVersion){0, 0, 0, 0};
}
static uint32_t
get_device_id(uint64_t gpu_id)
{
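/* On wide-ID GPUs, fold the 64-bit GPU_ID into a stable 32-bit deviceID:
 * the 0xF compat marker in the top nibble, then the arch major/minor and
 * the truncated product/version fields. */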
if (PAN_ARCH >= PAN_ID64_COMPAT)
return ((PAN_ID64_COMPAT << 28) | (PAN_ID64_ARCH_MAJOR(gpu_id) << 20) |
(PAN_ID64_ARCH_MINOR(gpu_id) << 12) |
((PAN_ID64_PRODUCT_MAJOR(gpu_id) & 0xF) << 8) |
((PAN_ID64_VERSION_MAJOR(gpu_id) & 0xF) << 4) |
(PAN_ID64_VERSION_MINOR(gpu_id) & 0xF));
return (gpu_id & 0xFFFFFFFF);
}
void
panvk_per_arch(get_physical_device_properties)(
const struct panvk_instance *instance,
@ -750,8 +762,17 @@ panvk_per_arch(get_physical_device_properties)(
const bool has_disk_cache = device->vk.disk_cache != NULL;
/* Calculate the value from the register count on v15+.
* TODO: deriving the real limit requires register-allocation changes that
* guarantee the workgroup-size limits are respected, so for now clamp the
* value to half of the max thread count (always safe, and matching previous
* GPUs). */
unsigned max_threads_per_wg =
(PAN_ARCH >= 15)
? MIN2(pan_compute_max_thread_count(&device->kmod.dev->props, 32),
device->kmod.dev->props.max_threads_per_core / 2)
: device->kmod.dev->props.max_threads_per_wg;
/* Ensure that the max thread count per workgroup is valid for Bifrost */
assert(PAN_ARCH > 8 || device->kmod.dev->props.max_threads_per_wg <= 1024);
assert(PAN_ARCH > 8 || max_threads_per_wg <= 1024);
float pointSizeRangeMin;
float pointSizeRangeMax;
@ -770,7 +791,7 @@ panvk_per_arch(get_physical_device_properties)(
.driverVersion = vk_get_driver_version(),
.vendorID =
instance->force_vk_vendor ? instance->force_vk_vendor : ARM_VENDOR_ID,
.deviceID = device->kmod.dev->props.gpu_id,
.deviceID = get_device_id(device->kmod.dev->props.gpu_id),
.deviceType = VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU,
/* Vulkan 1.0 limits */
@ -880,11 +901,9 @@ panvk_per_arch(get_physical_device_properties)(
/* We could also split into several jobs, but this has many limitations.
* As such we limit to the max threads per workgroup supported by the GPU.
*/
.maxComputeWorkGroupInvocations =
device->kmod.dev->props.max_threads_per_wg,
.maxComputeWorkGroupSize = {device->kmod.dev->props.max_threads_per_wg,
device->kmod.dev->props.max_threads_per_wg,
device->kmod.dev->props.max_threads_per_wg},
.maxComputeWorkGroupInvocations = max_threads_per_wg,
.maxComputeWorkGroupSize = {max_threads_per_wg, max_threads_per_wg,
max_threads_per_wg},
/* 8-bit subpixel precision. */
.subPixelPrecisionBits = 8,
.subTexelPrecisionBits = 8,
@ -1075,8 +1094,7 @@ panvk_per_arch(get_physical_device_properties)(
.minSubgroupSize = pan_subgroup_size(PAN_ARCH),
.maxSubgroupSize = pan_subgroup_size(PAN_ARCH),
.maxComputeWorkgroupSubgroups =
device->kmod.dev->props.max_threads_per_wg /
pan_subgroup_size(PAN_ARCH),
max_threads_per_wg / pan_subgroup_size(PAN_ARCH),
.requiredSubgroupSizeStages = VK_SHADER_STAGE_COMPUTE_BIT,
.maxInlineUniformBlockSize = MAX_INLINE_UNIFORM_BLOCK_SIZE,
.maxPerStageDescriptorInlineUniformBlocks =

View file

@ -1172,10 +1172,15 @@ panvk_shader_upload(struct panvk_device *dev,
cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF;
#endif
#if PAN_ARCH >= 15
cfg.register_count = shader->info.work_reg_count;
cfg.preload.r0_r15 = shader->info.preload;
#else
cfg.register_allocation =
pan_register_allocation(shader->info.work_reg_count);
cfg.binary = panvk_shader_variant_get_dev_addr(shader);
cfg.preload.r48_r63 = (shader->info.preload >> 48);
#endif
cfg.binary = panvk_shader_variant_get_dev_addr(shader);
cfg.flush_to_zero_mode = shader_ftz_mode(shader);
if (cfg.stage == MALI_SHADER_STAGE_FRAGMENT)
@ -1191,10 +1196,15 @@ panvk_shader_upload(struct panvk_device *dev,
panvk_priv_mem_write_desc(shader->spds.all_points, 0, SHADER_PROGRAM,
cfg) {
cfg.stage = pan_shader_stage(&shader->info);
#if PAN_ARCH >= 15
cfg.register_count = shader->info.work_reg_count;
cfg.preload.r0_r15 = shader->info.preload;
#else
cfg.register_allocation =
pan_register_allocation(shader->info.work_reg_count);
cfg.binary = panvk_shader_variant_get_dev_addr(shader);
cfg.preload.r48_r63 = (shader->info.preload >> 48);
#endif
cfg.binary = panvk_shader_variant_get_dev_addr(shader);
cfg.flush_to_zero_mode = shader_ftz_mode(shader);
}
@ -1206,11 +1216,16 @@ panvk_shader_upload(struct panvk_device *dev,
panvk_priv_mem_write_desc(shader->spds.all_triangles, 0, SHADER_PROGRAM,
cfg) {
cfg.stage = pan_shader_stage(&shader->info);
#if PAN_ARCH >= 15
cfg.register_count = shader->info.work_reg_count;
cfg.preload.r0_r15 = shader->info.preload;
#else
cfg.register_allocation =
pan_register_allocation(shader->info.work_reg_count);
cfg.preload.r48_r63 = (shader->info.preload >> 48);
#endif
cfg.binary = panvk_shader_variant_get_dev_addr(shader) +
shader->info.vs.no_psiz_offset;
cfg.preload.r48_r63 = (shader->info.preload >> 48);
cfg.flush_to_zero_mode = shader_ftz_mode(shader);
}
#else