mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-05-08 02:38:04 +02:00
Merge branch 'panfrost-v15' into 'main'
Draft: panfrost: Add v15 support. See merge request mesa/mesa!41366.
This commit is contained in:
commit
7731477125
92 changed files with 10165 additions and 769 deletions
|
|
@ -34,6 +34,10 @@ The following hardware is currently supported:
|
|||
+--------------------+---------------+-----------+--------+--------+
|
||||
| G725 | 5th Gen (v13) | 3.1 | 3.1 | 1.4 |
|
||||
+--------------------+---------------+-----------+--------+--------+
|
||||
| G1-Pro | 5th Gen (v14) | 3.1 | 3.1 | 1.4 |
|
||||
+--------------------+---------------+-----------+--------+--------+
|
||||
| TMAx | 5th Gen (v15) | 3.1 | 3.1 | 1.4 |
|
||||
+--------------------+---------------+-----------+--------+--------+
|
||||
|
||||
Other Midgard and Bifrost chips (e.g. G71) are not yet supported.
|
||||
|
||||
|
|
|
|||
|
|
@ -350,7 +350,7 @@ struct drm_panthor_gpu_info {
|
|||
__u32 as_present;
|
||||
|
||||
/**
|
||||
* @select_coherency: Coherency selected for this device.
|
||||
* @selected_coherency: Coherency selected for this device.
|
||||
*
|
||||
* One of drm_panthor_gpu_coherency.
|
||||
*/
|
||||
|
|
@ -368,11 +368,27 @@ struct drm_panthor_gpu_info {
|
|||
/** @core_features: Used to discriminate core variants when they exist. */
|
||||
__u32 core_features;
|
||||
|
||||
/** @pad: MBZ. */
|
||||
__u32 pad;
|
||||
/** @thread_num_active_granularity: Granularity of number of active threads */
|
||||
__u32 thread_num_active_granularity;
|
||||
|
||||
/** @gpu_features: Bitmask describing supported GPU-wide features */
|
||||
__u64 gpu_features;
|
||||
|
||||
/** @gpu_wide_id: 64-bit GPU_ID for v15 onwards. */
|
||||
__u64 gpu_wide_id;
|
||||
#define DRM_PANTHOR_WIDE_ARCH_MAJOR(x) (((x) >> 56) & 0xff)
|
||||
#define DRM_PANTHOR_WIDE_ARCH_MINOR(x) (((x) >> 48) & 0xff)
|
||||
#define DRM_PANTHOR_WIDE_ARCH_REV(x) (((x) >> 40) & 0xff)
|
||||
#define DRM_PANTHOR_WIDE_PRODUCT_MAJOR(x) (((x) >> 32) & 0xff)
|
||||
#define DRM_PANTHOR_WIDE_VERSION_MAJOR(x) (((x) >> 16) & 0xff)
|
||||
#define DRM_PANTHOR_WIDE_VERSION_MINOR(x) (((x) >> 8) & 0xff)
|
||||
#define DRM_PANTHOR_WIDE_VERSION_STATUS(x) ((x) & 0xff)
|
||||
|
||||
/** @gpu_rev_wide: 64-bit GPU revision for v15 onwards */
|
||||
__u64 gpu_rev_wide;
|
||||
|
||||
/** @l2_features_wide: 64-bit L2_FEATURES for v15 onwards */
|
||||
__u64 l2_features_wide;
|
||||
};
|
||||
|
||||
/**
|
||||
|
|
@ -409,6 +425,38 @@ struct drm_panthor_csif_info {
|
|||
__u32 pad;
|
||||
};
|
||||
|
||||
/**
|
||||
* enum drm_panthor_timestamp_info_flags - drm_panthor_timestamp_info.flags
|
||||
*/
|
||||
enum drm_panthor_timestamp_info_flags {
|
||||
/** @DRM_PANTHOR_TIMESTAMP_GPU: Query GPU time. */
|
||||
DRM_PANTHOR_TIMESTAMP_GPU = 1 << 0,
|
||||
|
||||
/** @DRM_PANTHOR_TIMESTAMP_CPU_NONE: Don't query CPU time. */
|
||||
DRM_PANTHOR_TIMESTAMP_CPU_NONE = 0 << 1,
|
||||
|
||||
/** @DRM_PANTHOR_TIMESTAMP_CPU_MONOTONIC: Query CPU time using CLOCK_MONOTONIC. */
|
||||
DRM_PANTHOR_TIMESTAMP_CPU_MONOTONIC = 1 << 1,
|
||||
|
||||
/** @DRM_PANTHOR_TIMESTAMP_CPU_MONOTONIC_RAW: Query CPU time using CLOCK_MONOTONIC_RAW. */
|
||||
DRM_PANTHOR_TIMESTAMP_CPU_MONOTONIC_RAW = 2 << 1,
|
||||
|
||||
/** @DRM_PANTHOR_TIMESTAMP_CPU_TYPE_MASK: Space reserved for CPU clock type. */
|
||||
DRM_PANTHOR_TIMESTAMP_CPU_TYPE_MASK = 7 << 1,
|
||||
|
||||
/** @DRM_PANTHOR_TIMESTAMP_GPU_OFFSET: Query GPU offset. */
|
||||
DRM_PANTHOR_TIMESTAMP_GPU_OFFSET = 1 << 4,
|
||||
|
||||
/** @DRM_PANTHOR_TIMESTAMP_GPU_CYCLE_COUNT: Query GPU cycle count. */
|
||||
DRM_PANTHOR_TIMESTAMP_GPU_CYCLE_COUNT = 1 << 5,
|
||||
|
||||
/** @DRM_PANTHOR_TIMESTAMP_FREQ: Query timestamp frequency. */
|
||||
DRM_PANTHOR_TIMESTAMP_FREQ = 1 << 6,
|
||||
|
||||
/** @DRM_PANTHOR_TIMESTAMP_DURATION: Return duration of time query. */
|
||||
DRM_PANTHOR_TIMESTAMP_DURATION = 1 << 7,
|
||||
};
|
||||
|
||||
/**
|
||||
* struct drm_panthor_timestamp_info - Timestamp information
|
||||
*
|
||||
|
|
@ -421,11 +469,38 @@ struct drm_panthor_timestamp_info {
|
|||
*/
|
||||
__u64 timestamp_frequency;
|
||||
|
||||
/** @current_timestamp: The current timestamp. */
|
||||
/** @current_timestamp: The current GPU timestamp. */
|
||||
__u64 current_timestamp;
|
||||
|
||||
/** @timestamp_offset: The offset of the timestamp timer. */
|
||||
/** @timestamp_offset: The offset of the GPU timestamp timer. */
|
||||
__u64 timestamp_offset;
|
||||
|
||||
/**
|
||||
* @flags: Bitmask of drm_panthor_timestamp_info_flags.
|
||||
*
|
||||
* If set to 0, then it is interpreted as:
|
||||
* DRM_PANTHOR_TIMESTAMP_GPU |
|
||||
* DRM_PANTHOR_TIMESTAMP_GPU_OFFSET |
|
||||
* DRM_PANTHOR_TIMESTAMP_FREQ
|
||||
*
|
||||
* Note: these flags are exclusive to each other (only one can be used):
|
||||
* - DRM_PANTHOR_TIMESTAMP_CPU_NONE
|
||||
* - DRM_PANTHOR_TIMESTAMP_CPU_MONOTONIC
|
||||
* - DRM_PANTHOR_TIMESTAMP_CPU_MONOTONIC_RAW
|
||||
*/
|
||||
__u32 flags;
|
||||
|
||||
/** @duration_nsec: Duration of time query. */
|
||||
__u32 duration_nsec;
|
||||
|
||||
/** @cycle_count: Value of GPU_CYCLE_COUNT. */
|
||||
__u64 cycle_count;
|
||||
|
||||
/** @cpu_timestamp_sec: Seconds part of CPU timestamp. */
|
||||
__u64 cpu_timestamp_sec;
|
||||
|
||||
/** @cpu_timestamp_nsec: Nanoseconds part of CPU timestamp. */
|
||||
__u64 cpu_timestamp_nsec;
|
||||
};
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -41,7 +41,7 @@ compile_args_panfrost = [
|
|||
'-Wno-pointer-arith'
|
||||
]
|
||||
|
||||
panfrost_versions = ['4', '5', '6', '7', '9', '10', '12', '13']
|
||||
panfrost_versions = ['4', '5', '6', '7', '9', '10', '12', '13', '14', '15']
|
||||
libpanfrost_versions = []
|
||||
|
||||
foreach ver : panfrost_versions
|
||||
|
|
@ -54,7 +54,7 @@ foreach ver : panfrost_versions
|
|||
]
|
||||
if ver in ['4', '5', '6', '7', '9']
|
||||
files_panfrost_vx += ['pan_jm.c']
|
||||
elif ver in ['10', '12', '13']
|
||||
elif ver in ['10', '12', '13', '14', '15']
|
||||
files_panfrost_vx += ['pan_csf.c']
|
||||
endif
|
||||
libpanfrost_versions += static_library(
|
||||
|
|
|
|||
|
|
@ -49,7 +49,7 @@
|
|||
* functions. */
|
||||
#if PAN_ARCH <= 9
|
||||
#define JOBX(__suffix) GENX(jm_##__suffix)
|
||||
#elif PAN_ARCH <= 13
|
||||
#elif PAN_ARCH <= 15
|
||||
#define JOBX(__suffix) GENX(csf_##__suffix)
|
||||
#else
|
||||
#error "Unsupported arch"
|
||||
|
|
@ -1661,7 +1661,8 @@ panfrost_emit_shared_memory(struct panfrost_batch *batch,
|
|||
.tls.size = ss->info.tls_size,
|
||||
.wls.size = ss->info.wls_size + grid->variable_shared_mem,
|
||||
.wls.instances = pan_calc_wls_instances(
|
||||
&local_size, &dev->kmod.dev->props, grid->indirect ? NULL : &dim),
|
||||
&local_size, &dev->kmod.dev->props, grid->indirect ? NULL : &dim,
|
||||
ss->info.work_reg_count),
|
||||
};
|
||||
|
||||
if (ss->info.tls_size) {
|
||||
|
|
@ -4455,11 +4456,15 @@ prepare_shader(struct panfrost_compiled_shader *state,
|
|||
else if (vs)
|
||||
cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF;
|
||||
#endif
|
||||
|
||||
#if PAN_ARCH >= 15
|
||||
cfg.register_count = state->info.work_reg_count;
|
||||
cfg.preload.r0_r15 = state->info.preload;
|
||||
#else
|
||||
cfg.register_allocation =
|
||||
pan_register_allocation(state->info.work_reg_count);
|
||||
cfg.binary = state->bin.gpu;
|
||||
cfg.preload.r48_r63 = (state->info.preload >> 48);
|
||||
#endif
|
||||
cfg.binary = state->bin.gpu;
|
||||
cfg.flush_to_zero_mode = panfrost_ftz_mode(&state->info);
|
||||
|
||||
if (cfg.stage == MALI_SHADER_STAGE_FRAGMENT)
|
||||
|
|
@ -4475,10 +4480,15 @@ prepare_shader(struct panfrost_compiled_shader *state,
|
|||
#if PAN_ARCH < 12
|
||||
cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF;
|
||||
#endif
|
||||
#if PAN_ARCH >= 15
|
||||
cfg.register_count = state->info.work_reg_count;
|
||||
cfg.preload.r0_r15 = state->info.preload;
|
||||
#else
|
||||
cfg.register_allocation =
|
||||
pan_register_allocation(state->info.work_reg_count);
|
||||
cfg.binary = state->bin.gpu + state->info.vs.no_psiz_offset;
|
||||
cfg.preload.r48_r63 = (state->info.preload >> 48);
|
||||
#endif
|
||||
cfg.binary = state->bin.gpu + state->info.vs.no_psiz_offset;
|
||||
cfg.flush_to_zero_mode = panfrost_ftz_mode(&state->info);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
/*
|
||||
* Copyright (C) 2023 Collabora Ltd.
|
||||
* Copyright (C) 2026 Arm Ltd.
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
|
|
@ -13,6 +14,7 @@
|
|||
#include "pan_cmdstream.h"
|
||||
#include "pan_context.h"
|
||||
#include "pan_csf.h"
|
||||
#include "pan_fb.h"
|
||||
#include "pan_fb_preload.h"
|
||||
#include "pan_job.h"
|
||||
#include "pan_trace.h"
|
||||
|
|
@ -75,6 +77,99 @@ csf_update_tiler_oom_ctx(struct cs_builder *b, uint64_t addr)
|
|||
(PAN_INCREMENTAL_RENDERING_##_pass##_PASS * sizeof(struct pan_ptr)) + \
|
||||
offsetof(struct pan_ptr, gpu))
|
||||
|
||||
#if PAN_ARCH >= 14
|
||||
static void
|
||||
cs_emit_static_fragment_state(struct cs_builder *b,
|
||||
struct panfrost_batch *batch,
|
||||
const struct pan_fb_info *fb)
|
||||
{
|
||||
struct mali_fragment_bounding_box_packed bbox;
|
||||
pan_pack(&bbox, FRAGMENT_BOUNDING_BOX, cfg) {
|
||||
cfg.bound_min_x = batch->minx;
|
||||
cfg.bound_min_y = batch->miny;
|
||||
cfg.bound_max_x = batch->maxx - 1;
|
||||
cfg.bound_max_y = batch->maxy - 1;
|
||||
}
|
||||
|
||||
struct mali_frame_size_packed frame_size;
|
||||
pan_pack(&frame_size, FRAME_SIZE, cfg) {
|
||||
cfg.width = fb->width;
|
||||
cfg.height = fb->height;
|
||||
}
|
||||
|
||||
cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, BOUNDING_BOX),
|
||||
bbox.opaque[0] | ((uint64_t)bbox.opaque[1] << 32));
|
||||
cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, FRAME_SIZE), frame_size.opaque[0]);
|
||||
cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, SAMPLE_POSITION_ARRAY_POINTER),
|
||||
fb->sample_positions);
|
||||
|
||||
struct mali_fragment_flags_1_packed flags1;
|
||||
pan_pack(&flags1, FRAGMENT_FLAGS_1, cfg) {
|
||||
/* The force_samples setting dictates the sample-count that is used
|
||||
* for rasterization, and works like D3D11's ForcedSampleCount
|
||||
* feature:
|
||||
*
|
||||
* - If force_samples == 0: Let nr_samples dictate sample count
|
||||
* - If force_samples == 1: force single-sampled rasterization
|
||||
* - If force_samples > 1: force multi-sampled rasterization
|
||||
*
|
||||
* This can be used to read SYSTEM_VALUE_SAMPLE_MASK_IN from the
|
||||
* fragment shader, even when performing single-sampled rendering.
|
||||
*/
|
||||
if (fb->pls_enabled) {
|
||||
cfg.sample_count = 4;
|
||||
cfg.sample_pattern = pan_sample_pattern(1);
|
||||
} else if (!fb->force_samples) {
|
||||
cfg.sample_count = fb->nr_samples;
|
||||
cfg.sample_pattern = pan_sample_pattern(fb->nr_samples);
|
||||
} else if (fb->force_samples == 1) {
|
||||
cfg.sample_count = fb->nr_samples;
|
||||
cfg.sample_pattern = pan_sample_pattern(1);
|
||||
} else {
|
||||
cfg.sample_count = 1;
|
||||
cfg.sample_pattern = pan_sample_pattern(fb->force_samples);
|
||||
}
|
||||
|
||||
cfg.effective_tile_size = fb->tile_size;
|
||||
cfg.point_sprite_coord_origin_max_y = fb->sprite_coord_origin;
|
||||
cfg.first_provoking_vertex = fb->first_provoking_vertex;
|
||||
cfg.render_target_count = MAX2(fb->rt_count, 1);
|
||||
cfg.color_buffer_allocation = fb->cbuf_allocation;
|
||||
}
|
||||
|
||||
cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, FLAGS_1), flags1.opaque[0]);
|
||||
|
||||
/* Leave the remaining RUN_FRAGMENT2 staging registers as zero. */
|
||||
}
|
||||
|
||||
#define PAN_CS_REG_FBD_LAYER_PTR 54
|
||||
|
||||
static inline void
|
||||
cs_emit_layer_fragment_state(struct cs_builder *b, struct cs_index fbd_ptr)
|
||||
{
|
||||
/* Emit the dynamic fragment state. This state may change per-layer. */
|
||||
|
||||
cs_load32_to(b, cs_sr_reg32(b, FRAGMENT, FLAGS_0), fbd_ptr,
|
||||
offsetof(struct pan_fbd_layer, flags0));
|
||||
cs_load32_to(b, cs_sr_reg32(b, FRAGMENT, FLAGS_2), fbd_ptr,
|
||||
offsetof(struct pan_fbd_layer, flags2));
|
||||
cs_load32_to(b, cs_sr_reg32(b, FRAGMENT, Z_CLEAR), fbd_ptr,
|
||||
offsetof(struct pan_fbd_layer, z_clear));
|
||||
cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, TILER_DESCRIPTOR_POINTER), fbd_ptr,
|
||||
offsetof(struct pan_fbd_layer, tiler));
|
||||
cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, RTD_POINTER), fbd_ptr,
|
||||
offsetof(struct pan_fbd_layer, rtd_pointer));
|
||||
cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, DBD_POINTER), fbd_ptr,
|
||||
offsetof(struct pan_fbd_layer, dbd_pointer));
|
||||
cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, FRAME_ARG), fbd_ptr,
|
||||
offsetof(struct pan_fbd_layer, frame_argument));
|
||||
cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, FRAME_SHADER_DCD_POINTER), fbd_ptr,
|
||||
offsetof(struct pan_fbd_layer, dcd_pointer));
|
||||
|
||||
cs_flush_loads(b);
|
||||
}
|
||||
#endif /* PAN_ARCH >= 14 */
|
||||
|
||||
static int
|
||||
csf_oom_handler_init(struct panfrost_context *ctx)
|
||||
{
|
||||
|
|
@ -113,13 +208,18 @@ csf_oom_handler_init(struct panfrost_context *ctx)
|
|||
|
||||
cs_function_def(&b, &handler, handler_ctx) {
|
||||
struct cs_index tiler_oom_ctx = cs_reg64(&b, TILER_OOM_CTX_REG);
|
||||
struct cs_index counter = cs_reg32(&b, 47);
|
||||
struct cs_index zero = cs_reg64(&b, 48);
|
||||
struct cs_index flush_id = cs_reg32(&b, 48);
|
||||
struct cs_index tiler_ctx = cs_reg64(&b, 50);
|
||||
struct cs_index completed_top = cs_reg64(&b, 52);
|
||||
struct cs_index completed_bottom = cs_reg64(&b, 54);
|
||||
struct cs_index completed_chunks = cs_reg_tuple(&b, 52, 4);
|
||||
struct cs_index counter = cs_reg32(&b, 31);
|
||||
struct cs_index zero = cs_reg64(&b, 56);
|
||||
struct cs_index flush_id = cs_reg32(&b, 58);
|
||||
struct cs_index tiler_ctx = cs_reg64(&b, 60);
|
||||
struct cs_index completed_top = cs_reg64(&b, 64);
|
||||
struct cs_index completed_bottom = cs_reg64(&b, 66);
|
||||
struct cs_index completed_chunks = cs_reg_tuple(&b, 64, 4);
|
||||
#if PAN_ARCH >= 14
|
||||
struct cs_index fbd_pointer = cs_reg64(&b, PAN_CS_REG_FBD_LAYER_PTR);
|
||||
#else
|
||||
struct cs_index fbd_pointer = cs_sr_reg64(&b, FRAGMENT, FBD_POINTER);
|
||||
#endif
|
||||
|
||||
/* Ensure that the OTHER endpoint is valid */
|
||||
#if PAN_ARCH >= 11
|
||||
|
|
@ -133,25 +233,31 @@ csf_oom_handler_init(struct panfrost_context *ctx)
|
|||
cs_load32_to(&b, counter, tiler_oom_ctx, FIELD_OFFSET(counter));
|
||||
cs_wait_slot(&b, 0);
|
||||
cs_if(&b, MALI_CS_CONDITION_GREATER, counter) {
|
||||
cs_load64_to(&b, cs_sr_reg64(&b, FRAGMENT, FBD_POINTER), tiler_oom_ctx,
|
||||
FBD_OFFSET(MIDDLE));
|
||||
cs_load64_to(&b, fbd_pointer, tiler_oom_ctx, FBD_OFFSET(MIDDLE));
|
||||
}
|
||||
cs_else(&b) {
|
||||
cs_load64_to(&b, cs_sr_reg64(&b, FRAGMENT, FBD_POINTER), tiler_oom_ctx,
|
||||
FBD_OFFSET(FIRST));
|
||||
cs_load64_to(&b, fbd_pointer, tiler_oom_ctx, FBD_OFFSET(FIRST));
|
||||
}
|
||||
|
||||
#if PAN_ARCH >= 14
|
||||
cs_emit_layer_fragment_state(&b, fbd_pointer);
|
||||
#else
|
||||
cs_load32_to(&b, cs_sr_reg32(&b, FRAGMENT, BBOX_MIN), tiler_oom_ctx,
|
||||
FIELD_OFFSET(bbox_min));
|
||||
cs_load32_to(&b, cs_sr_reg32(&b, FRAGMENT, BBOX_MAX), tiler_oom_ctx,
|
||||
FIELD_OFFSET(bbox_max));
|
||||
cs_move64_to(&b, cs_sr_reg64(&b, FRAGMENT, TEM_POINTER), 0);
|
||||
cs_move32_to(&b, cs_sr_reg32(&b, FRAGMENT, TEM_ROW_STRIDE), 0);
|
||||
#endif
|
||||
cs_wait_slot(&b, 0);
|
||||
|
||||
/* Run the fragment job and wait */
|
||||
cs_select_endpoint_sb(&b, 3);
|
||||
#if PAN_ARCH >= 14
|
||||
cs_run_fragment2(&b, false, MALI_TILE_RENDER_ORDER_Z_ORDER);
|
||||
#else
|
||||
cs_run_fragment(&b, false, MALI_TILE_RENDER_ORDER_Z_ORDER);
|
||||
#endif
|
||||
cs_wait_slot(&b, 3);
|
||||
|
||||
/* Increment counter */
|
||||
|
|
@ -218,6 +324,21 @@ GENX(csf_cleanup_batch)(struct panfrost_batch *batch)
|
|||
panfrost_pool_cleanup(&batch->csf.cs_chunk_pool);
|
||||
}
|
||||
|
||||
#if PAN_ARCH >= 14
|
||||
static inline struct pan_ptr
|
||||
alloc_fbd(struct panfrost_batch *batch)
|
||||
{
|
||||
const struct pan_desc_alloc_info fbd_layer = {
|
||||
.size = ALIGN_POT(sizeof(struct pan_fbd_layer), 64),
|
||||
.align = alignof(struct pan_fbd_layer),
|
||||
.nelems = 1,
|
||||
};
|
||||
|
||||
return pan_pool_alloc_desc_aggregate(
|
||||
&batch->pool.base, fbd_layer, PAN_DESC(ZS_CRC_EXTENSION),
|
||||
PAN_DESC_ARRAY(MAX2(batch->key.nr_cbufs, 1), RENDER_TARGET));
|
||||
}
|
||||
#else
|
||||
static inline struct pan_ptr
|
||||
alloc_fbd(struct panfrost_batch *batch)
|
||||
{
|
||||
|
|
@ -225,6 +346,7 @@ alloc_fbd(struct panfrost_batch *batch)
|
|||
&batch->pool.base, PAN_DESC(FRAMEBUFFER), PAN_DESC(ZS_CRC_EXTENSION),
|
||||
PAN_DESC_ARRAY(MAX2(batch->key.nr_cbufs, 1), RENDER_TARGET));
|
||||
}
|
||||
#endif /* PAN_ARCH >= 14 */
|
||||
|
||||
int
|
||||
GENX(csf_init_batch)(struct panfrost_batch *batch)
|
||||
|
|
@ -758,7 +880,7 @@ GENX(csf_preload_fb)(struct panfrost_batch *batch, struct pan_fb_info *fb)
|
|||
(_ctx)->fbds[PAN_INCREMENTAL_RENDERING_##_pass##_PASS]
|
||||
#define EMIT_FBD(_ctx, _pass, _fb, _tls, _tiler_ctx) \
|
||||
GET_FBD(_ctx, _pass).gpu |= \
|
||||
GENX(pan_emit_fbd)(_fb, 0, _tls, _tiler_ctx, GET_FBD(_ctx, _pass).cpu)
|
||||
GENX(pan_emit_fbd)(_fb, 0, _tls, _tiler_ctx, GET_FBD(_ctx, _pass))
|
||||
|
||||
void
|
||||
GENX(csf_emit_fbds)(struct panfrost_batch *batch, struct pan_fb_info *fb,
|
||||
|
|
@ -771,7 +893,7 @@ GENX(csf_emit_fbds)(struct panfrost_batch *batch, struct pan_fb_info *fb,
|
|||
/* Default framebuffer descriptor */
|
||||
|
||||
batch->framebuffer.gpu |=
|
||||
GENX(pan_emit_fbd)(fb, 0, tls, &batch->tiler_ctx, batch->framebuffer.cpu);
|
||||
GENX(pan_emit_fbd)(fb, 0, tls, &batch->tiler_ctx, batch->framebuffer);
|
||||
|
||||
if (batch->draw_count == 0)
|
||||
return;
|
||||
|
|
@ -854,15 +976,26 @@ GENX(csf_emit_fragment_job)(struct panfrost_batch *batch,
|
|||
cs_vt_end(b, cs_now());
|
||||
}
|
||||
|
||||
#if PAN_ARCH >= 14
|
||||
struct cs_index fbd_pointer = cs_reg64(b, PAN_CS_REG_FBD_LAYER_PTR);
|
||||
#else
|
||||
struct cs_index fbd_pointer = cs_sr_reg64(b, FRAGMENT, FBD_POINTER);
|
||||
#endif
|
||||
|
||||
/* Set up the fragment job */
|
||||
cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, FBD_POINTER),
|
||||
batch->framebuffer.gpu);
|
||||
cs_move64_to(b, fbd_pointer, batch->framebuffer.gpu);
|
||||
|
||||
#if PAN_ARCH >= 14
|
||||
cs_emit_static_fragment_state(b, batch, pfb);
|
||||
cs_emit_layer_fragment_state(b, fbd_pointer);
|
||||
#else
|
||||
cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, BBOX_MIN),
|
||||
(batch->miny << 16) | batch->minx);
|
||||
cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, BBOX_MAX),
|
||||
((batch->maxy - 1) << 16) | (batch->maxx - 1));
|
||||
cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, TEM_POINTER), 0);
|
||||
cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, TEM_ROW_STRIDE), 0);
|
||||
#endif
|
||||
|
||||
/* Use different framebuffer descriptor if incremental rendering was
|
||||
* triggered while tiling */
|
||||
|
|
@ -871,13 +1004,19 @@ GENX(csf_emit_fragment_job)(struct panfrost_batch *batch,
|
|||
cs_load32_to(b, counter, cs_reg64(b, TILER_OOM_CTX_REG), 0);
|
||||
cs_wait_slot(b, 0);
|
||||
cs_if(b, MALI_CS_CONDITION_GREATER, counter) {
|
||||
cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, FBD_POINTER),
|
||||
GET_FBD(oom_ctx, LAST).gpu);
|
||||
cs_move64_to(b, fbd_pointer, GET_FBD(oom_ctx, LAST).gpu);
|
||||
#if PAN_ARCH >= 14
|
||||
cs_emit_layer_fragment_state(b, fbd_pointer);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
/* Run the fragment job and wait */
|
||||
#if PAN_ARCH >= 14
|
||||
cs_run_fragment2(b, false, MALI_TILE_RENDER_ORDER_Z_ORDER);
|
||||
#else
|
||||
cs_run_fragment(b, false, MALI_TILE_RENDER_ORDER_Z_ORDER);
|
||||
#endif
|
||||
cs_wait_slot(b, 2);
|
||||
|
||||
/* Gather freed heap chunks and add them to the heap context free list
|
||||
|
|
|
|||
|
|
@ -1105,9 +1105,14 @@ pan_preload_emit_dcd(struct pan_fb_preload_cache *cache, struct pan_pool *pool,
|
|||
pan_cast_and_pack(spd.cpu, SHADER_PROGRAM, cfg) {
|
||||
cfg.stage = MALI_SHADER_STAGE_FRAGMENT;
|
||||
cfg.fragment_coverage_bitmask_type = MALI_COVERAGE_BITMASK_TYPE_GL;
|
||||
#if PAN_ARCH >= 15
|
||||
cfg.register_count = preload_shader->info.work_reg_count;
|
||||
cfg.preload.r0_r15 = preload_shader->info.preload;
|
||||
#else
|
||||
cfg.register_allocation = MALI_SHADER_REGISTER_ALLOCATION_32_PER_THREAD;
|
||||
cfg.binary = preload_shader->address;
|
||||
cfg.preload.r48_r63 = preload_shader->info.preload >> 48;
|
||||
#endif
|
||||
cfg.binary = preload_shader->address;
|
||||
}
|
||||
|
||||
unsigned bd_count = views.rt_count;
|
||||
|
|
|
|||
|
|
@ -257,8 +257,8 @@ GENX(jm_emit_fbds)(struct panfrost_batch *batch, struct pan_fb_info *fb,
|
|||
{
|
||||
PAN_TRACE_FUNC(PAN_TRACE_GL_JM);
|
||||
|
||||
batch->framebuffer.gpu |= GENX(pan_emit_fbd)(
|
||||
fb, 0, tls, &batch->tiler_ctx, batch->framebuffer.cpu);
|
||||
batch->framebuffer.gpu |=
|
||||
GENX(pan_emit_fbd)(fb, 0, tls, &batch->tiler_ctx, batch->framebuffer);
|
||||
}
|
||||
|
||||
void
|
||||
|
|
|
|||
|
|
@ -98,10 +98,15 @@ panfrost_precomp_shader_create(
|
|||
|
||||
pan_cast_and_pack(spd.cpu, SHADER_PROGRAM, cfg) {
|
||||
cfg.stage = pan_shader_stage(&res->info);
|
||||
#if PAN_ARCH >= 15
|
||||
cfg.register_count = res->info.work_reg_count;
|
||||
cfg.preload.r0_r15 = res->info.preload;
|
||||
#else
|
||||
cfg.register_allocation =
|
||||
pan_register_allocation(res->info.work_reg_count);
|
||||
cfg.binary = res->code_ptr;
|
||||
cfg.preload.r48_r63 = (res->info.preload >> 48);
|
||||
#endif
|
||||
cfg.binary = res->code_ptr;
|
||||
cfg.flush_to_zero_mode = panfrost_ftz_mode(&res->info);
|
||||
}
|
||||
|
||||
|
|
@ -197,8 +202,9 @@ emit_tls(struct panfrost_batch *batch,
|
|||
struct pan_tls_info info = {
|
||||
.tls.size = shader->info.tls_size,
|
||||
.wls.size = shader->info.wls_size,
|
||||
.wls.instances = pan_calc_wls_instances(&shader->local_size,
|
||||
&dev->kmod.dev->props, dim),
|
||||
.wls.instances =
|
||||
pan_calc_wls_instances(&shader->local_size, &dev->kmod.dev->props, dim,
|
||||
shader->info.work_reg_count),
|
||||
};
|
||||
|
||||
if (info.tls.size) {
|
||||
|
|
@ -325,7 +331,17 @@ GENX(panfrost_launch_precomp)(struct panfrost_batch *batch,
|
|||
uint64_t fau_ptr = push_uniforms.gpu | (fau_count << 56);
|
||||
cs_move64_to(b, cs_sr_reg64(b, COMPUTE, FAU_0), fau_ptr);
|
||||
|
||||
#if PAN_ARCH >= 15
|
||||
struct mali_shader_program_pointer_packed spp;
|
||||
pan_pack(&spp, SHADER_PROGRAM_POINTER, ctx) {
|
||||
ctx.register_count = shader->info.work_reg_count;
|
||||
ctx.pointer = shader->state_ptr;
|
||||
}
|
||||
uint64_t ptr = ((uint64_t)spp.opaque[1] << 32) | spp.opaque[0];
|
||||
cs_move64_to(b, cs_sr_reg64(b, COMPUTE, SPD_0), ptr);
|
||||
#else
|
||||
cs_move64_to(b, cs_sr_reg64(b, COMPUTE, SPD_0), shader->state_ptr);
|
||||
#endif
|
||||
cs_move64_to(b, cs_sr_reg64(b, COMPUTE, TSD_0), tsd);
|
||||
|
||||
/* Global attribute offset */
|
||||
|
|
|
|||
|
|
@ -1175,6 +1175,12 @@ panfrost_create_screen(int fd, const struct pipe_screen_config *config,
|
|||
case 13:
|
||||
panfrost_cmdstream_screen_init_v13(screen);
|
||||
break;
|
||||
case 14:
|
||||
panfrost_cmdstream_screen_init_v14(screen);
|
||||
break;
|
||||
case 15:
|
||||
panfrost_cmdstream_screen_init_v15(screen);
|
||||
break;
|
||||
default:
|
||||
debug_printf("panfrost: Unhandled architecture major %d", dev->arch);
|
||||
panfrost_destroy_screen(&(screen->base));
|
||||
|
|
|
|||
|
|
@ -155,6 +155,8 @@ void panfrost_cmdstream_screen_init_v9(struct panfrost_screen *screen);
|
|||
void panfrost_cmdstream_screen_init_v10(struct panfrost_screen *screen);
|
||||
void panfrost_cmdstream_screen_init_v12(struct panfrost_screen *screen);
|
||||
void panfrost_cmdstream_screen_init_v13(struct panfrost_screen *screen);
|
||||
void panfrost_cmdstream_screen_init_v14(struct panfrost_screen *screen);
|
||||
void panfrost_cmdstream_screen_init_v15(struct panfrost_screen *screen);
|
||||
|
||||
#define perf_debug(ctx, ...) \
|
||||
do { \
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@
|
|||
#include "panfrost/compiler/bifrost/bifrost_compile.h"
|
||||
#include "panfrost/compiler/pan_compiler.h"
|
||||
#include "panfrost/compiler/pan_nir.h"
|
||||
#include "panfrost/model/pan_model.h"
|
||||
#include "nir.h"
|
||||
#include "nir_builder.h"
|
||||
#include "nir_builder_opcodes.h"
|
||||
|
|
@ -275,7 +276,7 @@ main(int argc, const char **argv)
|
|||
|
||||
unsigned target_arch = atoi(target_arch_str);
|
||||
|
||||
if (target_arch < 4 || target_arch > 13) {
|
||||
if (target_arch < 4 || target_arch > 15) {
|
||||
fprintf(stderr, "Unsupported target arch %d\n", target_arch);
|
||||
return 1;
|
||||
}
|
||||
|
|
@ -353,7 +354,12 @@ main(int argc, const char **argv)
|
|||
libfunc, MESA_SHADER_COMPUTE, v, get_compiler_options(target_arch),
|
||||
&opt, load_kernel_input);
|
||||
|
||||
uint64_t target_gpu_id = (target_arch & 0xf) << 28;
|
||||
uint64_t target_gpu_id;
|
||||
if (target_arch >= PAN_ID64_COMPAT)
|
||||
target_gpu_id =
|
||||
((uint64_t)(target_arch & 0xff) << 56) | (PAN_ID64_COMPAT << 28);
|
||||
else
|
||||
target_gpu_id = (target_arch & 0xf) << 28;
|
||||
|
||||
struct pan_compile_inputs inputs = {
|
||||
.gpu_id = target_gpu_id,
|
||||
|
|
|
|||
|
|
@ -16,14 +16,14 @@
|
|||
*/
|
||||
|
||||
static uint32_t
|
||||
va_op_swizzles(enum bi_opcode op, unsigned src)
|
||||
va_op_swizzles(enum bi_opcode op, unsigned src, unsigned arch)
|
||||
{
|
||||
/* This is a bifrost-only instruction that is lowered on valhall */
|
||||
if (!valhall_opcodes[op].exact)
|
||||
if (!get_valhall_opcode(op, arch).exact)
|
||||
return bi_op_swizzles[op][src];
|
||||
|
||||
uint32_t swizzles = 0;
|
||||
struct va_src_info info = va_src_info(op, src);
|
||||
struct va_src_info info = va_src_info(op, src, arch);
|
||||
|
||||
if (info.swizzle) {
|
||||
assert(info.size == VA_SIZE_16 || info.size == VA_SIZE_32);
|
||||
|
|
@ -99,8 +99,8 @@ bool
|
|||
bi_op_supports_swizzle(enum bi_opcode op, unsigned src,
|
||||
enum bi_swizzle swizzle, unsigned arch)
|
||||
{
|
||||
uint32_t supported_swizzles = arch >= 9 ?
|
||||
va_op_swizzles(op, src) : bi_op_swizzles[op][src];
|
||||
uint32_t supported_swizzles =
|
||||
arch >= 9 ? va_op_swizzles(op, src, arch) : bi_op_swizzles[op][src];
|
||||
return supported_swizzles & BITFIELD_BIT(swizzle);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -294,7 +294,8 @@ bi_compute_liveness_ra(bi_context *ctx)
|
|||
#define EVEN_BITS_MASK (0x5555555555555555ull)
|
||||
|
||||
static uint64_t
|
||||
bi_make_affinity(uint64_t clobber, unsigned count, bool split_file)
|
||||
bi_make_affinity(uint64_t clobber, unsigned count, bool split_file,
|
||||
unsigned arch)
|
||||
{
|
||||
uint64_t clobbered = 0;
|
||||
|
||||
|
|
@ -308,12 +309,12 @@ bi_make_affinity(uint64_t clobber, unsigned count, bool split_file)
|
|||
clobbered |= mask << (64 - excess);
|
||||
|
||||
if (split_file)
|
||||
clobbered |= mask << (16 - excess);
|
||||
clobbered |= mask << (((arch >= 15) ? 32 : 16) - excess);
|
||||
}
|
||||
|
||||
/* Don't allocate the middle if we split out the middle */
|
||||
if (split_file)
|
||||
clobbered |= BITFIELD64_MASK(32) << 16;
|
||||
clobbered |= BITFIELD64_MASK(32) << ((arch >= 15) ? 32 : 16);
|
||||
|
||||
/* We can use a register iff it's not clobbered */
|
||||
return ~clobbered;
|
||||
|
|
@ -341,7 +342,7 @@ bi_mark_interference(bi_block *block, struct lcra_state *l, uint8_t *live,
|
|||
unsigned count = bi_count_write_registers(ins, d);
|
||||
unsigned offset = ins->dest[d].offset;
|
||||
uint64_t affinity =
|
||||
bi_make_affinity(preload_live, count, split_file) >> offset;
|
||||
bi_make_affinity(preload_live, count, split_file, arch) >> offset;
|
||||
/* Valhall needs >= 64-bit staging writes to be pair-aligned */
|
||||
if (aligned_sr && (count >= 2 || offset))
|
||||
affinity &= EVEN_BITS_MASK;
|
||||
|
|
@ -381,8 +382,8 @@ bi_mark_interference(bi_block *block, struct lcra_state *l, uint8_t *live,
|
|||
bi_foreach_ssa_src(ins, s) {
|
||||
if (bi_count_read_registers(ins, s) >= 2)
|
||||
l->affinity[ins->src[s].value] &= EVEN_BITS_MASK;
|
||||
else if (s < valhall_opcodes[ins->op].nr_srcs &&
|
||||
va_src_info(ins->op, s).size > VA_SIZE_32)
|
||||
else if (s < get_valhall_opcode(ins->op, arch).nr_srcs &&
|
||||
va_src_info(ins->op, s, arch).size > VA_SIZE_32)
|
||||
l->affinity[ins->src[s].value] &= EVEN_BITS_MASK;
|
||||
}
|
||||
}
|
||||
|
|
@ -435,7 +436,8 @@ bi_allocate_registers(bi_context *ctx, bool *success, bool full_regs)
|
|||
uint64_t default_affinity =
|
||||
ctx->inputs->is_blend ? BITFIELD64_MASK(16)
|
||||
: full_regs ? BITFIELD64_MASK(64)
|
||||
: (BITFIELD64_MASK(16) | (BITFIELD64_MASK(16) << 48));
|
||||
: (ctx->arch >= 15) ? BITFIELD64_MASK(32)
|
||||
: (BITFIELD64_MASK(16) | (BITFIELD64_MASK(16) << 48));
|
||||
|
||||
/* To test spilling, mimic a small register file */
|
||||
if (bifrost_debug & BIFROST_DBG_SPILL && !ctx->inputs->is_blend && (bifrost_debug & BIFROST_DBG_NOSSARA))
|
||||
|
|
|
|||
|
|
@ -703,8 +703,10 @@ bi_emit_load_var_buf(bi_builder *b, nir_intrinsic_instr *intr)
|
|||
assert(intr->intrinsic == nir_intrinsic_load_var_buf_pan ||
|
||||
intr->intrinsic == nir_intrinsic_load_var_buf_flat_pan);
|
||||
|
||||
const unsigned arch = b->shader->arch;
|
||||
|
||||
/* These are only available on Valhall+ */
|
||||
assert(b->shader->arch >= 9);
|
||||
assert(arch >= 9);
|
||||
|
||||
const bool flat = intr->intrinsic == nir_intrinsic_load_var_buf_flat_pan;
|
||||
const nir_alu_type src_type = nir_intrinsic_src_type(intr);
|
||||
|
|
@ -757,19 +759,36 @@ bi_emit_load_var_buf(bi_builder *b, nir_intrinsic_instr *intr)
|
|||
bool use_imm_form = false;
|
||||
if (nir_src_is_const(intr->src[0])) {
|
||||
imm_offset = nir_src_as_uint(intr->src[0]);
|
||||
assert(imm_offset < pan_ld_var_buf_off_size(b->shader->arch));
|
||||
assert(imm_offset < pan_ld_var_buf_off_size(arch));
|
||||
|
||||
use_imm_form = true;
|
||||
}
|
||||
|
||||
/* On v14+, flat source formats are removed from LD_VAR_BUF/LD_VAR_BUF_IMM,
|
||||
* so flat buffer varyings must use the dedicated LD_VAR_BUF_FLAT*.
|
||||
*/
|
||||
if (use_imm_form) {
|
||||
bi_ld_var_buf_imm_to(b, sz, dest, src0, regfmt, sample, source_format,
|
||||
if (arch >= 14 && flat) {
|
||||
bi_ld_var_buf_flat_imm_to(b, dest, regfmt, vecsize, imm_offset);
|
||||
} else {
|
||||
bi_ld_var_buf_imm_to(b, sz, dest, src0, regfmt, sample, source_format,
|
||||
BI_UPDATE_STORE, vecsize, imm_offset);
|
||||
}
|
||||
} else {
|
||||
bi_index offset = bi_src_index(&intr->src[0]);
|
||||
bi_ld_var_buf_to(b, sz, dest, src0, offset, regfmt, sample,
|
||||
source_format, BI_UPDATE_STORE, vecsize);
|
||||
if (arch >= 14 && flat) {
|
||||
bi_ld_var_buf_flat_to(b, dest, offset, regfmt, vecsize);
|
||||
} else {
|
||||
bi_ld_var_buf_to(b, sz, dest, src0, offset, regfmt, sample,
|
||||
source_format, BI_UPDATE_STORE, vecsize);
|
||||
}
|
||||
}
|
||||
|
||||
/* LD_VAR_BUF_FLAT* only support register formats F16 and F32. */
|
||||
assert(
|
||||
arch < 14 || !flat ||
|
||||
(regfmt == BI_REGISTER_FORMAT_F16 || regfmt == BI_REGISTER_FORMAT_F32));
|
||||
|
||||
bi_split_def(b, &intr->def);
|
||||
}
|
||||
|
||||
|
|
@ -4146,13 +4165,13 @@ va_count_stats(bi_context *ctx, unsigned nr_ins, unsigned size,
|
|||
}
|
||||
|
||||
static unsigned
|
||||
va_gather_stats_block(bi_block *block, struct va_stats *counts)
|
||||
va_gather_stats_block(bi_block *block, unsigned arch, struct va_stats *counts)
|
||||
{
|
||||
unsigned nr_ins = 0;
|
||||
|
||||
bi_foreach_instr_in_block(block, I) {
|
||||
nr_ins++;
|
||||
va_count_instr_stats(I, counts);
|
||||
va_count_instr_stats(I, arch, counts);
|
||||
}
|
||||
return nr_ins;
|
||||
}
|
||||
|
|
@ -4161,7 +4180,8 @@ va_gather_stats_block(bi_block *block, struct va_stats *counts)
|
|||
* Gather stats for a minimum length path through the shader.
|
||||
*/
|
||||
static unsigned
|
||||
va_gather_min_path_stats(bi_block *block, struct va_stats *counts)
|
||||
va_gather_min_path_stats(bi_block *block, unsigned arch,
|
||||
struct va_stats *counts)
|
||||
{
|
||||
struct va_stats min_counts;
|
||||
struct va_stats save_counts = *counts;
|
||||
|
|
@ -4173,7 +4193,7 @@ va_gather_min_path_stats(bi_block *block, struct va_stats *counts)
|
|||
if (bi_block_dominates(next, block)) {
|
||||
continue;
|
||||
}
|
||||
nr_ins = va_gather_min_path_stats(next, counts);
|
||||
nr_ins = va_gather_min_path_stats(next, arch, counts);
|
||||
if (min_ins == 0 || nr_ins < min_ins) {
|
||||
min_ins = nr_ins;
|
||||
min_counts = *counts;
|
||||
|
|
@ -4183,7 +4203,7 @@ va_gather_min_path_stats(bi_block *block, struct va_stats *counts)
|
|||
if (min_ins != 0) {
|
||||
*counts = min_counts;
|
||||
}
|
||||
nr_ins = min_ins + va_gather_stats_block(block, counts);
|
||||
nr_ins = min_ins + va_gather_stats_block(block, arch, counts);
|
||||
return nr_ins;
|
||||
}
|
||||
|
||||
|
|
@ -4194,7 +4214,8 @@ va_gather_min_path_stats(bi_block *block, struct va_stats *counts)
|
|||
* bail out.
|
||||
*/
|
||||
static unsigned
|
||||
va_gather_max_path_stats(bi_block *block, struct va_stats *counts, BITSET_WORD *visited)
|
||||
va_gather_max_path_stats(bi_block *block, unsigned arch,
|
||||
struct va_stats *counts, BITSET_WORD *visited)
|
||||
{
|
||||
struct va_stats max_counts;
|
||||
struct va_stats save_counts = *counts;
|
||||
|
|
@ -4207,7 +4228,7 @@ va_gather_max_path_stats(bi_block *block, struct va_stats *counts, BITSET_WORD *
|
|||
if (BITSET_TEST(visited, next->index)) {
|
||||
continue;
|
||||
}
|
||||
nr_ins = va_gather_max_path_stats(next, counts, visited);
|
||||
nr_ins = va_gather_max_path_stats(next, arch, counts, visited);
|
||||
if (nr_ins > max_ins) {
|
||||
max_ins = nr_ins;
|
||||
max_counts = *counts;
|
||||
|
|
@ -4217,7 +4238,7 @@ va_gather_max_path_stats(bi_block *block, struct va_stats *counts, BITSET_WORD *
|
|||
if (max_ins != 0) {
|
||||
*counts = max_counts;
|
||||
}
|
||||
nr_ins = max_ins + va_gather_stats_block(block, counts);
|
||||
nr_ins = max_ins + va_gather_stats_block(block, arch, counts);
|
||||
return nr_ins;
|
||||
}
|
||||
|
||||
|
|
@ -4241,15 +4262,16 @@ va_gather_stats(bi_context *ctx, unsigned size, struct valhall_stats *out,
|
|||
case GATHER_STATS_FULL:
|
||||
bi_foreach_instr_global(ctx, I) {
|
||||
nr_ins++;
|
||||
va_count_instr_stats(I, &counts);
|
||||
va_count_instr_stats(I, ctx->arch, &counts);
|
||||
}
|
||||
break;
|
||||
case GATHER_STATS_MIN:
|
||||
nr_ins = va_gather_min_path_stats(first_block, &counts);
|
||||
nr_ins = va_gather_min_path_stats(first_block, ctx->arch, &counts);
|
||||
break;
|
||||
case GATHER_STATS_MAX:
|
||||
visited = BITSET_RZALLOC(NULL, ctx->num_blocks);
|
||||
nr_ins = va_gather_max_path_stats(first_block, &counts, visited);
|
||||
nr_ins =
|
||||
va_gather_max_path_stats(first_block, ctx->arch, &counts, visited);
|
||||
ralloc_free(visited);
|
||||
break;
|
||||
}
|
||||
|
|
@ -4509,7 +4531,7 @@ bi_compile_variant_nir(nir_shader *nir,
|
|||
va_lower_constants(ctx, I, const_hist, min_count_for_fau);
|
||||
|
||||
bi_builder b = bi_init_builder(ctx, bi_before_instr(I));
|
||||
va_repair_fau(&b, I);
|
||||
va_repair_fau(&b, I, ctx->arch);
|
||||
}
|
||||
|
||||
_mesa_hash_table_u64_destroy(const_hist);
|
||||
|
|
@ -4611,7 +4633,7 @@ bi_compile_variant_nir(nir_shader *nir,
|
|||
bifrost_debug & BIFROST_DBG_VERBOSE);
|
||||
} else {
|
||||
disassemble_valhall(stderr, binary->data + offset,
|
||||
binary->size - offset,
|
||||
binary->size - offset, ctx->arch,
|
||||
bifrost_debug & BIFROST_DBG_VERBOSE);
|
||||
}
|
||||
|
||||
|
|
@ -4679,7 +4701,7 @@ bi_compile_variant(nir_shader *nir,
|
|||
uint64_t preload = first_block->reg_live_in;
|
||||
|
||||
/* If multisampling is used with a blend shader, the blend shader needs
|
||||
* to access the sample coverage mask in r60 and the sample ID in r61.
|
||||
* to access the sample coverage mask and the sample ID.
|
||||
* Blend shaders run in the same context as fragment shaders, so if a
|
||||
* blend shader could run, we need to preload these registers
|
||||
* conservatively. There is believed to be little cost to doing so, so
|
||||
|
|
@ -4690,7 +4712,10 @@ bi_compile_variant(nir_shader *nir,
|
|||
* driver. We could unify the paths if the cost is acceptable.
|
||||
*/
|
||||
if (nir->info.stage == MESA_SHADER_FRAGMENT && ctx->arch >= 9)
|
||||
preload |= BITFIELD64_BIT(60) | BITFIELD64_BIT(61);
|
||||
preload |=
|
||||
BITFIELD64_BIT(
|
||||
bi_preload_reg(BI_PRELOAD_CUMULATIVE_COVERAGE, ctx->arch)) |
|
||||
BITFIELD64_BIT(bi_preload_reg(BI_PRELOAD_SAMPLE_ID, ctx->arch));
|
||||
|
||||
info->ubo_mask |= ctx->ubo_mask;
|
||||
info->tls_size = MAX2(info->tls_size, ctx->info.tls_size);
|
||||
|
|
|
|||
|
|
@ -48,7 +48,8 @@ disassemble(const char *filename)
|
|||
}
|
||||
|
||||
if (pan_arch(gpu_id) >= 9)
|
||||
disassemble_valhall(stdout, entrypoint, filesize, verbose);
|
||||
disassemble_valhall(stdout, entrypoint, filesize, pan_arch(gpu_id),
|
||||
verbose);
|
||||
else
|
||||
disassemble_bifrost(stdout, entrypoint, filesize, verbose);
|
||||
|
||||
|
|
|
|||
|
|
@ -1162,25 +1162,25 @@ bi_preload_reg(enum bi_preload val, unsigned arch)
|
|||
/* Compute */
|
||||
case BI_PRELOAD_LOCAL_ID_0:
|
||||
/* Bits [15;0] */
|
||||
return 55;
|
||||
return (arch >= 15) ? 4 : 55;
|
||||
case BI_PRELOAD_LOCAL_ID_1:
|
||||
/* Bits [31;16] */
|
||||
return 55;
|
||||
return (arch >= 15) ? 4 : 55;
|
||||
case BI_PRELOAD_LOCAL_ID_2:
|
||||
/* Bits [15;0] */
|
||||
return 56;
|
||||
return (arch >= 15) ? 3 : 56;
|
||||
case BI_PRELOAD_WORKGROUP_ID_0:
|
||||
return 57;
|
||||
return (arch >= 15) ? 5 : 57;
|
||||
case BI_PRELOAD_WORKGROUP_ID_1:
|
||||
return 58;
|
||||
return (arch >= 15) ? 6 : 58;
|
||||
case BI_PRELOAD_WORKGROUP_ID_2:
|
||||
return 59;
|
||||
return (arch >= 15) ? 7 : 59;
|
||||
case BI_PRELOAD_GLOBAL_ID_0:
|
||||
return 60;
|
||||
return (arch >= 15) ? 0 : 60;
|
||||
case BI_PRELOAD_GLOBAL_ID_1:
|
||||
return 61;
|
||||
return (arch >= 15) ? 1 : 61;
|
||||
case BI_PRELOAD_GLOBAL_ID_2:
|
||||
return 62;
|
||||
return (arch >= 15) ? 2 : 62;
|
||||
/* Vertex */
|
||||
case BI_PRELOAD_POS_RESULT_PTR_LO:
|
||||
assert(arch < 9);
|
||||
|
|
@ -1190,58 +1190,58 @@ bi_preload_reg(enum bi_preload val, unsigned arch)
|
|||
return 59;
|
||||
case BI_PRELOAD_INTERNAL_ID:
|
||||
assert(arch >= 9);
|
||||
return 59;
|
||||
return (arch >= 15) ? 2 : 59;
|
||||
case BI_PRELOAD_VERTEX_ID:
|
||||
return (arch >= 9) ? 60 : 61;
|
||||
return (arch >= 15) ? 0 : (arch >= 9) ? 60 : 61;
|
||||
case BI_PRELOAD_INSTANCE_ID:
|
||||
return (arch >= 9) ? 61 : 62;
|
||||
return (arch >= 15) ? 1 : (arch >= 9) ? 61 : 62;
|
||||
case BI_PRELOAD_DRAW_ID:
|
||||
assert(arch >= 9);
|
||||
return 62;
|
||||
return (arch >= 15) ? 3 : 62;
|
||||
case BI_PRELOAD_VIEW_ID:
|
||||
assert(arch >= 9);
|
||||
return 63;
|
||||
return (arch >= 15) ? 4 : 63;
|
||||
/* Fragment */
|
||||
case BI_PRELOAD_PRIMITIVE_ID:
|
||||
return 57;
|
||||
return (arch >= 15) ? 6 : 57;
|
||||
case BI_PRELOAD_PRIMITIVE_FLAGS:
|
||||
return 58;
|
||||
return (arch >= 15) ? 3 : 58;
|
||||
case BI_PRELOAD_POSITION_XY:
|
||||
return 59;
|
||||
return (arch >= 15) ? 2 : 59;
|
||||
case BI_PRELOAD_CUMULATIVE_COVERAGE:
|
||||
/* Bits [15;0] */
|
||||
return 60;
|
||||
return (arch >= 15) ? 0 : 60;
|
||||
case BI_PRELOAD_RASTERIZER_COVERAGE:
|
||||
/* Bits [15;0] */
|
||||
return 61;
|
||||
return (arch >= 15) ? 1 : 61;
|
||||
case BI_PRELOAD_SAMPLE_ID:
|
||||
/* Bits [23;16] */
|
||||
return 61;
|
||||
return (arch >= 15) ? 0 : 61;
|
||||
case BI_PRELOAD_CENTROID_ID:
|
||||
/* Bits [31;24] */
|
||||
return 61;
|
||||
return (arch >= 15) ? 0 : 61;
|
||||
case BI_PRELOAD_FRAME_ARG:
|
||||
/* Double reg */
|
||||
return 62;
|
||||
return (arch >= 15) ? 4 : 62;
|
||||
/* Blend */
|
||||
case BI_PRELOAD_BLEND_SRC0_C0:
|
||||
return 0;
|
||||
return (arch >= 15) ? 8 : 0;
|
||||
case BI_PRELOAD_BLEND_SRC0_C1:
|
||||
return 1;
|
||||
return (arch >= 15) ? 9 : 1;
|
||||
case BI_PRELOAD_BLEND_SRC0_C2:
|
||||
return 2;
|
||||
return (arch >= 15) ? 10 : 2;
|
||||
case BI_PRELOAD_BLEND_SRC0_C3:
|
||||
return 3;
|
||||
return (arch >= 15) ? 11 : 3;
|
||||
case BI_PRELOAD_BLEND_SRC1_C0:
|
||||
return 4;
|
||||
return (arch >= 15) ? 12 : 4;
|
||||
case BI_PRELOAD_BLEND_SRC1_C1:
|
||||
return 5;
|
||||
return (arch >= 15) ? 13 : 5;
|
||||
case BI_PRELOAD_BLEND_SRC1_C2:
|
||||
return 6;
|
||||
return (arch >= 15) ? 14 : 6;
|
||||
case BI_PRELOAD_BLEND_SRC1_C3:
|
||||
return 7;
|
||||
return (arch >= 15) ? 15 : 7;
|
||||
case BI_PRELOAD_BLEND_LINK:
|
||||
return 48;
|
||||
return (arch >= 15) ? 7 : 48;
|
||||
}
|
||||
UNREACHABLE("Non-handled BI_PRELOAD");
|
||||
}
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -29,16 +29,20 @@ class FAUState:
|
|||
die_if(self.page is not None and self.page != page, 'Mismatched pages')
|
||||
self.page = page
|
||||
|
||||
def push(self, source):
|
||||
if not (source & (1 << 7)):
|
||||
# Skip registers
|
||||
def push(self, source, arch):
|
||||
# Skip registers
|
||||
if arch >= 15 and not (source & (1 << 8)):
|
||||
return
|
||||
elif arch < 15 and not (source & (1 << 7)):
|
||||
return
|
||||
|
||||
self.buffer.add(source)
|
||||
die_if(len(self.buffer) > 2, "Overflowed FAU buffer")
|
||||
|
||||
if (source >> 5) == 0b110:
|
||||
# Small constants need to check if the buffer overflows but nothing else
|
||||
# Small constants need to check if the buffer overflows but nothing else
|
||||
if arch >= 15 and (source >> 5) == 0b1110:
|
||||
return
|
||||
elif arch < 15 and (source >> 5) == 0b110:
|
||||
return
|
||||
|
||||
slot = (source >> 1)
|
||||
|
|
@ -120,6 +124,50 @@ def encode_source(op, fau):
|
|||
|
||||
die('Invalid operand')
|
||||
|
||||
def encode_source_v15(op, fau):
    """Encode one textual source operand into its v15 source field value.

    Accepted operand forms and the value returned for each:

    * ``[rN:rN+1]`` register pair  -> N, or N | 0x80 when written ``[rN^:rN+1^]``
    * ``rN`` / ``rN^`` register    -> N, or N | 0x80 with the ``^`` marker
    * ``uN`` uniform               -> ((N & 0x3F) << 1) | 0x100, page N >> 6 is
                                      recorded on ``fau``
    * ``i..`` operand              -> int(op[3:]) | 0x1C0
    * ``0x...`` literal            -> index of the value in ``immediates`` | 0x1C0
    * special FAU name             -> (32 + (index << 1)) | 0x1E0, page recorded
                                      on ``fau``

    Any other operand is rejected through ``die``.
    """
    # Register pair, e.g. "[r4:r5]" or "[r4^:r5^]"
    if op[0] == '[' and op[-1:] == ']':
        halves = op[1:-1].split(":")
        die_if(len(halves) != 2, 'Invalid tuple')
        die_if(halves[0][0] != 'r', 'Invalid tuple')
        die_if(halves[1][0] != 'r', 'Invalid tuple')

        # The marker on the first register decides how BOTH halves are
        # sliced: with '^' the last character of each half is dropped.
        marked = (halves[0][-1:] == '^')
        if marked:
            low_text, high_text = halves[0][1:-1], halves[1][1:-1]
        else:
            low_text, high_text = halves[0][1:], halves[1][1:]

        val0 = parse_int(low_text, 0, 127)
        val1 = parse_int(high_text, 0, 127)
        die_if(val1 != val0 + 1, 'Invalid tuple value')
        return (val0 | 0x80) if marked else val0

    # Single register, optionally carrying the '^' marker
    if op[0] == 'r':
        if op[-1:] == '^':
            return parse_int(op[1:-1], 0, 127) | 0x80
        return parse_int(op[1:], 0, 127)

    # Uniform: the upper bits of the index select the FAU page
    if op[0] == 'u':
        val = parse_int(op[1:], 0, 254)
        fau.set_page(val >> 6)
        return ((val & 0x3F) << 1) | 0x100

    # Operand starting with 'i': the number follows a 3-character prefix
    if op[0] == 'i':
        return int(op[3:]) | 0x1C0

    # Hex literal: must be one of the values in the immediates table
    if op.startswith('0x'):
        try:
            val = int(op, base=0)
        except ValueError:
            die('Expected value')

        die_if(val not in immediates, 'Unexpected immediate value')
        return immediates.index(val) | 0x1C0

    # Otherwise the operand has to be a special FAU name from page 0, 1 or 3
    for page in [0, 1, 3]:
        names = enums[f'fau_special_page_{page}'].bare_values
        if op in names:
            idx = 32 + (names.index(op) << 1)
            fau.set_page(page)
            return idx | 0x1E0

    die('Invalid operand')
|
||||
|
||||
def encode_dest(op):
|
||||
# Reg tuple
|
||||
|
|
@ -156,7 +204,47 @@ def encode_dest(op):
|
|||
|
||||
return value | (wrmask << 6)
|
||||
|
||||
def parse_asm(line):
|
||||
def encode_dest_v15(op, dst64):
    """Encode a textual destination operand for v15.

    ``op`` is either a single register (``rN``, optionally followed by a
    ``.h0`` / ``.h1`` write mask) or a register pair ``[rN:rN+1]``.
    ``dst64`` is true when the instruction's destination is 64-bit; such
    destinations use write mask 0 and may not carry a mask modifier.

    Returns the register number ORed with the write mask shifted to bit 13.
    """
    if op[0] == '[' and op[-1:] == ']':
        # Register pair: strip the brackets and split on ":"
        halves = op[1:-1].split(":")
        die_if(len(halves) != 2, 'Invalid tuple')
        die_if(halves[0][0] != 'r', 'Invalid tuple')
        die_if(halves[1][0] != 'r', 'Invalid tuple')

        # Modifiers (if any) are taken from the first register of the pair
        parts = halves[0].split(".")
        value = parse_int(parts[0][1:], 0, 127)

        # The second register must directly follow the first
        upper = parse_int(halves[1].split(".")[0][1:], 0, 127)
        die_if(upper != value + 1, 'Invalid tuple value')
    else:
        die_if(op[0] != 'r', f"Expected register destination {op}")
        parts = op.split(".")
        value = parse_int(parts[0][1:], 0, 127)

    has_modifier = len(parts) > 1

    # Default write mask: 0 for 64-bit destinations, both halves otherwise
    if dst64:
        die_if(has_modifier, "Must write full")
        wrmask = 0x0
    else:
        wrmask = 0x3

    if has_modifier:
        WMASKS = ["h0", "h1"]
        die_if(len(parts) > 2, "Too many modifiers")
        selected = parts[1]
        die_if(selected not in WMASKS, "Expected a write mask")
        wrmask = 1 << WMASKS.index(selected)

    return value | (wrmask << 13)
|
||||
|
||||
|
||||
def parse_asm(line, arch):
|
||||
global LINE
|
||||
LINE = line # For better errors
|
||||
encoded = 0
|
||||
|
|
@ -187,7 +275,7 @@ def parse_asm(line):
|
|||
|
||||
tail = line[(len(head) + 1):]
|
||||
operands = [x.strip() for x in tail.split(",") if len(x.strip()) > 0]
|
||||
expected_op_count = len(ins.srcs) + len(ins.dests) + len(ins.immediates) + len(ins.staging)
|
||||
expected_op_count = len(ins.srcs) + len(ins.dests) + len((ins.immediates_v15 if arch >= 15 else ins.immediates)) + len(ins.staging)
|
||||
if len(operands) != expected_op_count:
|
||||
die(f"Wrong number of operands in {line}, expected {expected_op_count}, got {len(operands)} {operands}")
|
||||
|
||||
|
|
@ -200,9 +288,9 @@ def parse_asm(line):
|
|||
parts = []
|
||||
|
||||
die_if(any([x[0] != 'r' for x in parts]), f'Expected registers, got {op}')
|
||||
regs = [parse_int(x[1:], 0, 63) for x in parts]
|
||||
regs = [parse_int(x[1:], 0, (127 if arch >= 15 else 63)) for x in parts]
|
||||
|
||||
extended_write = "staging_register_write_count" in [x.name for x in ins.modifiers] and sr.write
|
||||
extended_write = "staging_register_write_count" in [x.name for x in (ins.modifiers_v15 if arch >= 15 else ins.modifiers)] and sr.write
|
||||
max_sr_count = 8 if extended_write else 7
|
||||
|
||||
sr_count = len(regs)
|
||||
|
|
@ -215,22 +303,31 @@ def parse_asm(line):
|
|||
'Consecutive staging registers must be aligned to a register pair')
|
||||
|
||||
if sr.count == 0:
|
||||
if "staging_register_write_count" in [x.name for x in ins.modifiers] and sr.write:
|
||||
if "staging_register_write_count" in [x.name for x in (ins.modifiers_v15 if arch >= 15 else ins.modifiers)] and sr.write:
|
||||
modifier_map["staging_register_write_count"] = sr_count - 1
|
||||
else:
|
||||
assert "staging_register_count" in [x.name for x in ins.modifiers]
|
||||
assert "staging_register_count" in [x.name for x in (ins.modifiers_v15 if arch >= 15 else ins.modifiers)]
|
||||
modifier_map["staging_register_count"] = sr_count
|
||||
else:
|
||||
die_if(sr_count != sr.count, f"Expected {sr.count} staging registers, got {sr_count}")
|
||||
|
||||
encoded |= ((sr.encoded_flags | base) << sr.start)
|
||||
encoded |= base << sr.start
|
||||
if arch >= 15:
|
||||
encoded |= sr.encoded_flags_v15 << sr.offset['flags_v15']
|
||||
else:
|
||||
encoded |= sr.encoded_flags << sr.offset['flags']
|
||||
|
||||
# On v15, some instructions require special sr_control values
|
||||
if arch >= 15 and ins.name == "BARRIER":
|
||||
encoded |= 0b10 << 38
|
||||
|
||||
operands = operands[len(ins.staging):]
|
||||
|
||||
for op, dest in zip(operands, ins.dests):
|
||||
encoded |= encode_dest(op) << 40
|
||||
encoded |= (encode_dest_v15(op, dest.size >= 64) if arch >= 15 else encode_dest(op)) << 40
|
||||
operands = operands[len(ins.dests):]
|
||||
|
||||
if len(ins.dests) == 0 and len(ins.staging) == 0:
|
||||
if arch < 15 and len(ins.dests) == 0 and len(ins.staging) == 0:
|
||||
# Set a placeholder writemask to prevent encoding faults
|
||||
encoded |= (0xC0 << 40)
|
||||
|
||||
|
|
@ -238,12 +335,18 @@ def parse_asm(line):
|
|||
|
||||
for i, (op, src) in enumerate(zip(operands, ins.srcs)):
|
||||
parts = op.split('.')
|
||||
encoded_src = encode_source(parts[0], fau)
|
||||
|
||||
# Require a word selection for special FAU values
|
||||
may_have_word_select = ((encoded_src >> 5) == 0b111)
|
||||
# or for regular FAU values
|
||||
may_have_word_select |= ((encoded_src >> 6) == 0b10)
|
||||
if (arch >= 15):
|
||||
encoded_src = encode_source_v15(parts[0], fau)
|
||||
# Require a word selection for special FAU values
|
||||
may_have_word_select = ((encoded_src >> 5) == 0b1111)
|
||||
# or for regular FAU values
|
||||
may_have_word_select |= ((encoded_src >> 7) == 0b10)
|
||||
else:
|
||||
encoded_src = encode_source(parts[0], fau)
|
||||
# Require a word selection for special FAU values
|
||||
may_have_word_select = ((encoded_src >> 5) == 0b111)
|
||||
# or for regular FAU values
|
||||
may_have_word_select |= ((encoded_src >> 6) == 0b10)
|
||||
|
||||
# Has a swizzle been applied yet?
|
||||
swizzled = False
|
||||
|
|
@ -251,7 +354,11 @@ def parse_asm(line):
|
|||
for mod in parts[1:]:
|
||||
# Encode the modifier
|
||||
if mod in src.offset and src.mask[mod] == 0x1:
|
||||
encoded |= (1 << src.offset[mod])
|
||||
# On v15, FMA_RSCALE uses a different offset for src2.neg
|
||||
if arch >= 15 and ins.name[:10] == "FMA_RSCALE" and mod == "neg" and i == 2:
|
||||
encoded |= (1 << (src.offset[mod] + 1))
|
||||
else:
|
||||
encoded |= (1 << src.offset[mod])
|
||||
elif src.halfswizzle and mod in enums[f'half_swizzles_{src.size}_bit'].bare_values:
|
||||
die_if(swizzled, "Multiple swizzles specified")
|
||||
swizzled = True
|
||||
|
|
@ -318,12 +425,15 @@ def parse_asm(line):
|
|||
val = enums['swizzles_16_bit'].bare_values.index(mod)
|
||||
encoded |= (val << src.offset['widen'])
|
||||
|
||||
encoded |= encoded_src << src.start
|
||||
fau.push(encoded_src)
|
||||
if arch >= 15:
|
||||
encoded |= ((encoded_src & 0x100) << (src.offset['high1_v15'] - 8)) | ((encoded_src & 0xFF) << src.start)
|
||||
else:
|
||||
encoded |= encoded_src << src.start
|
||||
fau.push(encoded_src, arch)
|
||||
|
||||
operands = operands[len(ins.srcs):]
|
||||
|
||||
for i, (op, imm) in enumerate(zip(operands, ins.immediates)):
|
||||
for i, (op, imm) in enumerate(zip(operands, (ins.immediates_v15 if arch >= 15 else ins.immediates))):
|
||||
if op[0] == '#':
|
||||
die_if(imm.name != 'constant', "Wrong syntax for immediate")
|
||||
parts = [imm.name, op[1:]]
|
||||
|
|
@ -347,15 +457,15 @@ def parse_asm(line):
|
|||
|
||||
encoded |= (val << imm.start)
|
||||
|
||||
operands = operands[len(ins.immediates):]
|
||||
operands = operands[len((ins.immediates_v15 if arch >= 15 else ins.immediates)):]
|
||||
|
||||
# Encode the operation itself
|
||||
for subcode in ins.opcode:
|
||||
for subcode in (ins.opcode_v15 if arch >= 15 else ins.opcode):
|
||||
encoded |= (subcode.value << subcode.start)
|
||||
|
||||
# Encode FAU page
|
||||
if fau.page:
|
||||
encoded |= (fau.page << ins.offset['fau_page'])
|
||||
encoded |= (fau.page << (ins.offset['fau_page_v15'] if arch >= 15 else ins.offset['fau_page']))
|
||||
|
||||
# Encode modifiers
|
||||
has_flow = False
|
||||
|
|
@ -366,9 +476,10 @@ def parse_asm(line):
|
|||
if mod in enums['flow'].bare_values:
|
||||
die_if(has_flow, "Multiple flow control modifiers specified")
|
||||
has_flow = True
|
||||
encoded |= (enums['flow'].bare_values.index(mod) << ins.offset['flow'])
|
||||
encoded |= (enums['flow'].bare_values.index(mod) << (ins.offset['flow_v15'] if arch >= 15 else
|
||||
ins.offset['flow']))
|
||||
else:
|
||||
candidates = [c for c in ins.modifiers if mod in c.bare_values]
|
||||
candidates = [c for c in (ins.modifiers_v15 if arch >= 15 else ins.modifiers) if mod in c.bare_values]
|
||||
|
||||
die_if(len(candidates) == 0, f"Invalid modifier {mod} used")
|
||||
assert(len(candidates) == 1) # No ambiguous modifiers
|
||||
|
|
@ -380,13 +491,20 @@ def parse_asm(line):
|
|||
die_if(opts.name in modifier_map, f"{opts.name} specified twice")
|
||||
modifier_map[opts.name] = value
|
||||
|
||||
for mod in ins.modifiers:
|
||||
|
||||
for mod in (ins.modifiers_v15 if arch >= 15 else ins.modifiers):
|
||||
value = modifier_map.get(mod.name, mod.default)
|
||||
die_if(value is None, f"Missing required modifier {mod.name}")
|
||||
|
||||
assert(value < (1 << mod.size))
|
||||
encoded |= (value << mod.start)
|
||||
|
||||
# On v15, some instructions require an encoded null src.
|
||||
requires_nullsrc = ['BARRIER', 'NOP', 'LD_GCLK_U64', 'LD_VAR_FLAT_IMM', 'LD_VAR_BUF_FLAT_IMM'];
|
||||
if arch >= 15 and ins.name in requires_nullsrc:
|
||||
enc_src = 0x1C0
|
||||
encoded |= ((enc_src >> 8) & 0x1) << 48 | (enc_src & 0xFF)
|
||||
|
||||
return encoded
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
|
|
@ -28,6 +28,10 @@ template = """
|
|||
#define VA_SRC_UNIFORM_TYPE 0x2
|
||||
#define VA_SRC_IMM_TYPE 0x3
|
||||
|
||||
#define VA_SRC_V15_MODE1 BIT(8)
|
||||
#define VA_SRC_V15_MODE2 BIT(7)
|
||||
#define VA_SRC_V15_MODE4 BIT(5)
|
||||
|
||||
% for name, en in ENUMS.items():
|
||||
UNUSED static const char *valhall_${name}[] = {
|
||||
% for v in en.values:
|
||||
|
|
@ -91,22 +95,84 @@ va_print_float_src(FILE *fp, unsigned type, unsigned value, unsigned size, unsig
|
|||
fprintf(fp, ".abs");
|
||||
}
|
||||
|
||||
static inline void
|
||||
va_print_src_v15(FILE *fp, unsigned high1, unsigned low8, unsigned size, unsigned fau_page)
|
||||
{
|
||||
unsigned src = (high1 << 8) | low8;
|
||||
|
||||
/* Not reg */
|
||||
if (src & VA_SRC_V15_MODE1) {
|
||||
/* Not uniform */
|
||||
if (src & VA_SRC_V15_MODE2) {
|
||||
/* FAU special */
|
||||
if (src & VA_SRC_V15_MODE4) {
|
||||
unsigned value = src & MASK(5);
|
||||
if (fau_page == 0)
|
||||
fputs(valhall_fau_special_page_0[value >> 1] + 1, fp);
|
||||
else if (fau_page == 1)
|
||||
fputs(valhall_fau_special_page_1[value >> 1] + 1, fp);
|
||||
else if (fau_page == 3)
|
||||
fputs(valhall_fau_special_page_3[value >> 1] + 1, fp);
|
||||
else
|
||||
fprintf(fp, "reserved_page2");
|
||||
|
||||
fprintf(fp, ".w%u", value & 1);
|
||||
}
|
||||
/* Imm */
|
||||
else {
|
||||
unsigned value = src & MASK(5);
|
||||
assert(value < 32 && "overflow in LUT");
|
||||
fprintf(fp, "0x%X", va_immediates[value]);
|
||||
}
|
||||
}
|
||||
/* Uniform */
|
||||
else {
|
||||
unsigned value = src & MASK(7);
|
||||
fprintf(fp, "u%u", value >> 1 | (fau_page << 6));
|
||||
if (size <= 32)
|
||||
fprintf(fp, ".w%u", value & 1);
|
||||
}
|
||||
}
|
||||
/* Reg */
|
||||
else {
|
||||
unsigned value = src & MASK(7);
|
||||
bool discard = (src & BIT(7));
|
||||
char *dmark = discard ? "^" : "";
|
||||
if (size > 32)
|
||||
fprintf(fp, "[r%u%s:r%u%s]", value, dmark, value + 1, dmark);
|
||||
else
|
||||
fprintf(fp, "r%u%s", value, dmark);
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Print a v15 floating-point source: the plain source operand followed by
 * ".neg" and/or ".abs" when the corresponding flag is set.
 */
static inline void
va_print_float_src_v15(FILE *fp, unsigned high1, unsigned low8, unsigned size, unsigned fau_page, bool neg, bool abs)
{
   va_print_src_v15(fp, high1, low8, size, fau_page);

   /* Modifier order is fixed: negate first, then absolute value */
   if (neg) {
      fputs(".neg", fp);
   }

   if (abs) {
      fputs(".abs", fp);
   }
}
|
||||
|
||||
static inline void
|
||||
va_print_dest(FILE *fp, unsigned mask, unsigned value, unsigned size)
|
||||
{
|
||||
if (size > 32)
|
||||
fprintf(fp, "[r%u:r%u]", value, value + 1);
|
||||
else
|
||||
else {
|
||||
fprintf(fp, "r%u", value);
|
||||
|
||||
if (mask != 0x3)
|
||||
fprintf(fp, ".h%u", (mask == 1) ? 0 : 1);
|
||||
if (mask != 0x3)
|
||||
fprintf(fp, ".h%u", (mask == 1) ? 0 : 1);
|
||||
}
|
||||
}
|
||||
|
||||
<%def name="print_instr(op)">
|
||||
<%def name="print_instr(op, v15)">
|
||||
<% no_comma = True %>
|
||||
fputs("${op.name}", fp);
|
||||
% for mod in op.modifiers:
|
||||
% for mod in (op.modifiers_v15 if v15 else op.modifiers):
|
||||
% if mod.name not in ["staging_register_count", "staging_register_write_count"]:
|
||||
% if mod.is_enum:
|
||||
fputs(valhall_${safe_name(mod.enum)}[(instr >> ${mod.start}) & ${hex((1 << mod.size) - 1)}], fp);
|
||||
|
|
@ -115,10 +181,18 @@ va_print_dest(FILE *fp, unsigned mask, unsigned value, unsigned size)
|
|||
% endif
|
||||
% endif
|
||||
% endfor
|
||||
% if v15:
|
||||
fprintf(fp, "%s ", valhall_flow[(instr >> ${op.offset['flow_v15']}) & ${hex(op.mask['flow_v15'])}]);
|
||||
% else:
|
||||
fprintf(fp, "%s ", valhall_flow[(instr >> ${op.offset['flow']}) & ${hex(op.mask['flow'])}]);
|
||||
% endif
|
||||
% for i, dest in enumerate(op.dests):
|
||||
<% no_comma = False %>
|
||||
% if v15:
|
||||
va_print_dest(fp, (instr >> ${dest.offset['mode_v15']}) & ${hex(dest.mask['mode_v15'])}, (instr >> ${dest.offset['value_v15']}) & ${hex(dest.mask['value_v15'])}, ${dest.size});
|
||||
% else:
|
||||
va_print_dest(fp, (instr >> ${dest.offset['mode']}) & ${hex(dest.mask['mode'])}, (instr >> ${dest.offset['value']}) & ${hex(dest.mask['value'])}, ${dest.size});
|
||||
% endif
|
||||
% endfor
|
||||
% for index, sr in enumerate(op.staging):
|
||||
% if not no_comma:
|
||||
|
|
@ -130,13 +204,12 @@ va_print_dest(FILE *fp, unsigned mask, unsigned value, unsigned size)
|
|||
if sr.count != 0:
|
||||
sr_count = sr.count;
|
||||
else:
|
||||
for mod in op.modifiers:
|
||||
for mod in (op.modifiers_v15 if v15 else op.modifiers):
|
||||
if mod.name == "staging_register_write_count" and sr.write:
|
||||
sr_count = f"(((instr >> {mod.start}) & {hex((1 << mod.size) - 1)}) + 1)";
|
||||
elif mod.name == "staging_register_count":
|
||||
sr_count = f"((instr >> {mod.start}) & {hex((1 << mod.size) - 1)})";
|
||||
%>
|
||||
// assert(((instr >> ${sr.start}) & 0xC0) == ${sr.encoded_flags});
|
||||
fprintf(fp, "@");
|
||||
for (unsigned i = 0; i < ${sr_count}; ++i) {
|
||||
fprintf(fp, "%sr%u", (i == 0) ? "" : ":",
|
||||
|
|
@ -148,6 +221,28 @@ va_print_dest(FILE *fp, unsigned mask, unsigned value, unsigned size)
|
|||
fputs(", ", fp);
|
||||
% endif
|
||||
<% no_comma = False %>
|
||||
% if v15:
|
||||
% if src.absneg:
|
||||
va_print_float_src_v15(fp, (instr >> ${src.offset['high1_v15']}) & ${hex(src.mask['high1_v15'])}, (instr >> ${src.offset['low8_v15']}) & ${hex(src.mask['low8_v15'])},
|
||||
${src.size}, (instr >> ${op.offset['fau_page_v15']}) & ${hex(op.mask['fau_page_v15'])},
|
||||
% if op.name[:4] == "FMA." and i == 0:
|
||||
false,
|
||||
instr & BIT(${src.offset['abs']}));
|
||||
% elif op.name[:10] == "FMA_RSCALE" and i == 2:
|
||||
instr & BIT(${src.offset['neg'] + 1}),
|
||||
false);
|
||||
% else:
|
||||
instr & BIT(${src.offset['neg']}),
|
||||
instr & BIT(${src.offset['abs']}));
|
||||
% endif
|
||||
% elif src.is_float:
|
||||
va_print_float_src_v15(fp, (instr >> ${src.offset['high1_v15']}) & ${src.mask['high1_v15']}, (instr >> ${src.offset['low8_v15']}) & ${hex(src.mask['low8_v15'])},
|
||||
${src.size}, (instr >> ${op.offset['fau_page_v15']}) & ${hex(op.mask['fau_page_v15'])}, false, false);
|
||||
% else:
|
||||
va_print_src_v15(fp, (instr >> ${src.offset['high1_v15']}) & ${src.mask['high1_v15']}, (instr >> ${src.offset['low8_v15']}) & ${hex(src.mask['low8_v15'])},
|
||||
${src.size}, (instr >> ${op.offset['fau_page_v15']}) & ${hex(op.mask['fau_page_v15'])});
|
||||
% endif
|
||||
% else:
|
||||
% if src.absneg:
|
||||
va_print_float_src(fp, (instr >> ${src.offset['mode']}) & ${hex(src.mask['mode'])}, (instr >> ${src.offset['value']}) & ${hex(src.mask['value'])},
|
||||
${src.size}, (instr >> ${op.offset['fau_page']}) & ${hex(op.mask['fau_page'])},
|
||||
|
|
@ -160,6 +255,7 @@ va_print_dest(FILE *fp, unsigned mask, unsigned value, unsigned size)
|
|||
va_print_src(fp, (instr >> ${src.offset['mode']}) & ${src.mask['mode']}, (instr >> ${src.offset['value']}) & ${hex(src.mask['value'])},
|
||||
${src.size}, (instr >> ${op.offset['fau_page']}) & ${hex(op.mask['fau_page'])});
|
||||
% endif
|
||||
% endif
|
||||
% if src.swizzle:
|
||||
% if src.size == 32:
|
||||
fputs(valhall_widen[(instr >> ${src.offset['swizzle']}) & ${hex(src.mask['swizzle'])}], fp);
|
||||
|
|
@ -183,7 +279,7 @@ va_print_dest(FILE *fp, unsigned mask, unsigned value, unsigned size)
|
|||
if (instr & BIT(${src.offset['not']})) fputs(".not", fp);
|
||||
% endif
|
||||
% endfor
|
||||
% for imm in op.immediates:
|
||||
% for imm in (op.immediates_v15 if v15 else op.immediates):
|
||||
<%
|
||||
prefix = "#" if imm.name == "constant" else imm.name + ":"
|
||||
fmt = "%d" if imm.signed else "0x%X"
|
||||
|
|
@ -192,16 +288,16 @@ va_print_dest(FILE *fp, unsigned mask, unsigned value, unsigned size)
|
|||
% endfor
|
||||
</%def>
|
||||
|
||||
<%def name="recurse_subcodes(op_bucket)">
|
||||
<%def name="recurse_subcodes(op_bucket, v15)">
|
||||
%if op_bucket.instr:
|
||||
${print_instr(op_bucket.instr)}
|
||||
${print_instr(op_bucket.instr, v15)}
|
||||
%else:
|
||||
opcode = (instr >> ${op_bucket.start}) & ${hex(op_bucket.mask)};
|
||||
switch (opcode) {
|
||||
%for op in op_bucket.children:
|
||||
case ${hex(op)}:
|
||||
{
|
||||
${recurse_subcodes(op_bucket.children[op])}
|
||||
${recurse_subcodes(op_bucket.children[op], v15)}
|
||||
break;
|
||||
}
|
||||
%endfor
|
||||
|
|
@ -215,7 +311,15 @@ va_disasm_instr(FILE *fp, uint64_t instr)
|
|||
{
|
||||
unsigned opcode;
|
||||
|
||||
${recurse_subcodes(OPCODES)}
|
||||
${recurse_subcodes(OPCODES, False)}
|
||||
}
|
||||
|
||||
/*
 * Print the disassembly of one 64-bit instruction word using the v15
 * encoding. The body is expanded at template-render time from the
 * OPCODES_V15 opcode tree; `opcode` is the scratch variable that the
 * generated switch statements assign to.
 */
void
va_disasm_instr_v15(FILE *fp, uint64_t instr)
{
   unsigned opcode;

   ${recurse_subcodes(OPCODES_V15, True)}
}
|
||||
|
||||
static bool is_branch(uint64_t instr)
|
||||
|
|
@ -229,8 +333,19 @@ static bool is_branch(uint64_t instr)
|
|||
return false;
|
||||
}
|
||||
|
||||
/*
 * Return true when `instr` matches the v15 encoding of BRANCHZ or BRANCHZI.
 * The (exact, mask) pairs are computed at template-render time from
 * OPCODES_V15, so the generated C only contains constant compares.
 */
static bool is_branch_v15(uint64_t instr)
{
<% (exact, mask) = OPCODES_V15.get_exact_mask("BRANCHZ") %>
   if ((instr & ${hex(mask)}) == ${hex(exact)})
      return true;
<% (exact, mask) = OPCODES_V15.get_exact_mask("BRANCHZI") %>
   if ((instr & ${hex(mask)}) == ${hex(exact)})
      return true;
   return false;
}
|
||||
|
||||
void
|
||||
disassemble_valhall(FILE *fp, const void *code, size_t size, bool verbose)
|
||||
disassemble_valhall(FILE *fp, const void *code, size_t size, unsigned arch, bool verbose)
|
||||
{
|
||||
assert((size & 7) == 0);
|
||||
|
||||
|
|
@ -256,11 +371,18 @@ disassemble_valhall(FILE *fp, const void *code, size_t size, bool verbose)
|
|||
fprintf(fp, " ");
|
||||
}
|
||||
|
||||
va_disasm_instr(fp, instr);
|
||||
bool instr_is_branch;
|
||||
if (arch >= 15) {
|
||||
va_disasm_instr_v15(fp, instr);
|
||||
instr_is_branch = is_branch_v15(instr);
|
||||
} else {
|
||||
va_disasm_instr(fp, instr);
|
||||
instr_is_branch = is_branch(instr);
|
||||
}
|
||||
fprintf(fp, "\\n");
|
||||
|
||||
/* Separate blocks visually by inserting whitespace after branches */
|
||||
if (is_branch(instr))
|
||||
if (instr_is_branch)
|
||||
fprintf(fp, "\\n");
|
||||
}
|
||||
|
||||
|
|
@ -276,6 +398,9 @@ class OpBucket:
|
|||
self.children = {}
|
||||
|
||||
def insert(self, subcodes, ins):
|
||||
# Need an early return in case of removed instructions
|
||||
if subcodes is None:
|
||||
return
|
||||
if len(subcodes) == 0:
|
||||
self.instr = ins
|
||||
else:
|
||||
|
|
@ -305,10 +430,12 @@ class OpBucket:
|
|||
|
||||
# Build opcode hierarchy:
|
||||
OPCODES = OpBucket()
|
||||
OPCODES_V15 = OpBucket()
|
||||
for ins in instructions:
|
||||
OPCODES.insert(ins.opcode, ins)
|
||||
OPCODES_V15.insert(ins.opcode_v15, ins)
|
||||
|
||||
try:
|
||||
print(Template(template).render(OPCODES = OPCODES, IMMEDIATES = immediates, ENUMS = enums, typesize = typesize, safe_name = safe_name))
|
||||
print(Template(template).render(OPCODES = OPCODES, OPCODES_V15 = OPCODES_V15, IMMEDIATES = immediates, ENUMS = enums, typesize = typesize, safe_name = safe_name))
|
||||
except:
|
||||
print(exceptions.text_error_template().render())
|
||||
|
|
|
|||
|
|
@ -15,6 +15,8 @@
|
|||
#include <string.h>
|
||||
|
||||
void va_disasm_instr(FILE *fp, uint64_t instr);
|
||||
void disassemble_valhall(FILE *fp, const void *code, size_t size, bool verbose);
|
||||
void va_disasm_instr_v15(FILE *fp, uint64_t instr);
|
||||
void disassemble_valhall(FILE *fp, const void *code, size_t size, unsigned arch,
|
||||
bool verbose);
|
||||
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -44,9 +44,7 @@ libpanfrost_valhall_disasm = static_library(
|
|||
)
|
||||
|
||||
if with_tests
|
||||
test(
|
||||
'valhall_disasm',
|
||||
executable(
|
||||
valhall_disasm_test_e = executable(
|
||||
'valhall_disasm_test',
|
||||
files('test/test-disassembler.c'),
|
||||
c_args : [c_msvc_compat_args, no_override_init_args],
|
||||
|
|
@ -54,15 +52,33 @@ if with_tests
|
|||
include_directories : [inc_include, inc_src],
|
||||
dependencies: [idep_valhall_enums_h],
|
||||
link_with : [libpanfrost_valhall_disasm],
|
||||
),
|
||||
)
|
||||
|
||||
# Disassembler round-trip against the original (v10) encoding.
test(
  'valhall_disasm',
  valhall_disasm_test_e,
  suite : ['panfrost'],
  args : [files('test/assembler-cases.txt'), 'v10'],
)

# Same test binary, v15 encoding. Registered under its own name so the two
# runs can be told apart in test results and selected individually.
test(
  'valhall_disasm_v15',
  valhall_disasm_test_e,
  suite : ['panfrost'],
  args : [files('test/assembler-cases-v15.txt'), 'v15'],
)

# Assembler positive/negative cases against the original (v10) encoding.
test(
  'valhall_asm',
  prog_python,
  args : [files('test-assembly.py', 'test/assembler-cases.txt', 'test/negative-cases.txt'), 'v10'],
  suite : ['panfrost'],
)

# Assembler cases for the v15 encoding, again under a distinct name.
test(
  'valhall_asm_v15',
  prog_python,
  args : [files('test-assembly.py', 'test/assembler-cases-v15.txt', 'test/negative-cases.txt'), 'v15'],
  suite : ['panfrost'],
)
|
||||
endif
|
||||
|
|
|
|||
|
|
@ -17,19 +17,19 @@ def hex_8(u64):
|
|||
return ' '.join(as_strings)
|
||||
|
||||
# These should not throw exceptions
def positive_test(machine, assembly, arch):
    """Assemble `assembly` for `arch` and compare it with the expected bytes.

    `machine` is the expected encoding in the textual form parse_hex_8()
    accepts. Returns None on a match, otherwise a string describing the
    mismatch or the ParseError that was raised.
    """
    try:
        want = parse_hex_8(machine)
        got = parse_asm(assembly, arch)
        return None if got == want else f"{hex_8(got)} Incorrect assembly"
    except ParseError as err:
        return f"Unexpected exception: {err}"
|
||||
|
||||
# These should throw exceptions
def negative_test(assembly, arch):
    """Check that assembling `assembly` for `arch` is rejected.

    Returns None when parse_asm() raises any Exception, and an error string
    when the input was accepted.
    """
    try:
        parse_asm(assembly, arch)
    except Exception:
        return None
    return "Expected exception"
|
||||
|
|
@ -43,24 +43,34 @@ def record_case(case, error):
|
|||
else:
|
||||
FAIL.append((case, error))
|
||||
|
||||
# Usage: test-assembly.py <positive cases> <negative cases> v<arch>
if len(sys.argv) < 4:
    print("Expected positive and negative case lists, followed by arch")
    sys.exit(1)

# The architecture is spelled "v<number>", e.g. "v10" or "v15".
# startswith() also copes with an empty argument, which indexing [0] would not.
if not sys.argv[3].startswith('v'):
    print(f"Expected arch version {sys.argv[3]}")
    # Bail out here: without this, `arch` would be left undefined and the
    # first test case below would die with a NameError instead.
    sys.exit(1)

try:
    arch = int(sys.argv[3][1:], base = 0)
except ValueError:
    print(f"Expected arch number {sys.argv[3][1:]}")
    sys.exit(1)
|
||||
|
||||
|
||||
# Positive cases: every non-empty line that is not a '#' comment holds eight
# hex bytes followed by the assembly text expected to produce them.
with open(sys.argv[1], "r") as f:
    cases = f.read().split('\n')
    cases = [x for x in cases if len(x) > 0 and x[0] != '#']

for case in cases:
    # Peel off the eight byte tokens on any run of whitespace; whatever is
    # left is the assembly, with its internal spacing untouched. This does not
    # depend on the exact separator between the bytes and the assembly.
    fields = case.split(None, 8)
    if len(fields) != 9:
        # Record a malformed line as a failure rather than aborting the run.
        record_case(case, "Malformed test case")
        continue

    machine = ' '.join(fields[:8])
    assembly = fields[8]
    record_case(case, positive_test(machine, assembly, arch))

# Negative cases: every non-empty line must be rejected by the assembler.
with open(sys.argv[2], "r") as f:
    cases = f.read().split('\n')
    cases = [x for x in cases if len(x) > 0]

for case in cases:
    record_case(case, negative_test(case, arch))
|
||||
|
||||
print("Passed {}/{} tests.".format(len(PASS), len(PASS) + len(FAIL)))
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,195 @@
|
|||
02 00 20 00 00 01 60 00 MOV.i32 r1, r2
|
||||
0a 00 20 00 00 01 61 00 MOV.i32 r1, u5.w0
|
||||
e3 00 20 00 00 01 61 40 MOV.i32 r1, thread_local_pointer.w1
|
||||
e6 00 20 00 00 01 61 40 MOV.i32 r1, workgroup_local_pointer.w0
|
||||
e2 00 20 00 00 01 61 c0 MOV.i32 r1, lane_id.w0
|
||||
e6 00 20 00 00 01 61 c0 MOV.i32 r1, core_id.w0
|
||||
01 02 00 00 00 00 f0 00 FADD.f32 r0, r1, r2
|
||||
01 02 00 00 20 00 f0 00 FADD.f32 r0, r1, r2.abs
|
||||
01 02 00 00 10 00 f0 00 FADD.f32 r0, r1, r2.neg
|
||||
01 02 00 00 30 00 f0 00 FADD.f32 r0, r1, r2.neg.abs
|
||||
01 02 00 80 30 00 f0 00 FADD.f32.clamp_m1_1 r0, r1, r2.neg.abs
|
||||
81 03 00 00 00 00 b8 2a BRANCHZ.reconverge r1^, offset:3
|
||||
01 d0 00 00 00 00 f2 00 FADD.f32 r0, r1, 0x3F800000
|
||||
01 d0 00 00 10 00 f2 00 FADD.f32 r0, r1, 0x3F800000.neg
|
||||
01 c0 00 00 00 00 f2 00 FADD.f32 r0, r1, 0x0
|
||||
01 c0 00 00 10 00 f2 00 FADD.f32 r0, r1, 0x0.neg
|
||||
01 c9 00 00 00 00 e2 00 IADD.u32 r0, r1, 0x7060504
|
||||
01 00 00 08 00 00 f0 00 FADD.f32 r0, r1, r0.h1
|
||||
01 00 00 04 00 00 f0 00 FADD.f32 r0, r1, r0.h0
|
||||
01 00 00 0c 00 00 f4 00 FADD.v2f16 r0, r1.h00, r0.h11
|
||||
01 00 00 28 00 00 f4 00 FADD.v2f16 r0, r1, r0
|
||||
01 00 00 24 00 00 f4 00 FADD.v2f16 r0, r1, r0.h10
|
||||
01 02 00 08 00 00 e0 00 IADD.u32 r0, r1, r2.h0
|
||||
01 02 00 0c 00 00 e0 00 IADD.u32 r0, r1, r2.h1
|
||||
01 02 00 0c 70 00 e0 00 IADD.u32 r0, r1.b3, r2.h1
|
||||
01 c9 00 18 00 00 e2 00 IADD.u32 r0, r1, 0x7060504.b2
|
||||
01 02 00 08 20 00 e4 00 IADD.v2u16 r0, r1, r2
|
||||
02 3c 47 20 00 00 91 02 SHADDX.u64 [r0:r1], u1, [r60:r61].w0, shift:0x2
|
||||
80 00 00 00 19 00 20 07 LOAD.i32.slot0.wait0 @r0, [r0^:r1^], offset:0
|
||||
00 bc 87 20 00 00 91 02 SHADDX.u64 [r0:r1], u0, [r60^:r61^].w0, shift:0x4
|
||||
80 00 00 00 9c 04 20 3f STORE.i128.slot0.end @r4:r5:r6:r7, [r0^:r1^], offset:0
|
||||
c0 00 e0 01 00 00 a1 3e NOP.end
|
||||
80 c4 c0 1e 02 01 e6 01 ICMP_OR.u32.gt.m1 r1, r0^, 0x1000000.b3, 0x0
|
||||
82 00 00 00 99 00 20 2b STORE.i32.slot0.reconverge @r0, [r2^:r3^], offset:0
|
||||
00 c9 8f 12 30 00 e2 00 CLPER.i32.f1 r0, r0, 0x7060504.b00
|
||||
00 00 4b 00 00 02 60 00 F16_TO_F32 r2, r0.h0
|
||||
80 00 4b 10 00 03 60 00 F16_TO_F32 r3, r0^.h1
|
||||
c0 00 e0 01 00 00 a1 22 NOP.wait0126
|
||||
80 c0 00 28 90 00 f6 24 FADD.v2f16.wait r0, r0^.abs, 0x0.neg
|
||||
c0 00 00 00 00 36 6d 00 IADD_IMM.i32 r54, 0x0, #0x0
|
||||
3c d0 ea 00 01 3c d6 37 ATEST.discard @r60, r60, 0x3F800000, atest_datum.w0
|
||||
80 db 05 04 00 01 e6 00 MKVEC.v2i16 r1, r0^.h0, 0x3C000000.h1
|
||||
f0 00 3c 33 82 00 1b 3f BLEND.slot0.v4.f16.end @r0:r1, blend_descriptor_0.w0, r60, target:0x0
|
||||
bb 0d 00 40 02 04 08 07 LEA_BUF_IMM.slot1.wait0 @r4:r5, r59^, table:0xD, index:0x0
|
||||
00 dd c0 08 14 02 66 01 FMA.f32 r2, r0, 0x44000000.neg.h1, 0x0.neg
|
||||
81 08 c0 00 04 01 66 01 FMA.f32 r1, r1^, u4.w0, 0x0.neg
|
||||
80 08 c0 00 04 00 66 09 FMA.f32.wait1 r0, r0^, u4.w0, 0x0.neg
|
||||
84 00 00 02 93 00 20 3f STORE.i96.estream.slot0.end @r0:r1:r2, [r4^:r5^], offset:0
|
||||
84 00 00 01 9c 08 20 3f STORE.i128.istream.slot0.end @r8:r9:r10:r11, [r4^:r5^], offset:0
|
||||
c0 00 00 c0 80 00 3d 27 BARRIER.slot7.wait
|
||||
00 00 00 00 01 02 21 03 LOAD.i8.slot0 @r2, u0, offset:0
|
||||
00 00 00 00 09 02 21 03 LOAD.i16.slot0 @r2, u0, offset:0
|
||||
00 00 00 00 11 02 21 03 LOAD.i24.slot0 @r2, u0, offset:0
|
||||
00 00 00 00 19 02 21 03 LOAD.i32.slot0 @r2, u0, offset:0
|
||||
00 00 00 00 02 02 21 03 LOAD.i48.slot0 @r2:r3, u0, offset:0
|
||||
00 00 00 00 0a 02 21 03 LOAD.i64.slot0 @r2:r3, u0, offset:0
|
||||
00 00 00 00 13 02 21 03 LOAD.i96.slot0 @r2:r3:r4, u0, offset:0
|
||||
00 00 00 00 1c 04 21 03 LOAD.i128.slot0 @r4:r5:r6:r7, u0, offset:0
|
||||
00 00 00 08 01 02 21 03 LOAD.i8.b1.slot0 @r2, u0, offset:0
|
||||
00 00 00 10 01 02 21 03 LOAD.i8.b2.slot0 @r2, u0, offset:0
|
||||
00 00 00 18 01 02 21 03 LOAD.i8.b3.slot0 @r2, u0, offset:0
|
||||
00 00 00 00 09 02 21 03 LOAD.i16.slot0 @r2, u0, offset:0
|
||||
00 14 00 08 09 02 21 03 LOAD.i16.h1.slot0 @r2, u0, offset:20
|
||||
82 00 4d 00 42 02 60 00 FROUND.f32.rtn r2, r2^.neg
|
||||
82 00 4b 00 40 02 60 00 F16_TO_F32 r2, r2^.neg.h0
|
||||
82 00 4c 00 43 02 60 00 F32_TO_S32.rtz r2, r2^.neg
|
||||
82 c0 c6 47 48 02 64 00 FADD_IMM.f32 r2, r2^, #0x4847C6C0
|
||||
82 84 67 ac 70 02 62 00 FADD_IMM.v2f16 r2, r2^, #0x70AC6784
|
||||
82 14 00 13 00 02 6a 00 IADD_IMM.v2i16 r2, r2^, #0x130014
|
||||
82 ab 4b 00 00 02 6c 00 IADD_IMM.i32 r2, r2^, #0x4BAB
|
||||
83 82 c0 c6 12 02 e4 01 ICMP_OR.v2s16.gt.m1 r2, r3^.h10, r2^.h10, 0x0
|
||||
83 82 c0 52 03 02 e4 01 FCMP_OR.v2f16.gt.m1 r2, r3^.h10, r2^.h00, 0x0
|
||||
81 03 00 00 00 00 b8 2a BRANCHZ.reconverge r1^, offset:3
|
||||
00 03 00 00 20 00 b8 2a BRANCHZ.reconverge r0.h0, offset:3
|
||||
00 03 00 00 40 00 b8 2a BRANCHZ.reconverge r0.h1, offset:3
|
||||
00 03 00 00 00 00 b8 2a BRANCHZ.reconverge r0, offset:3
|
||||
c0 00 00 00 00 00 6d 00 IADD_IMM.i32 r0, 0x0, #0x0
|
||||
c0 01 00 00 00 04 6d 28 IADD_IMM.i32.reconverge r4, 0x0, #0x1
|
||||
00 00 47 20 00 02 91 02 SHADDX.u64 [r2:r3], u0, [r0:r1].w0, shift:0x2
|
||||
80 c9 00 10 00 00 e2 00 IADD.u32 r0, r0^, 0x7060504.b0
|
||||
00 02 c0 02 06 01 e6 01 ICMP_OR.u32.ne.m1 r1, r0, u1.w0, 0x0
|
||||
04 00 20 00 00 05 60 00 MOV.i32 r5, r4
|
||||
04 00 20 00 00 06 60 00 MOV.i32 r6, r4
|
||||
04 00 20 00 00 07 60 04 MOV.i32.wait0 r7, r4
|
||||
82 00 00 00 9c 04 20 03 STORE.i128.slot0 @r4:r5:r6:r7, [r2^:r3^], offset:0
|
||||
81 f8 ff ff 07 00 b8 2a BRANCHZ.reconverge r1^, offset:-8
|
||||
bd c0 00 08 10 3c c6 00 IADD.v2u16 r60.h1, r61^.h10, 0x0
|
||||
84 00 86 32 8c 00 12 3f ST_CVT.slot0.istream.v4.f32.end @r0:r1:r2:r3, [r4^:r5^], r6^, offset:0x0
|
||||
84 00 86 34 8c 00 12 3f ST_CVT.slot0.istream.v4.s32.end @r0:r1:r2:r3, [r4^:r5^], r6^, offset:0x0
|
||||
84 00 86 36 8c 00 12 3f ST_CVT.slot0.istream.v4.u32.end @r0:r1:r2:r3, [r4^:r5^], r6^, offset:0x0
|
||||
bc c0 12 00 2b 04 86 03 LEA_TEX_IMM.slot0 @r4:r5:r6, r60^, 0x0, table:0x2, index:0x1
|
||||
bc c0 02 00 2b 04 86 03 LEA_TEX_IMM.slot0 @r4:r5:r6, r60^, 0x0, table:0x2, index:0x0
|
||||
02 01 00 00 0a 02 8b 03 LD_PKA.i64.slot0 @r2:r3, u1.w0, u0.w1
|
||||
00 01 00 40 0a 00 8b 03 LD_PKA.i64.slot1 @r0:r1, u0.w0, u0.w1
|
||||
04 01 00 80 0a 26 8b 03 LD_PKA.i64.slot2 @r38:r39, u2.w0, u0.w1
|
||||
03 01 00 80 0a 24 8b 03 LD_PKA.i64.slot2 @r36:r37, u1.w1, u0.w1
|
||||
03 04 00 00 0a 02 8b 03 LD_PKA.i64.slot0 @r2:r3, u1.w1, u2.w0
|
||||
81 02 00 00 13 02 8a 03 LD_PKA.i96.slot0 @r2:r3:r4, r1^, u1.w0
|
||||
80 03 00 00 13 06 8a 07 LD_PKA.i96.slot0.wait0 @r6:r7:r8, r0^, u1.w1
|
||||
80 00 80 01 c0 00 60 20 FRCP.f32.wait0126 r0, r0^.neg.abs
|
||||
80 84 00 80 00 00 7c 01 MUX.i32.neg r0, r0^, r4^, u0.w0
|
||||
80 84 00 80 04 00 7c 01 MUX.i32 r0, r0^, r4^, u0.w0
|
||||
80 84 00 80 08 00 7c 01 MUX.i32.fp_zero r0, r0^, r4^, u0.w0
|
||||
80 84 00 80 0c 00 7c 01 MUX.i32.bit r0, r0^, r4^, u0.w0
|
||||
00 00 20 41 00 01 60 34 FREXPM.f32.sqrt.discard r1, r0
|
||||
01 00 82 01 00 02 60 00 FRSQ.f32 r2, r1
|
||||
80 00 22 41 00 00 60 00 FREXPE.f32.sqrt r0, r0^
|
||||
81 82 c0 80 0a 00 64 02 FMA_RSCALE.f32.clamp_m1_1 r0, r1^, r2^, 0x0.neg, r0^
|
||||
81 82 c0 80 0e 00 64 22 FMA_RSCALE.f32.left.wait0126 r0, r1^, r2^, 0x0.neg, r0^
|
||||
82 83 04 05 00 01 7c 02 CSEL.u32.eq r1, r2^, r3^, u2.w0, u2.w1
|
||||
82 83 04 05 08 01 7c 02 CSEL.u32.lt r1, r2^, r3^, u2.w0, u2.w1
|
||||
82 83 04 05 48 01 7c 02 CSEL.s32.lt r1, r2^, r3^, u2.w0, u2.w1
|
||||
3d 00 00 12 5a 02 18 07 LD_VAR_SPECIAL.v2.f32.sample.clobber.slot0.wait0 @r2:r3, r61, index:0x0
|
||||
3d 00 00 3f 0a 02 10 07 LD_VAR_BUF_IMM.f16.slot0.v4.src_f16.center.retrieve.wait0 @r2:r3, r61, index:0x0
|
||||
3d 00 00 3f 42 00 10 07 LD_VAR_BUF_IMM.f16.slot0.v4.src_f16.sample.store.wait0 @r0:r1, r61, index:0x0
|
||||
3d 08 00 3f 22 00 10 07 LD_VAR_BUF_IMM.f16.slot0.v4.src_f16.centroid.store.wait0 @r0:r1, r61, index:0x8
|
||||
bc bd 11 33 02 00 84 03 LD_ATTR_IMM.v4.f16.slot0 @r0:r1, r60^, r61^, index:0x1, table:0x1
|
||||
80 3c 03 23 02 04 c0 03 LD_TILE.v3.f16.slot0 @r4:r5, r0^, r60, r3
|
||||
00 c9 00 20 10 01 c6 00 IADD.v2u16 r1.h1, r0.h10, 0x7060504.b11
|
||||
80 c0 00 08 10 01 a6 00 IADD.v2u16 r1.h0, r0^.h10, 0x0
|
||||
02 02 00 04 20 02 a4 00 IADD.v2u16 r2.h0, r2, r2.h10
|
||||
82 c0 05 00 00 02 e6 00 MKVEC.v2i16 r2, r2^.h0, 0x0.h0
|
||||
b7 c0 05 00 00 02 e6 00 MKVEC.v2i16 r2, r55^.h0, 0x0.h0
|
||||
b7 c0 05 10 00 02 e6 00 MKVEC.v2i16 r2, r55^.h1, 0x0.h0
|
||||
c0 b7 05 00 00 02 e5 00 MKVEC.v2i16 r2, 0x0.h0, r55^.h0
|
||||
c0 b7 05 04 00 02 e5 00 MKVEC.v2i16 r2, 0x0.h0, r55^.h1
|
||||
b7 00 54 00 00 02 60 00 U16_TO_U32 r2, r55^.h0
|
||||
b7 00 54 10 00 02 60 00 U16_TO_U32 r2, r55^.h1
|
||||
b7 00 44 00 00 02 60 00 S16_TO_S32 r2, r55^.h0
|
||||
b7 00 44 10 00 02 60 00 S16_TO_S32 r2, r55^.h1
|
||||
c0 b7 01 08 00 02 e9 00 ISUB.s32 r2, 0x0, r55^.h0
|
||||
c0 b7 01 0c 00 02 e9 00 ISUB.s32 r2, 0x0, r55^.h1
|
||||
00 c0 c0 c0 c0 07 7e 01 MKVEC.v2i8 r7, r0.b3, 0x0.b0, 0x0
|
||||
00 c0 c0 c0 80 06 7e 01 MKVEC.v2i8 r6, r0.b2, 0x0.b0, 0x0
|
||||
00 c0 c0 c0 00 04 7e 01 MKVEC.v2i8 r4, r0.b0, 0x0.b0, 0x0
|
||||
80 c0 c0 c0 40 05 7e 01 MKVEC.v2i8 r5, r0^.b1, 0x0.b0, 0x0
|
||||
|
||||
3d 00 00 ba 44 00 10 37 LD_VAR_BUF_IMM.f32.slot2.v4.src_f32.sample.store.discard @r0:r1:r2:r3, r61, index:0x0
|
||||
3d 10 00 7a 0c 04 10 03 LD_VAR_BUF_IMM.f32.slot1.v4.src_f32.center.retrieve @r4:r5:r6:r7, r61, index:0x10
|
||||
c0 00 00 00 00 08 6d 00 IADD_IMM.i32 r8, 0x0, #0x0
|
||||
c0 00 00 00 00 09 6d 00 IADD_IMM.i32 r9, 0x0, #0x0
|
||||
3d 00 54 00 00 0a 60 00 U16_TO_U32 r10, r61.h0
|
||||
3d 09 00 00 30 00 b8 2a BRANCHZ.eq.reconverge r61.h0, offset:9
|
||||
0a 00 20 00 00 0b 60 28 MOV.i32.reconverge r11, r10
|
||||
c0 00 e0 01 00 00 a1 26 NOP.wait
|
||||
01 0b 00 33 02 0e c5 03 LD_TILE.v4.f16.slot0 @r14:r15, u0.w1, r11, u0.w0
|
||||
0b 00 24 00 00 0c 60 00 CLZ.u32 r12, r11
|
||||
02 8c c0 10 06 0c 6d 01 RSHIFT_XOR.i32.not_result r12, u1.w0, r12^.b00, 0x0
|
||||
8b c0 8c 50 00 0b 6a 05 LSHIFT_AND.i32.wait0 r11, r11^, 0x0.b00, r12^
|
||||
8f 89 00 28 00 09 f4 00 FADD.v2f16 r9, r15^, r9^
|
||||
8e 88 00 28 00 08 f4 00 FADD.v2f16 r8, r14^, r8^
|
||||
0b f8 ff ff 07 00 b8 2a BRANCHZ.reconverge r11, offset:-8
|
||||
8a 00 2c 00 00 3e 60 00 POPCOUNT.i32 r62, r10^
|
||||
be 00 59 00 00 3e 60 00 U32_TO_F32 r62, r62^
|
||||
be 00 81 01 00 3e 60 00 FRCP.f16 r62, r62^.h00
|
||||
89 3e c0 22 44 09 64 19 FMA.v2f16.wait12 r9, r9^, r62.h00, 0x0.neg
|
||||
87 83 00 00 00 03 f0 00 FADD.f32 r3, r7^, r3^
|
||||
83 09 00 08 00 03 f0 20 FADD.f32.wait0126 r3, r3^, r9.h1
|
||||
3c 03 ea 00 01 3c d4 37 ATEST.discard @r60, r60, r3, atest_datum.w0
|
||||
86 82 00 00 00 02 f0 00 FADD.f32 r2, r6^, r2^
|
||||
84 80 00 00 00 00 f0 00 FADD.f32 r0, r4^, r0^
|
||||
88 be c0 22 44 3f 64 01 FMA.v2f16 r63, r8^, r62^.h00, 0x0.neg
|
||||
85 81 00 00 00 01 f0 00 FADD.f32 r1, r5^, r1^
|
||||
81 3f 00 08 00 01 f0 00 FADD.f32 r1, r1^, r63.h1
|
||||
80 bf 00 04 00 00 f0 00 FADD.f32 r0, r0^, r63^.h0
|
||||
82 89 00 04 00 02 f0 24 FADD.f32.wait r2, r2^, r9^.h0
|
||||
f0 00 3c 32 84 00 1b 3f BLEND.slot0.v4.f32.end @r0:r1:r2:r3, blend_descriptor_0.w0, r60, target:0x0
|
||||
c0 00 00 00 00 36 6d 00 IADD_IMM.i32 r54, 0x0, #0x0
|
||||
c0 f1 0f 80 10 00 b3 06 BRANCHZI.eq.absolute.wait0 0x0, blend_descriptor_0.w1
|
||||
00 00 00 1f 5a 3c 69 03 TEX_FETCH.slot0.32.2d @r0:r1:r2:r3, @r60:r61, u0
|
||||
40 00 20 00 00 01 61 00 MOV.i32 r1, u32.w0
|
||||
41 00 20 00 00 01 61 00 MOV.i32 r1, u32.w1
|
||||
4a 00 20 00 00 01 61 00 MOV.i32 r1, u37.w0
|
||||
30 00 37 0f c1 0c 24 07 ATOM_RETURN.i32.slot0.axchg.wait0 @r55, @r12, [r48:r49], offset:0x0
|
||||
32 00 00 02 81 0c 2c 07 ATOM.i32.slot0.aadd.wait0 @r12, [r50:r51], offset:0x0
|
||||
32 00 00 00 01 0c 28 07 ATOM1_RETURN.i32.slot0.ainc.wait0 @r12, [r50:r51], offset:0x0
|
||||
32 00 00 00 01 00 28 07 ATOM1_RETURN.i32.slot0.ainc.wait0 @r0, [r50:r51], offset:0x0
|
||||
02 00 00 11 da 00 d5 27 VAR_TEX_SINGLE.slot0.skip.sample_store.32.2d.zero.wait @r0:r1:r2:r3, u1, u0.w0
|
||||
02 20 00 11 da 00 d5 07 VAR_TEX_SINGLE.slot0.skip.sample_store.32.2d.computed.wait0 @r0:r1:r2:r3, u1, u0.w0
|
||||
02 20 00 11 c2 00 d5 23 VAR_TEX_SINGLE.slot0.skip.sample_store.32.2d.computed.wait0126 @r0, u1, u0.w0
|
||||
80 c0 c0 02 06 00 e6 09 ICMP_OR.u32.ne.m1.wait1 r0, r0^, 0x0, 0x0
|
||||
82 83 80 80 02 00 e8 01 ICMP_AND.s32.gt.i1 r0, r2^, r3^, r0^
|
||||
82 c0 c0 03 06 00 f6 09 ICMP_MULTI.u32.ne.u1.wait1 r0, r2^, 0x0, 0x0
|
||||
84 86 c0 03 02 02 f4 01 ICMP_MULTI.u32.gt.u1 r2, r4^, r6^, 0x0
|
||||
85 87 82 02 02 02 f0 01 ICMP_MULTI.u32.gt.m1 r2, r5^, r7^, r2^
|
||||
83 c0 80 02 06 00 f2 01 ICMP_MULTI.u32.ne.m1 r0, r3^, 0x0, r0^
|
||||
80 82 c0 03 02 00 f4 01 ICMP_MULTI.u32.gt.u1 r0, r0^, r2^, 0x0
|
||||
81 83 80 82 02 04 f0 01 ICMP_MULTI.s32.gt.m1 r4, r1^, r3^, r0^
|
||||
80 c0 c0 6a 07 00 e6 09 FCMP_OR.v2f16.ne.m1.wait1 r0, r0^, 0x0, 0x0
|
||||
81 81 80 6e 03 00 e8 01 FCMP_AND.v2f16.gt.m1 r0, r1^, r1^.h11, r0^
|
||||
80 c0 c0 6a 07 00 e6 09 FCMP_OR.v2f16.ne.m1.wait1 r0, r0^, 0x0, 0x0
|
||||
81 81 80 6e 03 00 e8 01 FCMP_AND.v2f16.gt.m1 r0, r1^, r1^.h11, r0^
|
||||
c4 c0 80 52 70 00 6b 01 LSHIFT_AND.v4i8 r0, 0x1000000.b3333, 0x0.b00, r0^
|
||||
80 81 82 80 24 00 78 01 MUX.v4i8 r0, r0^, r1^, r2^
|
||||
c0 c0 00 00 02 02 8f 03 LEA_PKA.slot0 @r2:r3, 0x0, 0x0
|
||||
|
|
@ -126,6 +126,7 @@ c0 01 00 00 00 c4 10 51 IADD_IMM.i32.reconverge r4, 0x0, #0x1
|
|||
00 00 00 01 00 c1 99 68 FREXPM.f32.sqrt.discard r1, r0
|
||||
01 00 02 00 00 c2 9c 00 FRSQ.f32 r2, r1
|
||||
40 00 02 01 00 c0 99 00 FREXPE.f32.sqrt r0, r0^
|
||||
41 42 c0 40 06 c0 60 01 FMA_RSCALE.f32.clamp_m1_1 r0, r1^, r2^, 0x0.neg, r0^
|
||||
41 42 c0 40 04 c0 62 41 FMA_RSCALE_LEFT.f32.wait0126 r0, r1^, r2^, 0x0.neg, r0^
|
||||
42 43 84 85 00 c1 50 01 CSEL.u32.eq r1, r2^, r3^, u2.w0, u2.w1
|
||||
42 43 84 85 04 c1 50 01 CSEL.u32.lt r1, r2^, r3^, u2.w0, u2.w1
|
||||
|
|
@ -213,17 +214,17 @@ c0 00 00 00 00 c9 10 01 IADD_IMM.i32 r9, 0x0, #0x0
|
|||
f0 00 3c 32 08 40 7f 78 BLEND.slot0.v4.f32.end @r0:r1:r2:r3, blend_descriptor_0.w0, r60, target:0x0
|
||||
c0 00 00 00 00 f6 10 01 IADD_IMM.i32 r54, 0x0, #0x0
|
||||
c0 f1 00 00 10 c1 2f 08 BRANCHZI.eq.absolute.wait0 0x0, blend_descriptor_0.w1
|
||||
80 00 c0 17 34 7c 25 01 TEX_FETCH.slot0.f.32.2d @r0:r1:r2:r3, @r60:r61, u0
|
||||
80 00 c0 13 34 7c 25 01 TEX_FETCH.slot0.32.2d @r0:r1:r2:r3, @r60:r61, u0
|
||||
80 00 00 00 00 c1 91 02 MOV.i32 r1, u32.w0
|
||||
81 00 00 00 00 c1 91 02 MOV.i32 r1, u32.w1
|
||||
8a 00 00 00 00 c1 91 02 MOV.i32 r1, u37.w0
|
||||
30 00 f7 1b 02 cc 20 09 ATOM_RETURN.i32.slot0.axchg.wait0 @r55, @r12, [r48:r49], offset:0x0
|
||||
32 00 80 18 02 4c 68 08 ATOM.i32.slot0.aadd.wait0 @r12, [r50:r51], offset:0x0
|
||||
32 00 00 18 02 8c 69 08 ATOM1_RETURN.i32.slot0.ainc.wait0 @r12, [r50:r51], offset:0x0
|
||||
32 00 00 18 00 80 69 08 ATOM1_RETURN.i32.slot0.ainc.wait0 @, [r50:r51], offset:0x0
|
||||
82 00 80 15 b4 80 38 49 VAR_TEX_SINGLE.slot0.skip.sample_store.f.32.2d.zero.wait @r0:r1:r2:r3, u1, u0.w0
|
||||
82 20 80 15 b4 80 38 09 VAR_TEX_SINGLE.slot0.skip.sample_store.f.32.2d.computed.wait0 @r0:r1:r2:r3, u1, u0.w0
|
||||
82 20 80 1d 84 80 38 41 VAR_TEX_SINGLE.slot0.skip.sample_store.s.32.2d.computed.wait0126 @r0, u1, u0.w0
|
||||
32 00 00 18 02 80 69 08 ATOM1_RETURN.i32.slot0.ainc.wait0 @r0, [r50:r51], offset:0x0
|
||||
82 00 80 11 b4 80 38 49 VAR_TEX_SINGLE.slot0.skip.sample_store.32.2d.zero.wait @r0:r1:r2:r3, u1, u0.w0
|
||||
82 20 80 11 b4 80 38 09 VAR_TEX_SINGLE.slot0.skip.sample_store.32.2d.computed.wait0 @r0:r1:r2:r3, u1, u0.w0
|
||||
82 20 80 11 84 80 38 41 VAR_TEX_SINGLE.slot0.skip.sample_store.32.2d.computed.wait0126 @r0, u1, u0.w0
|
||||
40 c0 c0 80 03 c0 f0 10 ICMP_OR.u32.ne.m1.wait1 r0, r0^, 0x0, 0x0
|
||||
42 43 40 01 01 c0 f8 00 ICMP_AND.s32.gt.i1 r0, r2^, r3^, r0^
|
||||
42 c0 c0 c2 03 c0 f0 10 ICMP_MULTI.u32.ne.u1.wait1 r0, r2^, 0x0, 0x0
|
||||
|
|
|
|||
|
|
@ -33,8 +33,18 @@ parse_hex(const char *in)
|
|||
int
|
||||
main(int argc, const char **argv)
|
||||
{
|
||||
if (argc < 2) {
|
||||
fprintf(stderr, "Expected case list\n");
|
||||
if (argc < 3) {
|
||||
fprintf(stderr, "Expected case list and arch version\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (argv[2][0] != 'v') {
|
||||
fprintf(stderr, "Invalid arch version: %s\n", argv[2]);
|
||||
return 1;
|
||||
}
|
||||
unsigned arch = atoi(&argv[2][1]);
|
||||
if (arch < 9 || arch > 15) {
|
||||
fprintf(stderr, "Non-supported arch version: %d\n", arch);
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
|
@ -65,7 +75,10 @@ main(int argc, const char **argv)
|
|||
|
||||
uint64_t bin = parse_hex(line);
|
||||
FILE *outputp = open_memstream(&output, &sz);
|
||||
va_disasm_instr(outputp, bin);
|
||||
if (arch < 15)
|
||||
va_disasm_instr(outputp, bin);
|
||||
else
|
||||
va_disasm_instr_v15(outputp, bin);
|
||||
fprintf(outputp, "\n");
|
||||
fclose(outputp);
|
||||
|
||||
|
|
|
|||
|
|
@ -12,6 +12,7 @@
|
|||
static inline void
|
||||
add_imm(bi_context *ctx)
|
||||
{
|
||||
ctx->arch = 10;
|
||||
struct hash_table_u64 *stats = _mesa_hash_table_u64_create(ctx);
|
||||
bi_foreach_instr_global(ctx, I) {
|
||||
va_lower_constants(ctx, I, stats, UINT32_MAX);
|
||||
|
|
|
|||
|
|
@ -26,7 +26,9 @@ strip_discard(bi_context *ctx)
|
|||
do { \
|
||||
void *mem_ctx = ralloc_context(NULL); \
|
||||
bi_builder *A = bit_builder(mem_ctx); \
|
||||
A->shader->arch = 10; \
|
||||
bi_builder *B = bit_builder(mem_ctx); \
|
||||
B->shader->arch = 10; \
|
||||
{ \
|
||||
UNUSED bi_builder *b = A; \
|
||||
test; \
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
/*
|
||||
* Copyright (C) 2021 Collabora, Ltd.
|
||||
* Copyright (C) 2026 Arm Ltd.
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
|
|
@ -9,9 +10,9 @@
|
|||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#define CASE(instr, expected) \
|
||||
#define CASE_ARCH(instr, arch, expected) \
|
||||
do { \
|
||||
uint64_t _value = va_pack_instr(instr, 10); \
|
||||
uint64_t _value = va_pack_instr(instr, arch); \
|
||||
if (_value != expected) { \
|
||||
fprintf(stderr, "Got %" PRIx64 ", expected %" PRIx64 "\n", _value, \
|
||||
(uint64_t)expected); \
|
||||
|
|
@ -45,124 +46,153 @@ class ValhallPacking : public testing::Test {
|
|||
|
||||
TEST_F(ValhallPacking, Moves)
{
   const auto dest = bi_register(1);

   /* MOV.i32 r1, r2 */
   bi_instr *from_reg = bi_mov_i32_to(b, dest, bi_register(2));
   CASE_ARCH(from_reg, 10, 0x0091c10000000002ULL);
   CASE_ARCH(from_reg, 15, 0x0060010000200002ULL);

   /* MOV.i32 r1, u5.w0 */
   const auto uniform5 = bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 5), false);
   bi_instr *from_fau = bi_mov_i32_to(b, dest, uniform5);
   CASE_ARCH(from_fau, 10, 0x0091c1000000008aULL);
   CASE_ARCH(from_fau, 15, 0x006101000020000aULL);
}
|
||||
|
||||
TEST_F(ValhallPacking, Fadd)
{
   const auto r0 = bi_register(0);
   const auto r1 = bi_register(1);
   const auto r2 = bi_register(2);

   /* 32-bit adds with no, abs and neg modifiers on the second source */
   bi_instr *fadd = bi_fadd_f32_to(b, r0, r1, r2);
   CASE_ARCH(fadd, 10, 0x00a4c00000000201ULL);
   CASE_ARCH(fadd, 15, 0x00f0000000000201ULL);

   fadd = bi_fadd_f32_to(b, r0, r1, bi_abs(r2));
   CASE_ARCH(fadd, 10, 0x00a4c02000000201ULL);
   CASE_ARCH(fadd, 15, 0x00f0002000000201ULL);

   fadd = bi_fadd_f32_to(b, r0, r1, bi_neg(r2));
   CASE_ARCH(fadd, 10, 0x00a4c01000000201ULL);
   CASE_ARCH(fadd, 15, 0x00f0001000000201ULL);

   /* Packed 16-bit adds with the various half swizzles */
   fadd = bi_fadd_v2f16_to(b, r0, bi_swz_16(r1, false, false),
                           bi_swz_16(r0, true, true));
   CASE_ARCH(fadd, 10, 0x00a5c0000c000001ULL);
   CASE_ARCH(fadd, 15, 0x00f400000c000001ULL);

   fadd = bi_fadd_v2f16_to(b, r0, r1, r0);
   CASE_ARCH(fadd, 10, 0x00a5c00028000001ULL);
   CASE_ARCH(fadd, 15, 0x00f4000028000001ULL);

   fadd = bi_fadd_v2f16_to(b, r0, r1, bi_swz_16(r0, true, false));
   CASE_ARCH(fadd, 10, 0x00a5c00024000001ULL);
   CASE_ARCH(fadd, 15, 0x00f4000024000001ULL);

   /* Discarded source combined with a negated constant zero */
   fadd = bi_fadd_v2f16_to(b, r0, bi_discard(bi_abs(r0)), bi_neg(zero));
   CASE_ARCH(fadd, 10, 0x00a5c0902800c040ULL);
   CASE_ARCH(fadd, 15, 0x00f600902800c080ULL);

   /* Constant zero as the second source, plain and negated */
   fadd = bi_fadd_f32_to(b, r0, r1, zero);
   CASE_ARCH(fadd, 10, 0x00a4c0000000c001ULL);
   CASE_ARCH(fadd, 15, 0x00f200000000c001ULL);

   fadd = bi_fadd_f32_to(b, r0, r1, bi_neg(zero));
   CASE_ARCH(fadd, 10, 0x00a4c0100000c001ULL);
   CASE_ARCH(fadd, 15, 0x00f200100000c001ULL);

   /* 32-bit add widening one half of the second source */
   fadd = bi_fadd_f32_to(b, r0, r1, bi_half(r0, true));
   CASE_ARCH(fadd, 10, 0x00a4c00008000001ULL);
   CASE_ARCH(fadd, 15, 0x00f0000008000001ULL);

   fadd = bi_fadd_f32_to(b, r0, r1, bi_half(r0, false));
   CASE_ARCH(fadd, 10, 0x00a4c00004000001ULL);
   CASE_ARCH(fadd, 15, 0x00f0000004000001ULL);
}
|
||||
|
||||
TEST_F(ValhallPacking, Clper)
{
   /* CLPER.i32.f1 r0, r0, 0x7060504.b00 */
   const auto r0 = bi_register(0);
   const auto lane = bi_byte(n4567, 0);
   bi_instr *clper =
      bi_clper_i32_to(b, r0, r0, lane, BI_INACTIVE_RESULT_F1, BI_LANE_OP_NONE,
                      BI_SUBGROUP_SUBGROUP16);
   CASE_ARCH(clper, 10, 0x00a0c030128fc900);
   CASE_ARCH(clper, 15, 0x00e20030028fc900);
}
|
||||
|
||||
TEST_F(ValhallPacking, Clamps)
{
   /* Start from an unclamped FADD.f32 r0, r1, r2.neg.abs ... */
   const auto negated_abs_r2 = bi_neg(bi_abs(bi_register(2)));
   bi_instr *fadd =
      bi_fadd_f32_to(b, bi_register(0), bi_register(1), negated_abs_r2);
   CASE_ARCH(fadd, 10, 0x00a4c03000000201ULL);
   CASE_ARCH(fadd, 15, 0x00f0003000000201ULL);

   /* ... then repack the same instruction with the [-1, 1] clamp set. */
   fadd->clamp = BI_CLAMP_CLAMP_M1_1;
   CASE_ARCH(fadd, 10, 0x00a4c03200000201ULL);
   CASE_ARCH(fadd, 15, 0x00f0003080000201ULL);
}
|
||||
|
||||
TEST_F(ValhallPacking, Misc)
{
   /* FMA.f32 r1, r1^, u4.w0, 0x0.neg */
   const auto uniform4 = bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 4), false);
   bi_instr *fma = bi_fma_f32_to(b, bi_register(1),
                                 bi_discard(bi_register(1)), uniform4,
                                 bi_neg(zero));
   CASE_ARCH(fma, 10, 0x00b2c10400c08841ULL);
   CASE_ARCH(fma, 15, 0x0166010400c00881ULL);

   /* FROUND.f32.rtn r2, r2^.neg */
   bi_instr *fround =
      bi_fround_f32_to(b, bi_register(2),
                       bi_discard(bi_neg(bi_register(2))), BI_ROUND_RTN);
   CASE_ARCH(fround, 10, 0x0090c240800d0042ULL);
   CASE_ARCH(fround, 15, 0x00600242004d0082ULL);

   /* The 16-bit FROUND variants are only checked against v10. */
   const auto r0_low_half = bi_half(bi_register(0), false);

   fround = bi_fround_v2f16_to(b, r0_low_half, bi_register(0), BI_ROUND_RTN);
   CASE_ARCH(fround, 10, 0x00904000a00f0000ULL);
   /* Removed on v11 */

   fround = bi_fround_v2f16_to(b, r0_low_half,
                               bi_swz_16(bi_register(1), true, false),
                               BI_ROUND_RTN);
   CASE_ARCH(fround, 10, 0x00904000900f0001ULL);
   /* Removed on v11 */
}
|
||||
|
||||
TEST_F(ValhallPacking, FaddImm)
{
   const auto dest = bi_register(2);

   /* FADD_IMM.f32 r2, r2^, #0x4847C6C0 */
   bi_instr *add_f32 =
      bi_fadd_imm_f32_to(b, dest, bi_discard(bi_register(2)), 0x4847C6C0);
   CASE_ARCH(add_f32, 10, 0x0114C24847C6C042ULL);
   CASE_ARCH(add_f32, 15, 0x0064024847c6c082ULL);

   /* FADD_IMM.v2f16 r2, r2^, #0x70AC6784 */
   bi_instr *add_v2f16 =
      bi_fadd_imm_v2f16_to(b, dest, bi_discard(bi_register(2)), 0x70AC6784);
   CASE_ARCH(add_v2f16, 10, 0x0115C270AC678442ULL);
   CASE_ARCH(add_v2f16, 15, 0x00620270ac678482ULL);
}
|
||||
|
||||
TEST_F(ValhallPacking, Comparions)
{
   const auto dest = bi_register(2);
   const auto lhs = bi_discard(bi_swz_16(bi_register(3), true, false));

   /* ICMP_OR.v2s16.gt.m1 r2, r3^.h10, r2^.h10, 0x0 */
   bi_instr *icmp = bi_icmp_or_v2s16_to(
      b, dest, lhs, bi_discard(bi_swz_16(bi_register(2), true, false)), zero,
      BI_CMPF_GT, BI_RESULT_TYPE_M1);
   CASE_ARCH(icmp, 10, 0x00f9c21184c04243);
   CASE_ARCH(icmp, 15, 0x01e40212c6c08283);

   /* FCMP_OR.v2f16.gt.m1 r2, r3^.h10, r2^.h00, 0x0 */
   bi_instr *fcmp = bi_fcmp_or_v2f16_to(
      b, dest, lhs, bi_discard(bi_swz_16(bi_register(2), false, false)), zero,
      BI_CMPF_GT, BI_RESULT_TYPE_M1);
   CASE_ARCH(fcmp, 10, 0x00f5c20190c04243);
   CASE_ARCH(fcmp, 15, 0x01e4020352c08283);
}
|
||||
|
||||
TEST_F(ValhallPacking, Conversions)
{
   /* Only checked against v10. */
   const auto source = bi_discard(bi_register(2));
   bi_instr *convert = bi_v2s16_to_v2f16_to(b, bi_register(2), source);
   CASE_ARCH(convert, 10, 0x0090c22000070042);
   /* Removed on v11 */
}
|
||||
|
||||
TEST_F(ValhallPacking, BranchzI16)
|
||||
|
|
@ -170,88 +200,105 @@ TEST_F(ValhallPacking, BranchzI16)
|
|||
bi_instr *I =
|
||||
bi_branchz_i16(b, bi_half(bi_register(2), false), bi_null(), BI_CMPF_EQ);
|
||||
I->branch_offset = 1;
|
||||
CASE(I, 0x001fc03000000102);
|
||||
CASE_ARCH(I, 10, 0x001fc03000000102);
|
||||
CASE_ARCH(I, 15, 0x02b8003000000102);
|
||||
}
|
||||
|
||||
TEST_F(ValhallPacking, BranchzI16Backwards)
|
||||
{
|
||||
bi_instr *I = bi_branchz_i16(b, zero, bi_null(), BI_CMPF_EQ);
|
||||
I->branch_offset = -8;
|
||||
CASE(I, 0x001fc017fffff8c0);
|
||||
CASE_ARCH(I, 10, 0x001fc017fffff8c0);
|
||||
CASE_ARCH(I, 15, 0x02b90017fffff8c0);
|
||||
}
|
||||
|
||||
TEST_F(ValhallPacking, Blend)
|
||||
{
|
||||
CASE(
|
||||
bi_instr *I =
|
||||
bi_blend_to(b, bi_null(), bi_register(0), bi_register(60),
|
||||
bi_fau(BIR_FAU_BLEND_0, false), bi_fau(BIR_FAU_BLEND_0, true),
|
||||
bi_null(), BI_REGISTER_FORMAT_F16, 2, 0),
|
||||
0x007f4004333c00f0);
|
||||
bi_null(), BI_REGISTER_FORMAT_F16, 2, 0);
|
||||
CASE_ARCH(I, 10, 0x007f4004333c00f0);
|
||||
CASE_ARCH(I, 15, 0x031b0082333c00f0);
|
||||
}
|
||||
|
||||
TEST_F(ValhallPacking, Mux)
|
||||
{
|
||||
CASE(bi_mux_i32_to(b, bi_register(0), bi_discard(bi_register(0)),
|
||||
bi_discard(bi_register(4)),
|
||||
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 0), false),
|
||||
BI_MUX_BIT),
|
||||
0x00b8c00300804440ull);
|
||||
bi_instr *I = bi_mux_i32_to(
|
||||
b, bi_register(0), bi_discard(bi_register(0)), bi_discard(bi_register(4)),
|
||||
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 0), false), BI_MUX_BIT);
|
||||
CASE_ARCH(I, 10, 0x00b8c00300804440ull);
|
||||
CASE_ARCH(I, 15, 0x017c000c80008480ull);
|
||||
}
|
||||
|
||||
TEST_F(ValhallPacking, AtestFP16)
|
||||
{
|
||||
CASE(bi_atest_to(b, bi_register(60), bi_register(60),
|
||||
bi_half(bi_register(1), true),
|
||||
bi_fau(BIR_FAU_ATEST_PARAM, false)),
|
||||
0x007dbc0208ea013c);
|
||||
bi_instr *I = bi_atest_to(b, bi_register(60), bi_register(60),
|
||||
bi_half(bi_register(1), true),
|
||||
bi_fau(BIR_FAU_ATEST_PARAM, false));
|
||||
CASE_ARCH(I, 10, 0x007dbc0208ea013c);
|
||||
CASE_ARCH(I, 15, 0x03d43c0108ea013c);
|
||||
}
|
||||
|
||||
TEST_F(ValhallPacking, AtestFP32)
|
||||
{
|
||||
CASE(bi_atest_to(b, bi_register(60), bi_register(60), one,
|
||||
bi_fau(BIR_FAU_ATEST_PARAM, false)),
|
||||
0x007dbc0200ead03c);
|
||||
bi_instr *I = bi_atest_to(b, bi_register(60), bi_register(60), one,
|
||||
bi_fau(BIR_FAU_ATEST_PARAM, false));
|
||||
CASE_ARCH(I, 10, 0x007dbc0200ead03c);
|
||||
CASE_ARCH(I, 15, 0x03d63c0100ead03c);
|
||||
}
|
||||
|
||||
TEST_F(ValhallPacking, Transcendentals)
|
||||
{
|
||||
CASE(bi_frexpm_f32_to(b, bi_register(1), bi_register(0), false, true),
|
||||
0x0099c10001000000);
|
||||
bi_instr *I =
|
||||
bi_frexpm_f32_to(b, bi_register(1), bi_register(0), false, true);
|
||||
CASE_ARCH(I, 10, 0x0099c10001000000);
|
||||
CASE_ARCH(I, 15, 0x0060010041200000);
|
||||
|
||||
CASE(bi_frexpe_f32_to(b, bi_register(0), bi_discard(bi_register(0)), false,
|
||||
true),
|
||||
0x0099c00001020040);
|
||||
I = bi_frexpe_f32_to(b, bi_register(0), bi_discard(bi_register(0)), false,
|
||||
true);
|
||||
CASE_ARCH(I, 10, 0x0099c00001020040);
|
||||
CASE_ARCH(I, 15, 0x0060000041220080);
|
||||
|
||||
CASE(bi_frsq_f32_to(b, bi_register(2), bi_register(1)), 0x009cc20000020001);
|
||||
I = bi_frsq_f32_to(b, bi_register(2), bi_register(1));
|
||||
CASE_ARCH(I, 10, 0x009cc20000020001);
|
||||
CASE_ARCH(I, 15, 0x0060020001820001);
|
||||
|
||||
CASE(bi_fma_rscale_f32_to(b, bi_register(0), bi_discard(bi_register(1)),
|
||||
bi_discard(bi_register(2)), bi_neg(zero),
|
||||
bi_discard(bi_register(0)), BI_SPECIAL_LEFT),
|
||||
0x0162c00440c04241);
|
||||
I = bi_fma_rscale_f32_to(b, bi_register(0), bi_discard(bi_register(1)),
|
||||
bi_discard(bi_register(2)), bi_neg(zero),
|
||||
bi_discard(bi_register(0)), BI_SPECIAL_LEFT);
|
||||
CASE_ARCH(I, 10, 0x0162c00440c04241);
|
||||
CASE_ARCH(I, 15, 0x0264000e80c08281);
|
||||
|
||||
I = bi_fma_rscale_f32_to(b, bi_register(0), bi_register(1), bi_register(2),
|
||||
bi_neg(zero), bi_discard(bi_register(0)),
|
||||
BI_SPECIAL_N);
|
||||
CASE_ARCH(I, 10, 0x0161c00440c00201);
|
||||
CASE_ARCH(I, 15, 0x0264000d80c00201);
|
||||
}
|
||||
|
||||
TEST_F(ValhallPacking, Csel)
|
||||
{
|
||||
CASE(bi_csel_u32_to(b, bi_register(1), bi_discard(bi_register(2)),
|
||||
bi_discard(bi_register(3)),
|
||||
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), false),
|
||||
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), true),
|
||||
BI_CMPF_EQ),
|
||||
0x0150c10085844342);
|
||||
bi_instr *I = bi_csel_u32_to(
|
||||
b, bi_register(1), bi_discard(bi_register(2)), bi_discard(bi_register(3)),
|
||||
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), false),
|
||||
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), true), BI_CMPF_EQ);
|
||||
CASE_ARCH(I, 10, 0x0150c10085844342);
|
||||
CASE_ARCH(I, 15, 0x027c010005048382);
|
||||
|
||||
CASE(bi_csel_u32_to(b, bi_register(1), bi_discard(bi_register(2)),
|
||||
bi_discard(bi_register(3)),
|
||||
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), false),
|
||||
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), true),
|
||||
BI_CMPF_LT),
|
||||
0x0150c10485844342);
|
||||
I = bi_csel_u32_to(
|
||||
b, bi_register(1), bi_discard(bi_register(2)), bi_discard(bi_register(3)),
|
||||
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), false),
|
||||
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), true), BI_CMPF_LT);
|
||||
CASE_ARCH(I, 10, 0x0150c10485844342);
|
||||
CASE_ARCH(I, 15, 0x027c010805048382);
|
||||
|
||||
CASE(bi_csel_s32_to(b, bi_register(1), bi_discard(bi_register(2)),
|
||||
bi_discard(bi_register(3)),
|
||||
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), false),
|
||||
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), true),
|
||||
BI_CMPF_LT),
|
||||
0x0158c10485844342);
|
||||
I = bi_csel_s32_to(
|
||||
b, bi_register(1), bi_discard(bi_register(2)), bi_discard(bi_register(3)),
|
||||
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), false),
|
||||
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 2), true), BI_CMPF_LT);
|
||||
CASE_ARCH(I, 10, 0x0158c10485844342);
|
||||
CASE_ARCH(I, 15, 0x027c014805048382);
|
||||
}
|
||||
|
||||
TEST_F(ValhallPacking, LdAttrImm)
|
||||
|
|
@ -261,34 +308,67 @@ TEST_F(ValhallPacking, LdAttrImm)
|
|||
bi_discard(bi_register(61)), BI_REGISTER_FORMAT_F16, BI_VECSIZE_V4, 1);
|
||||
I->table = 1;
|
||||
|
||||
CASE(I, 0x0066800433117d7c);
|
||||
CASE_ARCH(I, 10, 0x0066800433117d7c);
|
||||
CASE_ARCH(I, 15, 0x038400023311bdbc);
|
||||
}
|
||||
|
||||
TEST_F(ValhallPacking, LdVarBufImmF16)
|
||||
{
|
||||
CASE(bi_ld_var_buf_imm_f16_to(b, bi_register(2), bi_register(61),
|
||||
BI_REGISTER_FORMAT_F16, BI_SAMPLE_CENTER,
|
||||
BI_SOURCE_FORMAT_F16, BI_UPDATE_RETRIEVE,
|
||||
BI_VECSIZE_V4, 0),
|
||||
0x005d82143300003d);
|
||||
bi_instr *I = bi_ld_var_buf_imm_f16_to(
|
||||
b, bi_register(2), bi_register(61), BI_REGISTER_FORMAT_F16,
|
||||
BI_SAMPLE_CENTER, BI_SOURCE_FORMAT_F16, BI_UPDATE_RETRIEVE, BI_VECSIZE_V4,
|
||||
0);
|
||||
CASE_ARCH(I, 10, 0x005d82143300003d);
|
||||
CASE_ARCH(I, 15, 0x0310020a3f00003d);
|
||||
|
||||
CASE(bi_ld_var_buf_imm_f16_to(b, bi_register(0), bi_register(61),
|
||||
BI_REGISTER_FORMAT_F16, BI_SAMPLE_SAMPLE,
|
||||
BI_SOURCE_FORMAT_F16, BI_UPDATE_STORE,
|
||||
BI_VECSIZE_V4, 0),
|
||||
0x005d80843300003d);
|
||||
I = bi_ld_var_buf_imm_f16_to(b, bi_register(0), bi_register(61),
|
||||
BI_REGISTER_FORMAT_F16, BI_SAMPLE_SAMPLE,
|
||||
BI_SOURCE_FORMAT_F16, BI_UPDATE_STORE,
|
||||
BI_VECSIZE_V4, 0);
|
||||
CASE_ARCH(I, 10, 0x005d80843300003d);
|
||||
CASE_ARCH(I, 15, 0x031000423f00003d);
|
||||
|
||||
CASE(bi_ld_var_buf_imm_f16_to(b, bi_register(0), bi_register(61),
|
||||
BI_REGISTER_FORMAT_F16, BI_SAMPLE_CENTROID,
|
||||
BI_SOURCE_FORMAT_F16, BI_UPDATE_STORE,
|
||||
BI_VECSIZE_V4, 8),
|
||||
0x005d80443308003d);
|
||||
I = bi_ld_var_buf_imm_f16_to(b, bi_register(0), bi_register(61),
|
||||
BI_REGISTER_FORMAT_F16, BI_SAMPLE_CENTROID,
|
||||
BI_SOURCE_FORMAT_F16, BI_UPDATE_STORE,
|
||||
BI_VECSIZE_V4, 8);
|
||||
CASE_ARCH(I, 10, 0x005d80443308003d);
|
||||
CASE_ARCH(I, 11, 0x005d80443300083d);
|
||||
CASE_ARCH(I, 15, 0x031000223f00083d);
|
||||
}
|
||||
|
||||
/* Packing of LD_VAR_BUF_FLAT_IMM with immediate index 0x12, checked for both
 * the F32 and the F16 register format on arch 14 and arch 15.
 */
TEST_F(ValhallPacking, LdVarBufFlatImmFormat)
{
   /* F32 register format */
   bi_instr *flat_f32 = bi_ld_var_buf_flat_imm_to(
      b, bi_register(0), BI_REGISTER_FORMAT_F32, BI_VECSIZE_V4, 0x12);
   CASE_ARCH(flat_f32, 14, 0x0040800832001200);
   CASE_ARCH(flat_f32, 15, 0x033900043a0012c0);

   /* F16 register format, same destination and index */
   bi_instr *flat_f16 = bi_ld_var_buf_flat_imm_to(
      b, bi_register(0), BI_REGISTER_FORMAT_F16, BI_VECSIZE_V4, 0x12);
   CASE_ARCH(flat_f16, 14, 0x0040800433001200);
   CASE_ARCH(flat_f16, 15, 0x033900023b0012c0);
}
|
||||
|
||||
/* Packing of LD_VAR_BUF_FLAT with the index taken from r61, checked for both
 * the F32 and the F16 register format on arch 14 and arch 15.
 */
TEST_F(ValhallPacking, LdVarBufFlat)
{
   /* F32 register format */
   bi_instr *flat_f32 = bi_ld_var_buf_flat_to(
      b, bi_register(0), bi_register(61), BI_REGISTER_FORMAT_F32,
      BI_VECSIZE_V4);
   CASE_ARCH(flat_f32, 14, 0x005f80083200003d);
   CASE_ARCH(flat_f32, 15, 0x031400043a00003d);

   /* F16 register format, same operands */
   bi_instr *flat_f16 = bi_ld_var_buf_flat_to(
      b, bi_register(0), bi_register(61), BI_REGISTER_FORMAT_F16,
      BI_VECSIZE_V4);
   CASE_ARCH(flat_f16, 14, 0x005f80043300003d);
   CASE_ARCH(flat_f16, 15, 0x031400023b00003d);
}
|
||||
|
||||
TEST_F(ValhallPacking, LeaBufImm)
|
||||
{
|
||||
CASE(bi_lea_buf_imm_to(b, bi_register(4), bi_discard(bi_register(59))),
|
||||
0x005e84040000007b);
|
||||
bi_instr *I =
|
||||
bi_lea_buf_imm_to(b, bi_register(4), bi_discard(bi_register(59)));
|
||||
CASE_ARCH(I, 10, 0x005e84040000007b);
|
||||
CASE_ARCH(I, 15, 0x03080402000000bb);
|
||||
}
|
||||
|
||||
TEST_F(ValhallPacking, StoreMemoryAccess)
|
||||
|
|
@ -296,61 +376,94 @@ TEST_F(ValhallPacking, StoreMemoryAccess)
|
|||
bi_instr *I = bi_store_i96(b, bi_register(0), bi_discard(bi_register(4)),
|
||||
bi_discard(bi_register(5)), BI_SEG_NONE, 0);
|
||||
I->mem_access = VA_MEMORY_ACCESS_ESTREAM;
|
||||
CASE(I, 0x0061400632000044);
|
||||
CASE_ARCH(I, 10, 0x0061400632000044);
|
||||
CASE_ARCH(I, 15, 0x0320009302000084);
|
||||
}
|
||||
|
||||
TEST_F(ValhallPacking, Convert16To32)
|
||||
{
|
||||
CASE(bi_u16_to_u32_to(b, bi_register(2),
|
||||
bi_discard(bi_half(bi_register(55), false))),
|
||||
0x0090c20000140077);
|
||||
bi_instr *I = bi_u16_to_u32_to(b, bi_register(2),
|
||||
bi_discard(bi_half(bi_register(55), false)));
|
||||
CASE_ARCH(I, 10, 0x0090c20000140077);
|
||||
CASE_ARCH(I, 15, 0x00600200005400b7);
|
||||
|
||||
CASE(bi_u16_to_u32_to(b, bi_register(2),
|
||||
bi_discard(bi_half(bi_register(55), true))),
|
||||
0x0090c20010140077);
|
||||
I = bi_u16_to_u32_to(b, bi_register(2),
|
||||
bi_discard(bi_half(bi_register(55), true)));
|
||||
CASE_ARCH(I, 10, 0x0090c20010140077);
|
||||
CASE_ARCH(I, 15, 0x00600200105400b7);
|
||||
|
||||
CASE(bi_u16_to_f32_to(b, bi_register(2),
|
||||
bi_discard(bi_half(bi_register(55), false))),
|
||||
0x0090c20000150077);
|
||||
I = bi_u16_to_f32_to(b, bi_register(2),
|
||||
bi_discard(bi_half(bi_register(55), false)));
|
||||
CASE_ARCH(I, 10, 0x0090c20000150077);
|
||||
/* Removed on v11 */
|
||||
|
||||
CASE(bi_u16_to_f32_to(b, bi_register(2),
|
||||
bi_discard(bi_half(bi_register(55), true))),
|
||||
0x0090c20010150077);
|
||||
I = bi_u16_to_f32_to(b, bi_register(2),
|
||||
bi_discard(bi_half(bi_register(55), true)));
|
||||
CASE_ARCH(I, 10, 0x0090c20010150077);
|
||||
/* Removed on v11 */
|
||||
|
||||
CASE(bi_s16_to_s32_to(b, bi_register(2),
|
||||
bi_discard(bi_half(bi_register(55), false))),
|
||||
0x0090c20000040077);
|
||||
I = bi_s16_to_s32_to(b, bi_register(2),
|
||||
bi_discard(bi_half(bi_register(55), false)));
|
||||
CASE_ARCH(I, 10, 0x0090c20000040077);
|
||||
CASE_ARCH(I, 15, 0x00600200004400b7);
|
||||
|
||||
CASE(bi_s16_to_s32_to(b, bi_register(2),
|
||||
bi_discard(bi_half(bi_register(55), true))),
|
||||
0x0090c20010040077);
|
||||
I = bi_s16_to_s32_to(b, bi_register(2),
|
||||
bi_discard(bi_half(bi_register(55), true)));
|
||||
CASE_ARCH(I, 10, 0x0090c20010040077);
|
||||
CASE_ARCH(I, 15, 0x00600200104400b7);
|
||||
}
|
||||
|
||||
TEST_F(ValhallPacking, Swizzle8)
|
||||
{
|
||||
CASE(bi_icmp_or_v4u8_to(b, bi_register(1), bi_byte(bi_register(0), 0), zero,
|
||||
zero, BI_CMPF_NE, BI_RESULT_TYPE_I1),
|
||||
0x00f2c14300c0c000);
|
||||
bi_instr *I =
|
||||
bi_icmp_or_v4u8_to(b, bi_register(1), bi_byte(bi_register(0), 0), zero,
|
||||
zero, BI_CMPF_NE, BI_RESULT_TYPE_I1);
|
||||
CASE_ARCH(I, 10, 0x00f2c14300c0c000);
|
||||
/* Removed on v11 */
|
||||
}
|
||||
|
||||
TEST_F(ValhallPacking, FauPage1)
|
||||
{
|
||||
CASE(bi_mov_i32_to(b, bi_register(1),
|
||||
bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 32), false)),
|
||||
0x0291c10000000080ULL);
|
||||
bi_instr *I = bi_mov_i32_to(
|
||||
b, bi_register(1), bi_fau((enum bir_fau)(BIR_FAU_UNIFORM | 32), false));
|
||||
CASE_ARCH(I, 10, 0x0291c10000000080ULL);
|
||||
CASE_ARCH(I, 15, 0x0061010000200040ULL);
|
||||
}
|
||||
|
||||
TEST_F(ValhallPacking, LdTileV3F16)
|
||||
{
|
||||
CASE(bi_ld_tile_to(b, bi_register(4), bi_discard(bi_register(0)),
|
||||
bi_register(60), bi_register(3), BI_REGISTER_FORMAT_F16,
|
||||
BI_VECSIZE_V3),
|
||||
0x0078840423033c40);
|
||||
bi_instr *I = bi_ld_tile_to(b, bi_register(4), bi_discard(bi_register(0)),
|
||||
bi_register(60), bi_register(3),
|
||||
BI_REGISTER_FORMAT_F16, BI_VECSIZE_V3);
|
||||
CASE_ARCH(I, 10, 0x0078840423033c40);
|
||||
CASE_ARCH(I, 15, 0x03c0040223033c80);
|
||||
}
|
||||
|
||||
TEST_F(ValhallPacking, Rhadd8)
|
||||
{
|
||||
CASE(bi_hadd_v4s8_to(b, bi_register(0), bi_discard(bi_register(1)),
|
||||
bi_discard(bi_register(0)), BI_ROUND_RTP),
|
||||
0x00aac000400b4041);
|
||||
bi_instr *I = bi_hadd_v4s8_to(b, bi_register(0), bi_discard(bi_register(1)),
|
||||
bi_discard(bi_register(0)), BI_ROUND_RTP);
|
||||
CASE_ARCH(I, 10, 0x00aac000400b4041);
|
||||
/* Removed on v11 */
|
||||
}
|
||||
|
||||
/* Packing of returning atomics on arch 10 and arch 15: one ATOM1_RETURN and
 * two ATOM_RETURN variants.
 */
TEST_F(ValhallPacking, Atomics)
{
   /* bi_atom1_return_i64 with the AINC atomic opcode */
   bi_instr *ainc =
      bi_atom1_return_i64_to(b, bi_register(0), bi_discard(bi_register(2)),
                             bi_register(3), BI_ATOM_OPC_AINC, 2);
   CASE_ARCH(ainc, 10, 0x0069800428000042);
   CASE_ARCH(ainc, 15, 0x0328000220000082);

   /* bi_atom_return_i32 with the AXCHG atomic opcode */
   bi_instr *axchg =
      bi_atom_return_i32_to(b, bi_register(0), bi_discard(bi_register(1)),
                            bi_register(2), bi_register(3),
                            BI_ATOM_OPC_AXCHG, 1);
   CASE_ARCH(axchg, 10, 0x0120c1021bc00002);
   CASE_ARCH(axchg, 15, 0x032401c10f000002);

   /* bi_atom_return_i64 with the ACMPXCHG atomic opcode */
   bi_instr *acmpxchg =
      bi_atom_return_i64_to(b, bi_register(0), bi_register(2),
                            bi_register(6), bi_register(7),
                            BI_ATOM_OPC_ACMPXCHG, 2);
   CASE_ARCH(acmpxchg, 10, 0x0120c2182fc00006);
   CASE_ARCH(acmpxchg, 15, 0x032802cc2f000006);
}
|
||||
|
|
|
|||
|
|
@ -9,9 +9,9 @@
|
|||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#define CASE(instr, expected) \
|
||||
#define CASE_ARCH(instr, arch, expected) \
|
||||
do { \
|
||||
if (va_validate_fau(instr) != expected) { \
|
||||
if (va_validate_fau(instr, arch) != expected) { \
|
||||
fprintf(stderr, "Incorrect validation for:\n"); \
|
||||
bi_print_instr(instr, stderr); \
|
||||
fprintf(stderr, "\n"); \
|
||||
|
|
@ -19,8 +19,8 @@
|
|||
} \
|
||||
} while (0)
|
||||
|
||||
#define VALID(instr) CASE(instr, true)
|
||||
#define INVALID(instr) CASE(instr, false)
|
||||
#define VALID(instr) CASE_ARCH(instr, 10, true)
|
||||
#define INVALID(instr) CASE_ARCH(instr, 10, false)
|
||||
|
||||
class ValidateFau : public testing::Test {
|
||||
protected:
|
||||
|
|
|
|||
|
|
@ -13,9 +13,9 @@
|
|||
extern "C" {
|
||||
#endif
|
||||
|
||||
bool va_validate_fau(bi_instr *I);
|
||||
bool va_validate_fau(bi_instr *I, unsigned arch);
|
||||
void va_validate(FILE *fp, bi_context *ctx);
|
||||
void va_repair_fau(bi_builder *b, bi_instr *I);
|
||||
void va_repair_fau(bi_builder *b, bi_instr *I, unsigned arch);
|
||||
void va_fuse_add_imm(bi_instr *I);
|
||||
void va_lower_constants(bi_context *ctx, bi_instr *I, struct hash_table_u64 *counts, uint32_t min_fau_count);
|
||||
void va_count_constants(bi_context *ctx, bi_instr *I, struct hash_table_u64 *counts);
|
||||
|
|
@ -28,14 +28,15 @@ void va_gather_hsr_info(bi_context *ctx, struct pan_shader_info *info);
|
|||
uint64_t va_pack_instr(const bi_instr *I, unsigned arch);
|
||||
|
||||
static inline unsigned
|
||||
va_fau_page(enum bir_fau value)
|
||||
va_fau_page(enum bir_fau value, unsigned arch)
|
||||
{
|
||||
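   /* Uniform slots of FAU are addressed by page. The top 2 bits of the slot
    * index are the page; the remaining low bits are specified in the source.
    * Before v15 the source holds the bottom 5 bits (a 7-bit index). From v15
    * onwards the slot is shifted by 6 instead, so the source holds the
    * bottom 6 bits.
    */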
|
||||
if (value & BIR_FAU_UNIFORM) {
|
||||
unsigned value_shift = arch >= 15 ? 6 : 5;
|
||||
unsigned slot = value & ~BIR_FAU_UNIFORM;
|
||||
unsigned page = slot >> 5;
|
||||
unsigned page = slot >> value_shift;
|
||||
|
||||
assert(page <= 3);
|
||||
return page;
|
||||
|
|
@ -57,11 +58,11 @@ va_fau_page(enum bir_fau value)
|
|||
}
|
||||
|
||||
static inline unsigned
|
||||
va_select_fau_page(const bi_instr *I)
|
||||
va_select_fau_page(const bi_instr *I, unsigned arch)
|
||||
{
|
||||
bi_foreach_src(I, s) {
|
||||
if (I->src[s].type == BI_INDEX_FAU)
|
||||
return va_fau_page((enum bir_fau)I->src[s].value);
|
||||
return va_fau_page((enum bir_fau)I->src[s].value, arch);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
|
@ -77,7 +78,7 @@ struct va_stats {
|
|||
unsigned nr_fau_uniforms;
|
||||
};
|
||||
|
||||
void va_count_instr_stats(bi_instr *I, struct va_stats *stats);
|
||||
void va_count_instr_stats(bi_instr *I, unsigned arch, struct va_stats *stats);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern C */
|
||||
|
|
|
|||
|
|
@ -77,6 +77,8 @@ walk_bir_shader(bi_context *ctx, struct pan_shader_info *info)
|
|||
if (instr->sample == BI_SAMPLE_CENTROID)
|
||||
info->fs.hsr.centroid_interpolation = true;
|
||||
FALLTHROUGH;
|
||||
case BI_OPCODE_LD_VAR_BUF_FLAT:
|
||||
case BI_OPCODE_LD_VAR_BUF_FLAT_IMM:
|
||||
case BI_OPCODE_LD_VAR_FLAT:
|
||||
case BI_OPCODE_LD_VAR_FLAT_IMM:
|
||||
if (!found_atest)
|
||||
|
|
|
|||
|
|
@ -520,7 +520,7 @@ va_assign_slots(bi_context *ctx)
|
|||
|
||||
bi_foreach_instr_global(ctx, I) {
|
||||
if (I->op == BI_OPCODE_BARRIER) {
|
||||
I->slot = 7;
|
||||
I->slot = (ctx->arch >= 15) ? VA_SLOT_V15_SLOT7 : VA_SLOT_SLOT7;
|
||||
} else if (I->op == BI_OPCODE_ZS_EMIT || I->op == BI_OPCODE_ATEST) {
|
||||
I->slot = 0;
|
||||
} else if (bi_get_opcode_props(I)->message) {
|
||||
|
|
|
|||
|
|
@ -211,7 +211,7 @@ va_resolve_constant(bi_builder *b, uint32_t value, struct va_src_info info,
|
|||
static uint32_t
|
||||
va_resolve_swizzles(bi_context *ctx, bi_instr *I, unsigned s)
|
||||
{
|
||||
struct va_src_info info = va_src_info(I->op, s);
|
||||
struct va_src_info info = va_src_info(I->op, s, ctx->arch);
|
||||
uint32_t value = I->src[s].value;
|
||||
enum bi_swizzle swz = I->src[s].swizzle;
|
||||
|
||||
|
|
@ -257,9 +257,10 @@ va_lower_constants(bi_context *ctx, bi_instr *I, struct hash_table_u64 *counts,
|
|||
/* abs(#c) is pointless, but -#c occurs in transcendental sequences */
|
||||
assert(!I->src[s].abs && "redundant .abs modifier");
|
||||
|
||||
bool is_signed = valhall_opcodes[I->op].is_signed;
|
||||
bool staging = (s < valhall_opcodes[I->op].nr_staging_srcs);
|
||||
struct va_src_info info = va_src_info(I->op, s);
|
||||
bool is_signed = get_valhall_opcode(I->op, ctx->arch).is_signed;
|
||||
bool staging =
|
||||
(s < get_valhall_opcode(I->op, ctx->arch).nr_staging_srcs);
|
||||
struct va_src_info info = va_src_info(I->op, s, ctx->arch);
|
||||
const uint32_t value = va_resolve_swizzles(ctx, I, s);
|
||||
|
||||
const uint32_t count = (uintptr_t)_mesa_hash_table_u64_search(counts, value);
|
||||
|
|
@ -294,12 +295,13 @@ va_count_constants(bi_context *ctx, bi_instr *I, struct hash_table_u64 *counts)
|
|||
if (I->src[s].type != BI_INDEX_CONSTANT)
|
||||
continue;
|
||||
|
||||
const bool staging = (s < valhall_opcodes[I->op].nr_staging_srcs);
|
||||
const bool staging =
|
||||
(s < get_valhall_opcode(I->op, ctx->arch).nr_staging_srcs);
|
||||
if (staging)
|
||||
continue;
|
||||
|
||||
bool is_signed = valhall_opcodes[I->op].is_signed;
|
||||
struct va_src_info info = va_src_info(I->op, s);
|
||||
bool is_signed = get_valhall_opcode(I->op, ctx->arch).is_signed;
|
||||
struct va_src_info info = va_src_info(I->op, s, ctx->arch);
|
||||
uint32_t value = va_resolve_swizzles(ctx, I, s);
|
||||
|
||||
bi_index cons = va_lookup_constant(value, info, is_signed);
|
||||
|
|
|
|||
|
|
@ -78,7 +78,7 @@ va_lower_split_64bit(bi_context *ctx)
|
|||
if (bi_is_null(I->src[s]) || s >= 4)
|
||||
continue;
|
||||
|
||||
struct va_src_info info = va_src_info(I->op, s);
|
||||
struct va_src_info info = va_src_info(I->op, s, ctx->arch);
|
||||
|
||||
/* Only split if the instruction expects 64-bit inputs as two separate
|
||||
* sources. */
|
||||
|
|
|
|||
|
|
@ -179,7 +179,7 @@ va_mark_last(bi_context *ctx)
|
|||
break;
|
||||
|
||||
/* Only need to unmark split registers. */
|
||||
if (va_src_info(I->op, s).size == VA_SIZE_64 &&
|
||||
if (va_src_info(I->op, s, ctx->arch).size == VA_SIZE_64 &&
|
||||
bi_count_read_registers(I, s) == 1) {
|
||||
bool both_discard = I->src[s].discard && I->src[s + 1].discard;
|
||||
|
||||
|
|
|
|||
|
|
@ -286,7 +286,7 @@ va_fuse_cmp(bi_context *ctx, bi_instr **lut, const BITSET_WORD *multiple,
|
|||
static bool
|
||||
va_propagate_replicate_wide(bi_context *ctx, bi_instr **lut, bi_instr *I)
|
||||
{
|
||||
struct va_opcode_info info = valhall_opcodes[I->op];
|
||||
struct va_opcode_info info = get_valhall_opcode(I->op, ctx->arch);
|
||||
bool progress = false;
|
||||
|
||||
bi_foreach_ssa_src(I, s) {
|
||||
|
|
|
|||
|
|
@ -74,6 +74,15 @@ va_pack_reg(const bi_instr *I, bi_index idx)
|
|||
return idx.value;
|
||||
}
|
||||
|
||||
static unsigned
|
||||
va_pack_reg_v15(const bi_instr *I, bi_index idx)
|
||||
{
|
||||
pack_assert(I, idx.type == BI_INDEX_REGISTER);
|
||||
pack_assert(I, idx.value < 128);
|
||||
|
||||
return idx.value;
|
||||
}
|
||||
|
||||
static unsigned
|
||||
va_pack_fau_special(const bi_instr *I, enum bir_fau fau)
|
||||
{
|
||||
|
|
@ -124,6 +133,21 @@ va_pack_fau_64(const bi_instr *I, bi_index idx)
|
|||
return (0x7 << 5) | (va_pack_fau_special(I, idx.value) << 1);
|
||||
}
|
||||
|
||||
static unsigned
|
||||
va_pack_fau_64_v15(const bi_instr *I, bi_index idx)
|
||||
{
|
||||
pack_assert(I, idx.type == BI_INDEX_FAU);
|
||||
|
||||
unsigned val = (idx.value & BITFIELD_MASK(6));
|
||||
|
||||
if (idx.value & BIR_FAU_IMMEDIATE)
|
||||
return (0x7 << 6) | (val << 1);
|
||||
else if (idx.value & BIR_FAU_UNIFORM)
|
||||
return (0x2 << 7) | (val << 1);
|
||||
else
|
||||
return (0xf << 5) | (va_pack_fau_special(I, idx.value) << 1);
|
||||
}
|
||||
|
||||
static unsigned
|
||||
va_pack_src(const bi_instr *I, unsigned s)
|
||||
{
|
||||
|
|
@ -142,6 +166,33 @@ va_pack_src(const bi_instr *I, unsigned s)
|
|||
invalid_instruction(I, "type of source %u", s);
|
||||
}
|
||||
|
||||
static uint64_t
|
||||
va_pack_src_v15(const bi_instr *I, unsigned s, unsigned loc)
|
||||
{
|
||||
bi_index idx = I->src[s];
|
||||
|
||||
uint64_t hex = 0;
|
||||
uint64_t regval = 0;
|
||||
|
||||
if (idx.type == BI_INDEX_REGISTER) {
|
||||
regval = va_pack_reg_v15(I, idx);
|
||||
if (idx.discard)
|
||||
regval |= (1 << 7);
|
||||
} else if (idx.type == BI_INDEX_FAU) {
|
||||
pack_assert(I, idx.offset <= 1);
|
||||
regval = va_pack_fau_64_v15(I, idx) | idx.offset;
|
||||
} else
|
||||
invalid_instruction(I, "type of source %u", s);
|
||||
|
||||
uint64_t low8 = regval & 0xff;
|
||||
uint64_t high1 = (regval >> 8) & 0x1;
|
||||
|
||||
hex |= (low8 << (8 * loc));
|
||||
hex |= (high1 << (48 + loc));
|
||||
|
||||
return hex;
|
||||
}
|
||||
|
||||
static unsigned
|
||||
va_pack_wrmask(const bi_instr *I)
|
||||
{
|
||||
|
|
@ -211,6 +262,20 @@ va_pack_dest(const bi_instr *I)
|
|||
return va_pack_reg(I, I->dest[0]) | (va_pack_wrmask(I) << 6);
|
||||
}
|
||||
|
||||
static unsigned
|
||||
va_pack_dest_v15(const bi_instr *I)
|
||||
{
|
||||
assert(I->nr_dests);
|
||||
switch (I->op) {
|
||||
case BI_OPCODE_SHADDX_S64:
|
||||
case BI_OPCODE_SHADDX_U64:
|
||||
/* 64 bit dest has a 0x0 wrmask */
|
||||
return va_pack_reg_v15(I, I->dest[0]);
|
||||
default:
|
||||
return va_pack_reg_v15(I, I->dest[0]) | (va_pack_wrmask(I) << 13);
|
||||
}
|
||||
}
|
||||
|
||||
static enum va_widen
|
||||
va_pack_widen_f32(const bi_instr *I, enum bi_swizzle swz)
|
||||
{
|
||||
|
|
@ -454,10 +519,22 @@ va_pack_rhadd(const bi_instr *I)
|
|||
}
|
||||
}
|
||||
|
||||
static uint64_t
|
||||
va_pack_clamp_special_round_v15(const bi_instr *I)
|
||||
{
|
||||
pack_assert(I, I->special < 4);
|
||||
if (I->special == BI_SPECIAL_N && I->round == BI_ROUND_RTZ)
|
||||
return 0x4;
|
||||
else if (I->special)
|
||||
return 0x4 | I->special;
|
||||
else
|
||||
return I->clamp;
|
||||
}
|
||||
|
||||
static uint64_t
|
||||
va_pack_alu(const bi_instr *I, unsigned arch)
|
||||
{
|
||||
struct va_opcode_info info = valhall_opcodes[I->op];
|
||||
struct va_opcode_info info = get_valhall_opcode(I->op, arch);
|
||||
uint64_t hex = 0;
|
||||
|
||||
switch (I->op) {
|
||||
|
|
@ -467,25 +544,25 @@ va_pack_alu(const bi_instr *I, unsigned arch)
|
|||
case BI_OPCODE_FREXPM_F32:
|
||||
case BI_OPCODE_FREXPM_V2F16:
|
||||
if (I->sqrt)
|
||||
hex |= 1ull << 24;
|
||||
hex |= 1ull << ((arch >= 15) ? 30 : 24);
|
||||
if (I->log)
|
||||
hex |= 1ull << 25;
|
||||
hex |= 1ull << ((arch >= 15) ? 31 : 25);
|
||||
break;
|
||||
|
||||
case BI_OPCODE_FLUSH_F32:
|
||||
case BI_OPCODE_FLUSH_V2F16:
|
||||
hex |= I->nan_mode << 8;
|
||||
hex |= I->nan_mode << ((arch >= 15) ? 30 : 8);
|
||||
if (I->ftz)
|
||||
hex |= 1ull << 10;
|
||||
hex |= 1ull << ((arch >= 15) ? 32 : 10);
|
||||
if (I->flush_inf)
|
||||
hex |= 1ull << 11;
|
||||
hex |= 1ull << ((arch >= 15) ? 33 : 11);
|
||||
break;
|
||||
|
||||
/* Add mux type */
|
||||
case BI_OPCODE_MUX_I32:
|
||||
case BI_OPCODE_MUX_V2I16:
|
||||
case BI_OPCODE_MUX_V4I8:
|
||||
hex |= (uint64_t)I->mux << 32;
|
||||
hex |= (uint64_t)I->mux << ((arch >= 15) ? 34 : 32);
|
||||
break;
|
||||
|
||||
/* Add .eq flag */
|
||||
|
|
@ -497,7 +574,7 @@ va_pack_alu(const bi_instr *I, unsigned arch)
|
|||
hex |= (1ull << 36);
|
||||
|
||||
if (I->op == BI_OPCODE_BRANCHZI)
|
||||
hex |= (0x1ull << 40); /* Absolute */
|
||||
hex |= (0x1ull << ((arch >= 15) ? 31 : 40)); /* Absolute */
|
||||
else
|
||||
hex |= ((uint64_t)I->branch_offset & BITFIELD_MASK(27)) << 8;
|
||||
|
||||
|
|
@ -513,7 +590,46 @@ va_pack_alu(const bi_instr *I, unsigned arch)
|
|||
case BI_OPCODE_RSHIFT_XOR_I32:
|
||||
case BI_OPCODE_RSHIFT_XOR_V2I16:
|
||||
case BI_OPCODE_RSHIFT_XOR_V4I8:
|
||||
hex |= (uint64_t)I->arithmetic << 34;
|
||||
if (arch >= 15) {
|
||||
/* Rewrite exact to ARSHIFT */
|
||||
if (I->arithmetic) {
|
||||
switch (I->op) {
|
||||
case BI_OPCODE_RSHIFT_AND_I32:
|
||||
case BI_OPCODE_RSHIFT_AND_V2I16:
|
||||
case BI_OPCODE_RSHIFT_AND_V4I8: {
|
||||
uint64_t arshift_and_op = (0xcULL << 30);
|
||||
/* Check that we can safely overwrite opcode */
|
||||
pack_assert(I, ((info.exact & (0xfULL << 30)) |
|
||||
arshift_and_op) == arshift_and_op);
|
||||
hex |= arshift_and_op;
|
||||
break;
|
||||
}
|
||||
case BI_OPCODE_RSHIFT_OR_I32:
|
||||
case BI_OPCODE_RSHIFT_OR_V2I16:
|
||||
case BI_OPCODE_RSHIFT_OR_V4I8: {
|
||||
uint64_t arshift_or_op = (0xdULL << 30);
|
||||
/* Check that we can safely overwrite opcode */
|
||||
pack_assert(I, ((info.exact & (0xfULL << 30)) | arshift_or_op) ==
|
||||
arshift_or_op);
|
||||
hex |= arshift_or_op;
|
||||
break;
|
||||
}
|
||||
case BI_OPCODE_RSHIFT_XOR_I32:
|
||||
case BI_OPCODE_RSHIFT_XOR_V2I16:
|
||||
case BI_OPCODE_RSHIFT_XOR_V4I8: {
|
||||
uint64_t arshift_xor_op = (0xbULL << 30);
|
||||
/* Check that we can safely overwrite opcode */
|
||||
pack_assert(I, ((info.exact & (0xfULL << 30)) |
|
||||
arshift_xor_op) == arshift_xor_op);
|
||||
hex |= arshift_xor_op;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
UNREACHABLE("RSHIFT->ARSHIFT");
|
||||
}
|
||||
}
|
||||
} else
|
||||
hex |= (uint64_t)I->arithmetic << 34;
|
||||
break;
|
||||
|
||||
case BI_OPCODE_LEA_BUF_IMM:
|
||||
|
|
@ -564,8 +680,12 @@ va_pack_alu(const bi_instr *I, unsigned arch)
|
|||
}
|
||||
|
||||
hex |= ((uint64_t)va_pack_source_format(I)) << 24;
|
||||
hex |= ((uint64_t)I->update) << 36;
|
||||
hex |= ((uint64_t)I->sample) << 38;
|
||||
hex |= ((uint64_t)I->update) << ((arch >= 15) ? 35 : 36);
|
||||
hex |= ((uint64_t)I->sample) << ((arch >= 15) ? 37 : 38);
|
||||
break;
|
||||
|
||||
case BI_OPCODE_LD_VAR_BUF_FLAT_IMM:
|
||||
hex |= ((uint64_t)I->index) << 8;
|
||||
break;
|
||||
|
||||
case BI_OPCODE_LD_ATTR_IMM:
|
||||
|
|
@ -599,20 +719,18 @@ va_pack_alu(const bi_instr *I, unsigned arch)
|
|||
break;
|
||||
}
|
||||
|
||||
/* FMA_RSCALE.f32 special modes treated as extra opcodes */
|
||||
if (I->op == BI_OPCODE_FMA_RSCALE_F32) {
|
||||
pack_assert(I, I->special < 4);
|
||||
hex |= ((uint64_t)I->special) << 48;
|
||||
}
|
||||
|
||||
/* Add the normal destination or a placeholder. Staging destinations are
|
||||
* added elsewhere, as they require special handling for control fields.
|
||||
*/
|
||||
if (info.has_dest && info.nr_staging_dests == 0) {
|
||||
hex |= (uint64_t)va_pack_dest(I) << 40;
|
||||
if (arch >= 15)
|
||||
hex |= (uint64_t)va_pack_dest_v15(I) << 40;
|
||||
else
|
||||
hex |= (uint64_t)va_pack_dest(I) << 40;
|
||||
} else if (info.nr_staging_dests == 0 && info.nr_staging_srcs == 0) {
|
||||
pack_assert(I, I->nr_dests == 0);
|
||||
hex |= 0xC0ull << 40; /* Placeholder */
|
||||
if (arch < 15)
|
||||
hex |= 0xC0ull << 40; /* Placeholder */
|
||||
}
|
||||
|
||||
bool swap12 = va_swap_12(I->op);
|
||||
|
|
@ -627,7 +745,10 @@ va_pack_alu(const bi_instr *I, unsigned arch)
|
|||
enum va_size size = src_info.size;
|
||||
|
||||
bi_index src = I->src[logical_i + src_offset];
|
||||
hex |= (uint64_t)va_pack_src(I, logical_i + src_offset) << (8 * i);
|
||||
if (arch >= 15)
|
||||
hex |= va_pack_src_v15(I, logical_i + src_offset, i);
|
||||
else
|
||||
hex |= (uint64_t)va_pack_src(I, logical_i + src_offset) << (8 * i);
|
||||
|
||||
if (src_info.notted) {
|
||||
if (src.neg)
|
||||
|
|
@ -636,10 +757,15 @@ va_pack_alu(const bi_instr *I, unsigned arch)
|
|||
unsigned neg_offs = 32 + 2 + ((2 - i) * 2);
|
||||
unsigned abs_offs = 33 + 2 + ((2 - i) * 2);
|
||||
|
||||
if (src.neg)
|
||||
hex |= 1ull << neg_offs;
|
||||
if (src.abs)
|
||||
hex |= 1ull << abs_offs;
|
||||
if (arch >= 15 && I->op == BI_OPCODE_FMA_RSCALE_F32 && i == 2) {
|
||||
if (src.neg)
|
||||
hex |= 1ull << (neg_offs + 1);
|
||||
} else {
|
||||
if (src.neg)
|
||||
hex |= 1ull << neg_offs;
|
||||
if (src.abs)
|
||||
hex |= 1ull << abs_offs;
|
||||
}
|
||||
} else {
|
||||
if (src.neg)
|
||||
invalid_instruction(I, "negate");
|
||||
|
|
@ -659,8 +785,8 @@ va_pack_alu(const bi_instr *I, unsigned arch)
|
|||
unsigned offs = (i == 1) ? 26 : 36;
|
||||
hex |= (uint64_t)va_pack_widen(I, src.swizzle, src_info.size) << offs;
|
||||
} else if (src_info.lane) {
|
||||
unsigned offs = (I->op == BI_OPCODE_MKVEC_V2I8) ?
|
||||
((i == 0) ? 38 : 36) : ((i == 0) ? 28 : 26);
|
||||
unsigned offs = (I->op == BI_OPCODE_MKVEC_V2I8) ? ((i == 0) ? 38 : 36)
|
||||
: ((i == 0) ? 28 : 26);
|
||||
|
||||
if (src_info.size == VA_SIZE_16) {
|
||||
hex |= (src.swizzle == BI_SWIZZLE_H1 ? 1 : 0) << offs;
|
||||
|
|
@ -673,7 +799,25 @@ va_pack_alu(const bi_instr *I, unsigned arch)
|
|||
} else if (src_info.lanes) {
|
||||
pack_assert(I, src_info.size == VA_SIZE_8);
|
||||
pack_assert(I, i == 1);
|
||||
hex |= (uint64_t)va_pack_shift_lanes(I, src.swizzle) << 26;
|
||||
if (arch >= 15 && I->op == BI_OPCODE_CLPER_I32) {
|
||||
switch (src.swizzle) {
|
||||
case BI_SWIZZLE_B00:
|
||||
hex |= 0x0ULL << 28;
|
||||
break;
|
||||
case BI_SWIZZLE_B11:
|
||||
hex |= 0x1ULL << 28;
|
||||
break;
|
||||
case BI_SWIZZLE_B22:
|
||||
hex |= 0x2ULL << 28;
|
||||
break;
|
||||
case BI_SWIZZLE_B33:
|
||||
hex |= 0x3ULL << 28;
|
||||
break;
|
||||
default:
|
||||
invalid_instruction(I, "lane shift");
|
||||
}
|
||||
} else
|
||||
hex |= (uint64_t)va_pack_shift_lanes(I, src.swizzle) << 26;
|
||||
} else if (src_info.combine) {
|
||||
/* Treat as swizzle, subgroup ops not yet supported */
|
||||
pack_assert(I, src_info.size == VA_SIZE_32);
|
||||
|
|
@ -689,17 +833,33 @@ va_pack_alu(const bi_instr *I, unsigned arch)
|
|||
}
|
||||
|
||||
if (info.saturate)
|
||||
hex |= (uint64_t)I->saturate << 30;
|
||||
if (info.rhadd)
|
||||
hex |= (uint64_t)I->saturate << ((arch >= 15) ? 25 : 30);
|
||||
if (info.rhadd) {
|
||||
pack_assert(I, arch < 15);
|
||||
hex |= va_pack_rhadd(I);
|
||||
if (info.clamp)
|
||||
hex |= (uint64_t)I->clamp << 32;
|
||||
if (info.round_mode)
|
||||
hex |= (uint64_t)I->round << 30;
|
||||
}
|
||||
/* FMA_RSCALE.f32 special modes treated as extra opcodes */
|
||||
if (I->op == BI_OPCODE_FMA_RSCALE_F32) {
|
||||
if (arch >= 15) {
|
||||
hex |= va_pack_clamp_special_round_v15(I) << 32;
|
||||
} else {
|
||||
pack_assert(I, I->special < 4);
|
||||
hex |= ((uint64_t)I->special) << 48;
|
||||
if (info.clamp)
|
||||
hex |= (uint64_t)I->clamp << 32;
|
||||
if (info.round_mode && I->round == BI_ROUND_RTZ)
|
||||
hex |= (uint64_t)0x1 << 50;
|
||||
}
|
||||
} else {
|
||||
if (info.clamp)
|
||||
hex |= (uint64_t)I->clamp << ((arch >= 15) ? 30 : 32);
|
||||
if (info.round_mode)
|
||||
hex |= (uint64_t)I->round << ((arch >= 15) ? 32 : 30);
|
||||
}
|
||||
if (info.condition)
|
||||
hex |= (uint64_t)I->cmpf << 32;
|
||||
hex |= (uint64_t)I->cmpf << ((arch >= 15) ? 33 : 32);
|
||||
if (info.result_type)
|
||||
hex |= (uint64_t)I->result_type << 30;
|
||||
hex |= (uint64_t)I->result_type << ((arch >= 15) ? 24 : 30);
|
||||
|
||||
return hex;
|
||||
}
|
||||
|
|
@ -748,7 +908,8 @@ va_pack_load(const bi_instr *I, bool buffer_descriptor)
|
|||
VA_LOAD_LANE_96_BIT_IDENTITY, VA_LOAD_LANE_128_BIT_IDENTITY,
|
||||
};
|
||||
|
||||
unsigned memory_size = (valhall_opcodes[I->op].exact >> 27) & 0x7;
|
||||
/* TODO hack */
|
||||
unsigned memory_size = (get_valhall_opcode(I->op, 10).exact >> 27) & 0x7;
|
||||
uint64_t hex = (uint64_t)load_lane_identity[memory_size] << 36;
|
||||
|
||||
// unsigned
|
||||
|
|
@ -765,6 +926,26 @@ va_pack_load(const bi_instr *I, bool buffer_descriptor)
|
|||
|
||||
return hex;
|
||||
}
|
||||
|
||||
static uint64_t
|
||||
va_pack_load_v15(const bi_instr *I, bool buffer_descriptor)
|
||||
{
|
||||
/* This implicitly means identity: VA_LOAD_LANE_8_BIT_B0 for i8 (bits[28;27])
|
||||
* and VA_LOAD_LANE_16_BIT_H0 for i16 (bit[27]) */
|
||||
uint64_t hex = 0;
|
||||
|
||||
if (!buffer_descriptor)
|
||||
hex |= va_pack_byte_offset(I);
|
||||
|
||||
hex |= va_pack_src_v15(I, 0, 0);
|
||||
hex |= (uint64_t)I->mem_access << 24;
|
||||
|
||||
if (buffer_descriptor)
|
||||
hex |= va_pack_src_v15(I, 1, 1);
|
||||
|
||||
return hex;
|
||||
}
|
||||
|
||||
static uint64_t
|
||||
va_pack_store(const bi_instr *I)
|
||||
{
|
||||
|
|
@ -779,6 +960,20 @@ va_pack_store(const bi_instr *I)
|
|||
return hex;
|
||||
}
|
||||
|
||||
static uint64_t
|
||||
va_pack_store_v15(const bi_instr *I)
|
||||
{
|
||||
uint64_t hex = 0;
|
||||
|
||||
va_validate_register_pair(I, 1);
|
||||
hex |= va_pack_src_v15(I, 1, 0);
|
||||
hex |= I->mem_access << 24;
|
||||
|
||||
hex |= va_pack_byte_offset(I);
|
||||
|
||||
return hex;
|
||||
}
|
||||
|
||||
static enum va_lod_mode
|
||||
va_pack_lod_mode(const bi_instr *I)
|
||||
{
|
||||
|
|
@ -798,27 +993,6 @@ va_pack_lod_mode(const bi_instr *I)
|
|||
invalid_instruction(I, "LOD mode");
|
||||
}
|
||||
|
||||
static enum va_register_type
|
||||
va_pack_register_type(const bi_instr *I)
|
||||
{
|
||||
switch (I->register_format) {
|
||||
case BI_REGISTER_FORMAT_F16:
|
||||
case BI_REGISTER_FORMAT_F32:
|
||||
return VA_REGISTER_TYPE_F;
|
||||
|
||||
case BI_REGISTER_FORMAT_U16:
|
||||
case BI_REGISTER_FORMAT_U32:
|
||||
return VA_REGISTER_TYPE_U;
|
||||
|
||||
case BI_REGISTER_FORMAT_S16:
|
||||
case BI_REGISTER_FORMAT_S32:
|
||||
return VA_REGISTER_TYPE_S;
|
||||
|
||||
default:
|
||||
invalid_instruction(I, "register type");
|
||||
}
|
||||
}
|
||||
|
||||
static enum va_register_format
|
||||
va_pack_register_format(const bi_instr *I)
|
||||
{
|
||||
|
|
@ -842,13 +1016,45 @@ va_pack_register_format(const bi_instr *I)
|
|||
}
|
||||
}
|
||||
|
||||
/*
 * Build the bits that encode a "null" source at source location `loc` on v15.
 *
 * The null source is the 9-bit value 0x1c0. It is split into two fields:
 * the low 8 bits are placed at bit (8 * loc), and the ninth bit is placed
 * at bit (48 + loc).
 *
 * Returns the bits to OR into the instruction word.
 */
static uint64_t
va_pack_src_null_v15(unsigned loc)
{
   const uint64_t null_src = 0x1c0;

   const uint64_t low_field = (null_src & 0xff) << (8 * loc);
   const uint64_t high_field = ((null_src >> 8) & 0x1) << (48 + loc);

   return low_field | high_field;
}
|
||||
|
||||
/*
 * Translate a staging-register control value into its v15 form.
 *
 * Input:  bit 0 = read flag, bit 1 = write flag; other bits are ignored.
 * Output: 0x2 for read-only, 0x3 for read+write, and 0x0 when the read flag
 *         is clear (including the write-only case).
 */
static unsigned
va_repack_sr_control_v15(unsigned sr_control)
{
   const bool reads = (sr_control & 0x1) != 0;
   const bool writes = (sr_control & 0x2) != 0;

   /* Without the read flag nothing is encoded, even if write is set. */
   if (!reads)
      return 0x0;

   return writes ? 0x3 : 0x2;
}
|
||||
|
||||
uint64_t
|
||||
va_pack_instr(const bi_instr *I, unsigned arch)
|
||||
{
|
||||
struct va_opcode_info info = valhall_opcodes[I->op];
|
||||
struct va_opcode_info info = get_valhall_opcode(I->op, arch);
|
||||
|
||||
uint64_t hex = info.exact | (((uint64_t)I->flow) << 59);
|
||||
hex |= ((uint64_t)va_select_fau_page(I)) << 57;
|
||||
uint64_t hex =
|
||||
info.exact | (((uint64_t)I->flow) << ((arch >= 15) ? 58 : 59));
|
||||
hex |= ((uint64_t)va_select_fau_page(I, arch)) << ((arch >= 15) ? 62 : 57);
|
||||
|
||||
if (info.slot)
|
||||
hex |= ((uint64_t)I->slot << 30);
|
||||
|
|
@ -860,14 +1066,60 @@ va_pack_instr(const bi_instr *I, unsigned arch)
|
|||
unsigned count =
|
||||
read ? bi_count_read_registers(I, 0) : bi_count_write_registers(I, 0);
|
||||
|
||||
hex |= ((uint64_t)count << 33);
|
||||
hex |= (uint64_t)va_pack_reg(I, sr) << 40;
|
||||
hex |= ((uint64_t)info.sr_control << 46);
|
||||
hex |= ((uint64_t)count << ((arch >= 15) ? 32 : 33));
|
||||
if (arch >= 15) {
|
||||
hex |= (uint64_t)va_pack_reg_v15(I, sr) << 40;
|
||||
hex |= ((uint64_t)va_repack_sr_control_v15(info.sr_control) << 38);
|
||||
} else {
|
||||
hex |= (uint64_t)va_pack_reg(I, sr) << 40;
|
||||
hex |= ((uint64_t)info.sr_control << 46);
|
||||
}
|
||||
}
|
||||
|
||||
/* On v15, some instructions require special sr_control values */
|
||||
if (arch >= 15) {
|
||||
switch (I->op) {
|
||||
case BI_OPCODE_BARRIER: {
|
||||
unsigned sr_control = va_repack_sr_control_v15(info.sr_control);
|
||||
pack_assert(I, sr_control == 0x0 || sr_control == 0x2);
|
||||
hex |= (uint64_t)0x2 << 38;
|
||||
break;
|
||||
}
|
||||
case BI_OPCODE_ATOM1_RETURN_I32:
|
||||
case BI_OPCODE_ATOM1_RETURN_I64: {
|
||||
unsigned sr_control = va_repack_sr_control_v15(info.sr_control);
|
||||
pack_assert(I, sr_control == 0x0);
|
||||
break;
|
||||
}
|
||||
case BI_OPCODE_ATOM_I32:
|
||||
case BI_OPCODE_ATOM_I64: {
|
||||
unsigned sr_control = va_repack_sr_control_v15(info.sr_control);
|
||||
pack_assert(I, sr_control == 0x2);
|
||||
break;
|
||||
}
|
||||
case BI_OPCODE_ATOM_RETURN_I32:
|
||||
case BI_OPCODE_ATOM_RETURN_I64:
|
||||
case BI_OPCODE_AXCHG_I32:
|
||||
case BI_OPCODE_AXCHG_I64:
|
||||
case BI_OPCODE_ACMPXCHG_I32:
|
||||
case BI_OPCODE_ACMPXCHG_I64: {
|
||||
unsigned sr_control = va_repack_sr_control_v15(info.sr_control);
|
||||
pack_assert(I, sr_control == 0x0 || sr_control == 0x3);
|
||||
hex |= (uint64_t)0x3 << 38;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (info.sr_write_count) {
|
||||
hex |= ((uint64_t)bi_count_write_registers(I, 0) - 1) << 36;
|
||||
hex |= ((uint64_t)va_pack_reg(I, I->dest[0])) << 16;
|
||||
hex |= ((uint64_t)bi_count_write_registers(I, 0) - 1)
|
||||
<< ((arch >= 15) ? 35 : 36);
|
||||
if (arch >= 15)
|
||||
hex |= ((uint64_t)va_pack_reg_v15(I, I->dest[0])) << 16;
|
||||
else
|
||||
hex |= ((uint64_t)va_pack_reg(I, I->dest[0])) << 16;
|
||||
}
|
||||
|
||||
if (info.vecsize)
|
||||
|
|
@ -885,7 +1137,10 @@ va_pack_instr(const bi_instr *I, unsigned arch)
|
|||
case BI_OPCODE_LOAD_I64:
|
||||
case BI_OPCODE_LOAD_I96:
|
||||
case BI_OPCODE_LOAD_I128:
|
||||
hex |= va_pack_load(I, false);
|
||||
if (arch >= 15)
|
||||
hex |= va_pack_load_v15(I, false);
|
||||
else
|
||||
hex |= va_pack_load(I, false);
|
||||
break;
|
||||
|
||||
case BI_OPCODE_LD_PKA_I8:
|
||||
|
|
@ -896,7 +1151,10 @@ va_pack_instr(const bi_instr *I, unsigned arch)
|
|||
case BI_OPCODE_LD_PKA_I64:
|
||||
case BI_OPCODE_LD_PKA_I96:
|
||||
case BI_OPCODE_LD_PKA_I128:
|
||||
hex |= va_pack_load(I, true);
|
||||
if (arch >= 15)
|
||||
hex |= va_pack_load_v15(I, true);
|
||||
else
|
||||
hex |= va_pack_load(I, true);
|
||||
break;
|
||||
|
||||
case BI_OPCODE_STORE_I8:
|
||||
|
|
@ -907,20 +1165,26 @@ va_pack_instr(const bi_instr *I, unsigned arch)
|
|||
case BI_OPCODE_STORE_I64:
|
||||
case BI_OPCODE_STORE_I96:
|
||||
case BI_OPCODE_STORE_I128:
|
||||
hex |= va_pack_store(I);
|
||||
if (arch >= 15)
|
||||
hex |= va_pack_store_v15(I);
|
||||
else
|
||||
hex |= va_pack_store(I);
|
||||
break;
|
||||
|
||||
case BI_OPCODE_ATOM1_RETURN_I64:
|
||||
/* Permit omitting the destination for plain ATOM1 */
|
||||
if (!bi_count_write_registers(I, 0)) {
|
||||
if (arch < 15 && !bi_count_write_registers(I, 0)) {
|
||||
hex |= (0x40ull << 40); // fake read
|
||||
}
|
||||
|
||||
/* 64-bit source */
|
||||
va_validate_register_pair(I, 0);
|
||||
hex |= (uint64_t)va_pack_src(I, 0) << 0;
|
||||
if (arch >= 15)
|
||||
hex |= va_pack_src_v15(I, 0, 0);
|
||||
else
|
||||
hex |= (uint64_t)va_pack_src(I, 0) << 0;
|
||||
hex |= va_pack_byte_offset_8(I);
|
||||
hex |= ((uint64_t)va_pack_atom_opc_1(I)) << 22;
|
||||
hex |= ((uint64_t)va_pack_atom_opc_1(I)) << ((arch >= 15) ? 24 : 22);
|
||||
break;
|
||||
|
||||
case BI_OPCODE_ACMPXCHG_I64:
|
||||
|
|
@ -929,29 +1193,43 @@ va_pack_instr(const bi_instr *I, unsigned arch)
|
|||
case BI_OPCODE_ATOM_RETURN_I64:
|
||||
/* 64-bit source */
|
||||
va_validate_register_pair(I, 1);
|
||||
hex |= (uint64_t)va_pack_src(I, 1) << 0;
|
||||
if (arch >= 15)
|
||||
hex |= va_pack_src_v15(I, 1, 0);
|
||||
else
|
||||
hex |= (uint64_t)va_pack_src(I, 1) << 0;
|
||||
hex |= va_pack_byte_offset_8(I);
|
||||
hex |= ((uint64_t)va_pack_atom_opc(I)) << 22;
|
||||
hex |= ((uint64_t)va_pack_atom_opc(I)) << ((arch >= 15) ? 24 : 22);
|
||||
|
||||
if (I->op == BI_OPCODE_ATOM_RETURN_I64)
|
||||
hex |= (0xc0ull << 40); // flags
|
||||
if (arch >= 15) {
|
||||
if (I->atom_opc == BI_ATOM_OPC_ACMPXCHG) {
|
||||
/* Change bits [51;50] to be ACMPXCHG */
|
||||
pack_assert(I, ((hex >> 50) & 0b11) == 0b01);
|
||||
hex ^= (0b11ull << 50);
|
||||
}
|
||||
} else {
|
||||
if (I->op == BI_OPCODE_ATOM_RETURN_I64)
|
||||
hex |= (0xc0ull << 40); // flags
|
||||
|
||||
if (I->atom_opc == BI_ATOM_OPC_ACMPXCHG)
|
||||
hex |= (1 << 26); /* .compare */
|
||||
if (I->atom_opc == BI_ATOM_OPC_ACMPXCHG)
|
||||
hex |= (1 << 26); /* .compare */
|
||||
}
|
||||
|
||||
break;
|
||||
|
||||
case BI_OPCODE_ATOM1_RETURN_I32:
|
||||
/* Permit omitting the destination for plain ATOM1 */
|
||||
if (!bi_count_write_registers(I, 0)) {
|
||||
if (arch < 15 && !bi_count_write_registers(I, 0)) {
|
||||
hex |= (0x40ull << 40); // fake read
|
||||
}
|
||||
|
||||
/* 64-bit source */
|
||||
va_validate_register_pair(I, 0);
|
||||
hex |= (uint64_t)va_pack_src(I, 0) << 0;
|
||||
if (arch >= 15)
|
||||
hex |= va_pack_src_v15(I, 0, 0);
|
||||
else
|
||||
hex |= (uint64_t)va_pack_src(I, 0) << 0;
|
||||
hex |= va_pack_byte_offset_8(I);
|
||||
hex |= ((uint64_t)va_pack_atom_opc_1(I)) << 22;
|
||||
hex |= ((uint64_t)va_pack_atom_opc_1(I)) << ((arch >= 15) ? 24 : 22);
|
||||
break;
|
||||
|
||||
case BI_OPCODE_ACMPXCHG_I32:
|
||||
|
|
@ -960,41 +1238,67 @@ va_pack_instr(const bi_instr *I, unsigned arch)
|
|||
case BI_OPCODE_ATOM_RETURN_I32:
|
||||
/* 64-bit source */
|
||||
va_validate_register_pair(I, 1);
|
||||
hex |= (uint64_t)va_pack_src(I, 1) << 0;
|
||||
if (arch >= 15)
|
||||
hex |= va_pack_src_v15(I, 1, 0);
|
||||
else
|
||||
hex |= (uint64_t)va_pack_src(I, 1) << 0;
|
||||
hex |= va_pack_byte_offset_8(I);
|
||||
hex |= ((uint64_t)va_pack_atom_opc(I)) << 22;
|
||||
hex |= ((uint64_t)va_pack_atom_opc(I)) << ((arch >= 15) ? 24 : 22);
|
||||
|
||||
if (I->op == BI_OPCODE_ATOM_RETURN_I32)
|
||||
hex |= (0xc0ull << 40); // flags
|
||||
if (arch >= 15) {
|
||||
if (I->atom_opc == BI_ATOM_OPC_ACMPXCHG) {
|
||||
/* Change bits [51;50] to be ACMPXCHG */
|
||||
pack_assert(I, ((hex >> 50) & 0b11) == 0b01);
|
||||
hex ^= (0b11ull << 50);
|
||||
}
|
||||
} else {
|
||||
if (I->op == BI_OPCODE_ATOM_RETURN_I32)
|
||||
hex |= (0xc0ull << 40); // flags
|
||||
|
||||
if (I->atom_opc == BI_ATOM_OPC_ACMPXCHG)
|
||||
hex |= (1 << 26); /* .compare */
|
||||
if (I->atom_opc == BI_ATOM_OPC_ACMPXCHG)
|
||||
hex |= (1 << 26); /* .compare */
|
||||
}
|
||||
|
||||
break;
|
||||
|
||||
case BI_OPCODE_LD_CVT:
|
||||
hex |= (uint64_t)va_pack_src(I, 0);
|
||||
if (arch >= 15)
|
||||
hex |= va_pack_src_v15(I, 0, 0);
|
||||
else
|
||||
hex |= (uint64_t)va_pack_src(I, 0);
|
||||
hex |= va_pack_byte_offset(I);
|
||||
|
||||
/* Conversion descriptor */
|
||||
hex |= (uint64_t)va_pack_src(I, 2) << 16;
|
||||
hex |= (uint64_t)I->mem_access << 37;
|
||||
if (arch >= 15)
|
||||
hex |= va_pack_src_v15(I, 2, 2);
|
||||
else
|
||||
hex |= (uint64_t)va_pack_src(I, 2) << 16;
|
||||
hex |= (uint64_t)I->mem_access << ((arch >= 15) ? 35 : 37);
|
||||
break;
|
||||
|
||||
case BI_OPCODE_ST_CVT:
|
||||
/* Staging read */
|
||||
va_validate_register_pair(I, 1);
|
||||
hex |= (uint64_t)va_pack_src(I, 1) << 0;
|
||||
if (arch >= 15)
|
||||
hex |= va_pack_src_v15(I, 1, 0);
|
||||
else
|
||||
hex |= (uint64_t)va_pack_src(I, 1) << 0;
|
||||
hex |= va_pack_byte_offset(I);
|
||||
|
||||
/* Conversion descriptor */
|
||||
hex |= (uint64_t)va_pack_src(I, 3) << 16;
|
||||
hex |= (uint64_t)I->mem_access << 37;
|
||||
if (arch >= 15)
|
||||
hex |= va_pack_src_v15(I, 3, 2);
|
||||
else
|
||||
hex |= (uint64_t)va_pack_src(I, 3) << 16;
|
||||
hex |= (uint64_t)I->mem_access << ((arch >= 15) ? 35 : 37);
|
||||
break;
|
||||
|
||||
case BI_OPCODE_BLEND: {
|
||||
/* Source 0 - Blend descriptor (64-bit) */
|
||||
hex |= ((uint64_t)va_pack_src(I, 2)) << 0;
|
||||
if (arch >= 15)
|
||||
hex |= va_pack_src_v15(I, 2, 0);
|
||||
else
|
||||
hex |= ((uint64_t)va_pack_src(I, 2)) << 0;
|
||||
va_validate_register_pair(I, 2);
|
||||
|
||||
/* Target */
|
||||
|
|
@ -1005,7 +1309,10 @@ va_pack_instr(const bi_instr *I, unsigned arch)
|
|||
hex |= ((I->branch_offset >> 3) << 8);
|
||||
|
||||
/* Source 2 - coverage mask */
|
||||
hex |= ((uint64_t)va_pack_reg(I, I->src[1])) << 16;
|
||||
if (arch >= 15)
|
||||
hex |= va_pack_src_v15(I, 1, 2);
|
||||
else
|
||||
hex |= ((uint64_t)va_pack_reg(I, I->src[1])) << 16;
|
||||
|
||||
/* Vector size */
|
||||
unsigned vecsize = 4;
|
||||
|
|
@ -1015,7 +1322,7 @@ va_pack_instr(const bi_instr *I, unsigned arch)
|
|||
}
|
||||
|
||||
case BI_OPCODE_LD_GCLK_U64:
|
||||
hex |= va_pack_gclk(I);
|
||||
hex |= va_pack_gclk(I) << ((arch >= 15) ? 8 : 0);
|
||||
break;
|
||||
|
||||
case BI_OPCODE_TEX_GRADIENT:
|
||||
|
|
@ -1023,7 +1330,10 @@ va_pack_instr(const bi_instr *I, unsigned arch)
|
|||
case BI_OPCODE_TEX_FETCH:
|
||||
case BI_OPCODE_TEX_GATHER: {
|
||||
/* Image to read from */
|
||||
hex |= ((uint64_t)va_pack_src(I, 1)) << 0;
|
||||
if (arch >= 15)
|
||||
hex |= va_pack_src_v15(I, 1, 0);
|
||||
else
|
||||
hex |= ((uint64_t)va_pack_src(I, 1)) << 0;
|
||||
|
||||
if ((I->op == BI_OPCODE_TEX_FETCH || I->op == BI_OPCODE_TEX_GRADIENT) &&
|
||||
I->shadow)
|
||||
|
|
@ -1040,7 +1350,7 @@ va_pack_instr(const bi_instr *I, unsigned arch)
|
|||
if (I->skip)
|
||||
hex |= (1ull << 39);
|
||||
if (!bi_is_regfmt_16(I->register_format))
|
||||
hex |= (1ull << 46);
|
||||
hex |= (1ull << ((arch >= 15) ? 38 : 46));
|
||||
|
||||
if (I->op == BI_OPCODE_TEX_GRADIENT) {
|
||||
if (I->force_delta_enable)
|
||||
|
|
@ -1062,20 +1372,35 @@ va_pack_instr(const bi_instr *I, unsigned arch)
|
|||
hex |= ((uint64_t)I->fetch_component) << 14;
|
||||
}
|
||||
|
||||
hex |= (I->write_mask << 22);
|
||||
hex |= (I->write_mask << ((arch >= 15) ? 24 : 22));
|
||||
hex |= ((uint64_t)I->dimension) << 28;
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
if (!info.exact && I->op != BI_OPCODE_NOP)
|
||||
if (!info.exact && (arch >= 15 || I->op != BI_OPCODE_NOP))
|
||||
invalid_instruction(I, "opcode");
|
||||
|
||||
hex |= va_pack_alu(I, arch);
|
||||
break;
|
||||
}
|
||||
|
||||
/* On v15, some instructions require an encoded null src. */
|
||||
if (arch >= 15) {
|
||||
switch (I->op) {
|
||||
case BI_OPCODE_NOP:
|
||||
case BI_OPCODE_LD_VAR_FLAT_IMM:
|
||||
case BI_OPCODE_LD_VAR_BUF_FLAT_IMM:
|
||||
case BI_OPCODE_LD_GCLK_U64:
|
||||
case BI_OPCODE_BARRIER:
|
||||
hex |= va_pack_src_null_v15(0);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return hex;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@
|
|||
#include "valhall.h"
|
||||
|
||||
void
|
||||
va_count_instr_stats(bi_instr *I, struct va_stats *stats)
|
||||
va_count_instr_stats(bi_instr *I, unsigned arch, struct va_stats *stats)
|
||||
{
|
||||
/* Adjusted for 64-bit arithmetic */
|
||||
unsigned words = bi_count_write_registers(I, 0);
|
||||
|
|
@ -35,7 +35,7 @@ va_count_instr_stats(bi_instr *I, struct va_stats *stats)
|
|||
}
|
||||
}
|
||||
}
|
||||
switch (valhall_opcodes[I->op].unit) {
|
||||
switch (get_valhall_opcode(I->op, arch).unit) {
|
||||
/* Arithmetic is 2x slower for 64-bit than 32-bit */
|
||||
case VA_UNIT_FMA:
|
||||
stats->fma += words;
|
||||
|
|
|
|||
|
|
@ -93,7 +93,8 @@ fau_state_uniform(struct fau_state *fau, bi_index idx, enum bi_opcode op)
|
|||
}
|
||||
|
||||
static bool
|
||||
fau_state_special(struct fau_state *fau, bi_index idx, enum bi_opcode op)
|
||||
fau_state_special(struct fau_state *fau, bi_index idx, enum bi_opcode op,
|
||||
unsigned arch)
|
||||
{
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(fau->buffer); ++i) {
|
||||
bi_index buf = fau->buffer[i];
|
||||
|
|
@ -106,7 +107,7 @@ fau_state_special(struct fau_state *fau, bi_index idx, enum bi_opcode op)
|
|||
/* Instructions executed by the messaging unit should not encode WARP_ID or
|
||||
* anything from special page 3. */
|
||||
if (can_run_on_message_unit(op) &&
|
||||
(va_fau_page(idx.value) == 3 || idx.value == BIR_FAU_WARP_ID))
|
||||
(va_fau_page(idx.value, arch) == 3 || idx.value == BIR_FAU_WARP_ID))
|
||||
return false;
|
||||
|
||||
return fau->uniform_slot == -1 || can_use_two_fau_indices(op);
|
||||
|
|
@ -114,7 +115,7 @@ fau_state_special(struct fau_state *fau, bi_index idx, enum bi_opcode op)
|
|||
|
||||
static bool
|
||||
valid_src(struct fau_state *fau, unsigned fau_page, bi_index src,
|
||||
enum bi_opcode op)
|
||||
enum bi_opcode op, unsigned arch)
|
||||
{
|
||||
if (src.type != BI_INDEX_FAU)
|
||||
return true;
|
||||
|
|
@ -128,42 +129,42 @@ valid_src(struct fau_state *fau, unsigned fau_page, bi_index src,
|
|||
return fau_state_buffer(fau, src);
|
||||
}
|
||||
|
||||
bool valid = (fau_page == va_fau_page(src.value));
|
||||
bool valid = (fau_page == va_fau_page(src.value, arch));
|
||||
valid &= fau_state_buffer(fau, src);
|
||||
|
||||
if (src.value & BIR_FAU_UNIFORM)
|
||||
valid &= fau_state_uniform(fau, src, op);
|
||||
else if (fau_is_special(src.value))
|
||||
valid &= fau_state_special(fau, src, op);
|
||||
valid &= fau_state_special(fau, src, op, arch);
|
||||
|
||||
return valid;
|
||||
}
|
||||
|
||||
bool
|
||||
va_validate_fau(bi_instr *I)
|
||||
va_validate_fau(bi_instr *I, unsigned arch)
|
||||
{
|
||||
bool valid = true;
|
||||
struct fau_state fau = {.uniform_slot = -1};
|
||||
unsigned fau_page = va_select_fau_page(I);
|
||||
unsigned fau_page = va_select_fau_page(I, arch);
|
||||
|
||||
bi_foreach_src(I, s) {
|
||||
valid &= valid_src(&fau, fau_page, I->src[s], I->op);
|
||||
valid &= valid_src(&fau, fau_page, I->src[s], I->op, arch);
|
||||
}
|
||||
|
||||
return valid;
|
||||
}
|
||||
|
||||
void
|
||||
va_repair_fau(bi_builder *b, bi_instr *I)
|
||||
va_repair_fau(bi_builder *b, bi_instr *I, unsigned arch)
|
||||
{
|
||||
struct fau_state fau = {.uniform_slot = -1};
|
||||
unsigned fau_page = va_select_fau_page(I);
|
||||
unsigned fau_page = va_select_fau_page(I, arch);
|
||||
|
||||
bi_foreach_src(I, s) {
|
||||
struct fau_state push = fau;
|
||||
bi_index src = I->src[s];
|
||||
|
||||
if (!valid_src(&fau, fau_page, src, I->op)) {
|
||||
if (!valid_src(&fau, fau_page, src, I->op, arch)) {
|
||||
bi_replace_src(I, s, bi_mov_i32(b, bi_strip_index(src)));
|
||||
|
||||
/* Rollback update. Since the replacement move doesn't affect FAU
|
||||
|
|
@ -180,7 +181,7 @@ va_validate(FILE *fp, bi_context *ctx)
|
|||
bool errors = false;
|
||||
|
||||
bi_foreach_instr_global(ctx, I) {
|
||||
if (!va_validate_fau(I)) {
|
||||
if (!va_validate_fau(I, ctx->arch)) {
|
||||
if (!errors) {
|
||||
fprintf(fp, "Validation failed, this is a bug. Shader:\n\n");
|
||||
bi_print_shader(ctx, fp);
|
||||
|
|
|
|||
|
|
@ -97,10 +97,10 @@ valhall_opcodes[BI_NUM_OPCODES] = {
|
|||
sr_control = 0
|
||||
|
||||
if len(op.staging) > 0:
|
||||
sr_control = op.staging[0].encoded_flags >> 6
|
||||
sr_control = op.staging[0].encoded_flags
|
||||
%>
|
||||
[BI_OPCODE_${name.replace('.', '_').upper()}] = {
|
||||
.exact = ${hex(exact(op))}ULL,
|
||||
.exact = ${hex(exact(op.opcode))}ULL,
|
||||
.srcs = {
|
||||
% for src in ([sr for sr in op.staging if sr.read] + op.srcs):
|
||||
{
|
||||
|
|
@ -141,12 +141,84 @@ valhall_opcodes[BI_NUM_OPCODES] = {
|
|||
% endif
|
||||
% endfor
|
||||
};
|
||||
|
||||
const struct va_opcode_info
|
||||
valhall_v15_opcodes[BI_NUM_OPCODES] = {
|
||||
% for op in instructions:
|
||||
% if op.name not in skip:
|
||||
<%
|
||||
name = op.name
|
||||
if name == 'BRANCHZ':
|
||||
name = 'BRANCHZ.i16'
|
||||
|
||||
sr_control = 0
|
||||
|
||||
if len(op.staging) > 0:
|
||||
sr_control = op.staging[0].encoded_flags
|
||||
%>
|
||||
[BI_OPCODE_${name.replace('.', '_').upper()}] = {
|
||||
.exact = ${hex(exact(op.opcode_v15))}ULL,
|
||||
.srcs = {
|
||||
% for src in ([sr for sr in op.staging if sr.read] + op.srcs):
|
||||
{
|
||||
.absneg = ${ibool(src.absneg)},
|
||||
.swizzle = ${ibool(src.swizzle)},
|
||||
.notted = ${ibool(src.notted)},
|
||||
.widen = ${ibool(src.widen)},
|
||||
.lanes = ${ibool(src.lanes)},
|
||||
.halfswizzle = ${ibool(src.halfswizzle)},
|
||||
.lane = ${ibool(src.lane)},
|
||||
.combine = ${ibool(src.combine)},
|
||||
% if src.size in [8, 16, 32, 64]:
|
||||
.size = VA_SIZE_${src.size},
|
||||
% endif
|
||||
},
|
||||
% endfor
|
||||
},
|
||||
.type_size = ${typesize(op.name)},
|
||||
.has_dest = ${ibool(len(op.dests) > 0)},
|
||||
.is_signed = ${ibool(op.is_signed)},
|
||||
.unit = VA_UNIT_${op.unit},
|
||||
.nr_srcs = ${len(op.srcs)},
|
||||
.nr_staging_srcs = ${sum([sr.read for sr in op.staging])},
|
||||
.nr_staging_dests = ${sum([sr.write for sr in op.staging])},
|
||||
.clamp = ${hasmod(x, 'clamp')},
|
||||
.saturate = ${hasmod(x, 'saturate')},
|
||||
.rhadd = ${hasmod(x, 'rhadd')},
|
||||
.round_mode = ${hasmod(x, 'round_mode')},
|
||||
.condition = ${hasmod(x, 'condition')},
|
||||
.result_type = ${hasmod(x, 'result_type')},
|
||||
.vecsize = ${hasmod(x, 'vector_size')},
|
||||
.register_format = ${hasmod(x, 'register_format')},
|
||||
.slot = ${hasmod(x, 'slot')},
|
||||
.sr_count = ${hasmod(x, 'staging_register_count')},
|
||||
.sr_write_count = ${hasmod(x, 'staging_register_write_count')},
|
||||
.sr_control = ${sr_control},
|
||||
},
|
||||
% endif
|
||||
% endfor
|
||||
};
|
||||
|
||||
const struct va_opcode_info
|
||||
get_valhall_opcode(enum bi_opcode op, unsigned arch)
|
||||
{
|
||||
assert(arch >= 9);
|
||||
if (arch < 15)
|
||||
return valhall_opcodes[op];
|
||||
else
|
||||
return valhall_v15_opcodes[op];
|
||||
}
|
||||
"""
|
||||
|
||||
# Exact value to be ORed in to every opcode
|
||||
def exact_op(op):
|
||||
def exact_op(opcode):
|
||||
exact_op = 0
|
||||
for subcode in op.opcode:
|
||||
|
||||
# Need an early return in case of removed instructions
|
||||
if not opcode:
|
||||
return exact_op
|
||||
|
||||
for subcode in opcode:
|
||||
exact_op |= (subcode.value << subcode.start)
|
||||
return exact_op
|
||||
|
||||
|
|
|
|||
|
|
@ -89,7 +89,8 @@ struct va_opcode_info {
|
|||
unsigned sr_control : 2;
|
||||
};
|
||||
|
||||
extern const struct va_opcode_info valhall_opcodes[BI_NUM_OPCODES];
|
||||
const struct va_opcode_info get_valhall_opcode(enum bi_opcode op,
|
||||
unsigned arch);
|
||||
|
||||
/* Bifrost specifies the source of bitwise operations as (A, B, shift), but
|
||||
* Valhall specifies (A, shift, B). We follow Bifrost conventions in the
|
||||
|
|
@ -130,10 +131,10 @@ va_swap_12(enum bi_opcode op)
|
|||
}
|
||||
|
||||
static inline struct va_src_info
|
||||
va_src_info(enum bi_opcode op, unsigned src)
|
||||
va_src_info(enum bi_opcode op, unsigned src, unsigned arch)
|
||||
{
|
||||
unsigned idx = (va_swap_12(op) && (src == 1 || src == 2)) ? (3 - src) : src;
|
||||
return valhall_opcodes[op].srcs[idx];
|
||||
return get_valhall_opcode(op, arch).srcs[idx];
|
||||
}
|
||||
|
||||
static inline bool
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ import sys
|
|||
instructions = []
|
||||
|
||||
MODIFIERS = {}
|
||||
MODIFIERS_V15 = {}
|
||||
enums = {}
|
||||
immediates = []
|
||||
|
||||
|
|
@ -102,6 +103,11 @@ class Source:
|
|||
self.offset['value'] = self.start
|
||||
self.mask['value'] = bitmask(6)
|
||||
|
||||
self.offset['high1_v15'] = (index + 48)
|
||||
self.mask['high1_v15'] = bitmask(1)
|
||||
self.offset['low8_v15'] = self.start
|
||||
self.mask['low8_v15'] = bitmask(8)
|
||||
|
||||
if absneg:
|
||||
self.offset['neg'] = 32 + 2 + ((2 - index) * 2)
|
||||
self.offset['abs'] = 33 + 2 + ((2 - index) * 2)
|
||||
|
|
@ -137,6 +143,11 @@ class Dest:
|
|||
self.offset['value'] = self.start
|
||||
self.mask['value'] = bitmask(6)
|
||||
|
||||
self.offset['mode_v15'] = self.start + 13
|
||||
self.mask['mode_v15'] = bitmask(2)
|
||||
self.offset['value_v15'] = self.start
|
||||
self.mask['value_v15'] = bitmask(8)
|
||||
|
||||
class Staging:
|
||||
def __init__(self, read = False, write = False, count = 0, flags = 'true', name = ""):
|
||||
self.name = name
|
||||
|
|
@ -152,6 +163,14 @@ class Staging:
|
|||
|
||||
self.offset['value'] = self.start
|
||||
self.mask['value'] = bitmask(6)
|
||||
self.offset['flags'] = self.start + 6
|
||||
self.mask['flags'] = bitmask(2)
|
||||
|
||||
self.offset['value_v15'] = self.start
|
||||
self.mask['value_v15'] = bitmask(8)
|
||||
self.offset['flags_v15'] = 38
|
||||
self.mask['flags_v15'] = bitmask(2)
|
||||
|
||||
|
||||
# For compatibility
|
||||
self.absneg = False
|
||||
|
|
@ -166,11 +185,14 @@ class Staging:
|
|||
|
||||
if not self.flags:
|
||||
self.encoded_flags = 0
|
||||
self.encoded_flags_v15 = 0
|
||||
elif flags == 'rw':
|
||||
self.encoded_flags = 0xc0
|
||||
self.encoded_flags = 0b11
|
||||
self.encoded_flags_v15 = 0b11
|
||||
else:
|
||||
assert(flags == 'true')
|
||||
self.encoded_flags = (0x80 if write else 0) | (0x40 if read else 0)
|
||||
self.encoded_flags = (0b10 if write else 0) | (0b01 if read else 0)
|
||||
self.encoded_flags_v15 = (0b10 if read else 0) | (0b01 if read and write else 0)
|
||||
|
||||
class Immediate:
|
||||
def __init__(self, name, start, size, signed):
|
||||
|
|
@ -186,13 +208,16 @@ class Opcode:
|
|||
self.mask = mask
|
||||
|
||||
class Instruction:
|
||||
def __init__(self, name, opcode, srcs = [], dests = [], immediates = [], modifiers = [], staging = None, unit = None):
|
||||
def __init__(self, name, opcode, opcode_v15, srcs = [], dests = [], immediates = [], immediates_v15 = [], modifiers = [], modifiers_v15 = [], staging = None, unit = None):
|
||||
self.name = name
|
||||
self.srcs = srcs
|
||||
self.dests = dests
|
||||
self.opcode = opcode
|
||||
self.opcode_v15 = opcode_v15
|
||||
self.immediates = immediates
|
||||
self.immediates_v15 = immediates_v15
|
||||
self.modifiers = modifiers
|
||||
self.modifiers_v15 = modifiers_v15
|
||||
self.staging = staging
|
||||
self.unit = unit
|
||||
self.is_signed = len(name.split(".")) > 1 and ('s' in name.split(".")[1])
|
||||
|
|
@ -205,6 +230,11 @@ class Instruction:
|
|||
self.offset['fau_page'] = 57
|
||||
self.mask['fau_page'] = bitmask(2)
|
||||
|
||||
self.offset['flow_v15'] = 58
|
||||
self.mask['flow_v15'] = bitmask(4)
|
||||
self.offset['fau_page_v15'] = 62
|
||||
self.mask['fau_page_v15'] = bitmask(2)
|
||||
|
||||
# Message-passing instruction <===> not ALU instruction
|
||||
self.message = unit not in ["FMA", "CVT", "SFU"]
|
||||
|
||||
|
|
@ -273,6 +303,7 @@ def build_instr(el, overrides = {}):
|
|||
# Get overridables
|
||||
name = overrides.get('name') or el.attrib.get('name')
|
||||
opcode = overrides.get('opcode') or build_opcode(el, 'opcode')
|
||||
opcode_v15 = overrides.get('opcode_v15') or build_opcode(el, 'opcode_v15')
|
||||
unit = overrides.get('unit') or el.attrib.get('unit')
|
||||
|
||||
# Get explicit sources/dests
|
||||
|
|
@ -304,15 +335,25 @@ def build_instr(el, overrides = {}):
|
|||
|
||||
# Get immediates
|
||||
imms = [build_imm(imm) for imm in el.findall('imm')]
|
||||
imms_v15 = [build_imm(imm) for imm in el.findall('imm_v15_override')]
|
||||
for imm in imms:
|
||||
if imm.name not in {imm.name for imm in imms_v15}:
|
||||
imms_v15.append(imm)
|
||||
|
||||
modifiers = []
|
||||
modifiers_v15 = []
|
||||
for mod in el:
|
||||
if (mod.tag in MODIFIERS) and not (mod.attrib.get('pseudo', False)):
|
||||
modifiers.append(MODIFIERS[mod.tag])
|
||||
modifiers_v15.append(MODIFIERS_V15[mod.tag])
|
||||
elif mod.tag =='va_mod':
|
||||
modifiers.append(build_modifier(mod))
|
||||
elif mod.tag =='va_mod_v15':
|
||||
modifiers_v15.append(build_modifier(mod))
|
||||
|
||||
instr = Instruction(name, opcode, srcs = sources, dests = dests, immediates = imms, modifiers = modifiers, staging = staging, unit = unit)
|
||||
|
||||
instr = Instruction(name, opcode, opcode_v15, srcs = sources, dests = dests, immediates = imms, immediates_v15 = imms_v15,
|
||||
modifiers = modifiers, modifiers_v15 = modifiers_v15, staging = staging, unit = unit)
|
||||
|
||||
instructions.append(instr)
|
||||
|
||||
|
|
@ -323,6 +364,7 @@ def build_group(el):
|
|||
build_instr(el, overrides = {
|
||||
'name': ins.attrib['name'],
|
||||
'opcode': build_opcode(ins, 'opcode'),
|
||||
'opcode_v15': build_opcode(ins, 'opcode_v15'),
|
||||
'unit': ins.attrib.get('unit'),
|
||||
})
|
||||
|
||||
|
|
@ -377,6 +419,7 @@ def typesize(name):
|
|||
# Parse the ISA
|
||||
def valhall_parse_isa(xmlfile):
|
||||
global MODIFIERS
|
||||
global MODIFIERS_V15
|
||||
global enums
|
||||
global immediates
|
||||
global root
|
||||
|
|
@ -404,7 +447,6 @@ def valhall_parse_isa(xmlfile):
|
|||
"lod_bias_disable": Modifier("lod_mode", 13, 1),
|
||||
"lod_clamp_disable": Modifier("lod_mode", 14, 1),
|
||||
"write_mask": Modifier("write_mask", 22, 4),
|
||||
"register_type": Modifier("register_type", 26, 2),
|
||||
"dimension": Modifier("dimension", 28, 2),
|
||||
"skip": Flag("skip", 39),
|
||||
"register_width": Modifier("register_width", 46, 1, force_enum = "register_width"),
|
||||
|
|
@ -438,6 +480,52 @@ def valhall_parse_isa(xmlfile):
|
|||
"sample": Modifier("sample_mode", 38, 2),
|
||||
}
|
||||
|
||||
MODIFIERS_V15 = {
|
||||
# Texture instructions share a common encoding
|
||||
"wide_indices": Flag("wide_indices", 8),
|
||||
"array_enable": Flag("array_enable", 10),
|
||||
"texel_offset": Flag("texel_offset", 11),
|
||||
"shadow": Flag("shadow", 12),
|
||||
"integer_coordinates": Flag("integer_coordinates", 13),
|
||||
"fetch_component": Modifier("fetch_component", 14, 2),
|
||||
"lod_mode": Modifier("lod_mode", 13, 3),
|
||||
"lod_bias_disable": Modifier("lod_mode", 13, 1),
|
||||
"lod_clamp_disable": Modifier("lod_mode", 14, 1),
|
||||
"write_mask": Modifier("write_mask", 24, 4),
|
||||
"dimension": Modifier("dimension", 28, 2),
|
||||
"skip": Flag("skip", 39),
|
||||
"register_width": Modifier("register_width", 38, 1, force_enum = "register_width"),
|
||||
"secondary_register_width": Modifier("secondary_register_width", 54, 1, force_enum = "register_width"),
|
||||
"vartex_register_width": Modifier("varying_texture_register_width", 24, 2),
|
||||
|
||||
"atom_opc": Modifier("atomic_operation", 24, 4),
|
||||
"atom_opc_1": Modifier("atomic_operation_with_1", 24, 3),
|
||||
"inactive_result": Modifier("inactive_result", 22, 4),
|
||||
"memory_access": Modifier("memory_access", 24, 2),
|
||||
"regfmt": Modifier("register_format", 24, 3),
|
||||
"source_format": Modifier("source_format", 24, 2),
|
||||
"vecsize": Modifier("vector_size", 28, 2),
|
||||
|
||||
"slot": Modifier("slot_v15", 30, 2),
|
||||
"roundmode": Modifier("round_mode", 32, 2),
|
||||
"result_type": Modifier("result_type", 24, 2),
|
||||
"saturate": Flag("saturate", 25),
|
||||
"not_result": Flag("not_result", 34),
|
||||
|
||||
"lane_op": Modifier("lane_operation", 32, 4),
|
||||
"cmp": Modifier("condition", 33, 3),
|
||||
"clamp": Modifier("clamp", 30, 2),
|
||||
"sr_count": Modifier("staging_register_count", 32, 3, implied = True),
|
||||
"sample_and_update": Modifier("sample_and_update_mode", 32, 3),
|
||||
"sr_write_count": Modifier("staging_register_write_count", 35, 3, implied = True),
|
||||
|
||||
"conservative": Flag("conservative", 35),
|
||||
"subgroup": Modifier("subgroup_size", 36, 4),
|
||||
"update": Modifier("update_mode", 35, 2),
|
||||
"sample": Modifier("sample_mode", 37, 2),
|
||||
}
|
||||
|
||||
|
||||
for child in root:
|
||||
if child.tag == 'group':
|
||||
build_group(child)
|
||||
|
|
|
|||
|
|
@ -52,8 +52,10 @@ pan_get_nir_shader_compiler_options(unsigned arch, bool merge_wg)
|
|||
case 11:
|
||||
case 12:
|
||||
case 13:
|
||||
return merge_wg ? &bifrost_nir_options_v11_merge_wg :
|
||||
&bifrost_nir_options_v11;
|
||||
case 14:
|
||||
case 15:
|
||||
return merge_wg ? &bifrost_nir_options_v11_merge_wg
|
||||
: &bifrost_nir_options_v11;
|
||||
default:
|
||||
assert(!"Unsupported arch");
|
||||
return NULL;
|
||||
|
|
@ -285,7 +287,8 @@ pan_disassemble(FILE *fp, const void *code, size_t size, uint64_t gpu_id,
|
|||
bool verbose)
|
||||
{
|
||||
if (pan_arch(gpu_id) >= 9)
|
||||
disassemble_valhall(fp, (const uint64_t *)code, size, verbose);
|
||||
disassemble_valhall(fp, (const uint64_t *)code, size, pan_arch(gpu_id),
|
||||
verbose);
|
||||
else if (pan_arch(gpu_id) >= 6)
|
||||
disassemble_bifrost(fp, code, size, verbose);
|
||||
else
|
||||
|
|
|
|||
|
|
@ -824,7 +824,11 @@ cs_instr_is_asynchronous(enum mali_cs_opcode opcode, uint16_t wait_mask)
|
|||
case MALI_CS_OPCODE_STORE_MULTIPLE:
|
||||
case MALI_CS_OPCODE_RUN_COMPUTE:
|
||||
case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT:
|
||||
#if PAN_ARCH >= 14
|
||||
case MALI_CS_OPCODE_RUN_FRAGMENT2:
|
||||
#else
|
||||
case MALI_CS_OPCODE_RUN_FRAGMENT:
|
||||
#endif
|
||||
case MALI_CS_OPCODE_RUN_FULLSCREEN:
|
||||
#if PAN_ARCH >= 12
|
||||
case MALI_CS_OPCODE_RUN_IDVS2:
|
||||
|
|
@ -1614,6 +1618,22 @@ cs_run_idvs(struct cs_builder *b, uint32_t flags_override, bool malloc_enable,
|
|||
}
|
||||
#endif
|
||||
|
||||
#if PAN_ARCH >= 14
|
||||
static inline void
|
||||
cs_run_fragment2(struct cs_builder *b, bool enable_tem,
|
||||
enum mali_tile_render_order tile_order)
|
||||
{
|
||||
/* Staging regs */
|
||||
cs_flush_loads(b);
|
||||
|
||||
b->req_resource_mask |= CS_FRAG_RES;
|
||||
|
||||
cs_emit(b, RUN_FRAGMENT2, I) {
|
||||
I.enable_tem = enable_tem;
|
||||
I.tile_order = tile_order;
|
||||
}
|
||||
}
|
||||
#else
|
||||
static inline void
|
||||
cs_run_fragment(struct cs_builder *b, bool enable_tem,
|
||||
enum mali_tile_render_order tile_order)
|
||||
|
|
@ -1628,6 +1648,7 @@ cs_run_fragment(struct cs_builder *b, bool enable_tem,
|
|||
I.tile_order = tile_order;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline void
|
||||
cs_run_fullscreen(struct cs_builder *b, uint32_t flags_override,
|
||||
|
|
@ -2469,6 +2490,53 @@ cs_trace_preamble(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
|
|||
(int16_t)(offsetof(struct cs_##__type##_trace, __field) - \
|
||||
sizeof(struct cs_##__type##_trace))
|
||||
|
||||
#if PAN_ARCH >= 14
|
||||
#define CS_RUN_FRAGMENT2_SR_COUNT 56
|
||||
#define CS_RUN_FRAGMENT2_SR_MASK BITFIELD64_RANGE(0, CS_RUN_FRAGMENT2_SR_COUNT)
|
||||
struct cs_run_fragment2_trace {
|
||||
uint64_t ip;
|
||||
uint32_t sr[CS_RUN_FRAGMENT2_SR_COUNT];
|
||||
} __attribute__((aligned(64)));
|
||||
|
||||
static inline void
|
||||
cs_trace_run_fragment2(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
|
||||
struct cs_index scratch_regs, bool enable_tem,
|
||||
enum mali_tile_render_order tile_order)
|
||||
{
|
||||
if (likely(!ctx->enabled)) {
|
||||
cs_run_fragment2(b, enable_tem, tile_order);
|
||||
return;
|
||||
}
|
||||
|
||||
struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg);
|
||||
struct cs_index data = cs_reg64(b, scratch_regs.reg + 2);
|
||||
|
||||
cs_trace_preamble(b, ctx, scratch_regs,
|
||||
sizeof(struct cs_run_fragment2_trace));
|
||||
|
||||
/* cs_run_xx() must immediately follow cs_load_ip_to() otherwise the IP
|
||||
* won't point to the right instruction. */
|
||||
cs_load_ip_to(b, data);
|
||||
cs_run_fragment2(b, enable_tem, tile_order);
|
||||
cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_fragment2, ip));
|
||||
|
||||
ASSERTED unsigned sr_count = 0;
|
||||
unsigned sr_offset = cs_trace_field_offset(run_fragment2, sr);
|
||||
for (unsigned i = 0; i < CS_RUN_FRAGMENT2_SR_COUNT; i += 16) {
|
||||
unsigned mask = (CS_RUN_FRAGMENT2_SR_MASK >> i) & BITFIELD_MASK(16);
|
||||
if (!mask)
|
||||
continue;
|
||||
|
||||
cs_store(b, cs_reg_tuple(b, i, util_last_bit(mask)), tracebuf_addr, mask,
|
||||
sr_offset);
|
||||
sr_offset += util_bitcount(mask) * sizeof(uint32_t);
|
||||
sr_count += util_bitcount(mask);
|
||||
}
|
||||
assert(sr_count == CS_RUN_FRAGMENT2_SR_COUNT);
|
||||
|
||||
cs_flush_stores(b);
|
||||
}
|
||||
#else
|
||||
struct cs_run_fragment_trace {
|
||||
uint64_t ip;
|
||||
uint32_t sr[7];
|
||||
|
|
@ -2500,6 +2568,7 @@ cs_trace_run_fragment(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
|
|||
cs_trace_field_offset(run_fragment, sr));
|
||||
cs_flush_stores(b);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if PAN_ARCH >= 13
|
||||
#define CS_RUN_FULLSCREEN_SR_MASK \
|
||||
|
|
|
|||
|
|
@ -152,22 +152,22 @@ pandecode_rt(struct pandecode_context *ctx, unsigned index, uint64_t gpu_va)
|
|||
|
||||
}
|
||||
|
||||
static void
|
||||
pandecode_rts(struct pandecode_context *ctx, uint64_t gpu_va,
|
||||
const struct MALI_FRAMEBUFFER_PARAMETERS *fb)
|
||||
void
|
||||
GENX(pandecode_rts)(struct pandecode_context *ctx, uint64_t gpu_va,
|
||||
uint32_t render_target_count)
|
||||
{
|
||||
pandecode_log(ctx, "Color Render Targets @%" PRIx64 ":\n", gpu_va);
|
||||
ctx->indent++;
|
||||
|
||||
for (int i = 0; i < (fb->render_target_count); i++)
|
||||
for (int i = 0; i < render_target_count; i++)
|
||||
pandecode_rt(ctx, i, gpu_va);
|
||||
|
||||
ctx->indent--;
|
||||
pandecode_log(ctx, "\n");
|
||||
}
|
||||
|
||||
static void
|
||||
pandecode_zs_crc_ext(struct pandecode_context *ctx, uint64_t gpu_va)
|
||||
void
|
||||
GENX(pandecode_zs_crc_ext)(struct pandecode_context *ctx, uint64_t gpu_va)
|
||||
{
|
||||
const struct mali_zs_crc_extension_packed *PANDECODE_PTR_VAR(
|
||||
ctx, zs_crc_packed, (uint64_t)gpu_va);
|
||||
|
|
@ -223,22 +223,65 @@ pandecode_zs_crc_ext(struct pandecode_context *ctx, uint64_t gpu_va)
|
|||
|
||||
|
||||
#if PAN_ARCH >= 6
|
||||
static void
|
||||
pandecode_sample_locations(struct pandecode_context *ctx, const void *fb)
|
||||
void
|
||||
GENX(pandecode_frame_shader_dcds)(struct pandecode_context *ctx,
|
||||
uint64_t dcd_pointer, unsigned pre_frame_0,
|
||||
unsigned pre_frame_1, unsigned post_frame,
|
||||
unsigned job_type_param, uint64_t gpu_id)
|
||||
{
|
||||
pan_section_unpack(fb, FRAMEBUFFER, PARAMETERS, params);
|
||||
const unsigned dcd_size = pan_size(DRAW);
|
||||
|
||||
const uint16_t *PANDECODE_PTR_VAR(ctx, samples, params.sample_locations);
|
||||
if (pre_frame_0 != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) {
|
||||
const struct mali_draw_packed *PANDECODE_PTR_VAR(
|
||||
ctx, dcd, dcd_pointer + (0 * dcd_size));
|
||||
pan_unpack(dcd, DRAW, draw)
|
||||
;
|
||||
pandecode_log(ctx, "Pre frame 0 @%" PRIx64 " (mode=%d):\n", dcd_pointer,
|
||||
pre_frame_0);
|
||||
ctx->indent++;
|
||||
GENX(pandecode_dcd)(ctx, &draw, job_type_param, gpu_id);
|
||||
ctx->indent--;
|
||||
}
|
||||
|
||||
pandecode_log(ctx, "Sample locations @%" PRIx64 ":\n",
|
||||
params.sample_locations);
|
||||
if (pre_frame_1 != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) {
|
||||
const struct mali_draw_packed *PANDECODE_PTR_VAR(
|
||||
ctx, dcd, dcd_pointer + (1 * dcd_size));
|
||||
pan_unpack(dcd, DRAW, draw)
|
||||
;
|
||||
pandecode_log(ctx, "Pre frame 1 @%" PRIx64 ":\n",
|
||||
dcd_pointer + (1 * dcd_size));
|
||||
ctx->indent++;
|
||||
GENX(pandecode_dcd)(ctx, &draw, job_type_param, gpu_id);
|
||||
ctx->indent--;
|
||||
}
|
||||
|
||||
if (post_frame != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) {
|
||||
const struct mali_draw_packed *PANDECODE_PTR_VAR(
|
||||
ctx, dcd, dcd_pointer + (2 * dcd_size));
|
||||
pan_unpack(dcd, DRAW, draw)
|
||||
;
|
||||
pandecode_log(ctx, "Post frame:\n");
|
||||
ctx->indent++;
|
||||
GENX(pandecode_dcd)(ctx, &draw, job_type_param, gpu_id);
|
||||
ctx->indent--;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
GENX(pandecode_sample_locations)(struct pandecode_context *ctx,
|
||||
uint64_t sample_locations)
|
||||
{
|
||||
const uint16_t *PANDECODE_PTR_VAR(ctx, samples, sample_locations);
|
||||
|
||||
pandecode_log(ctx, "Sample locations @%" PRIx64 ":\n", sample_locations);
|
||||
for (int i = 0; i < 33; i++) {
|
||||
pandecode_log(ctx, " (%d, %d),\n", samples[2 * i] - 128,
|
||||
samples[2 * i + 1] - 128);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif /* PAN_ARCH >= 6 */
|
||||
|
||||
#if PAN_ARCH < 14
|
||||
struct pandecode_fbd
|
||||
GENX(pandecode_fbd)(struct pandecode_context *ctx, uint64_t gpu_va,
|
||||
bool is_fragment, uint64_t gpu_id)
|
||||
|
|
@ -248,46 +291,17 @@ GENX(pandecode_fbd)(struct pandecode_context *ctx, uint64_t gpu_va,
|
|||
DUMP_UNPACKED(ctx, FRAMEBUFFER_PARAMETERS, params, "Parameters:\n");
|
||||
|
||||
#if PAN_ARCH >= 6
|
||||
pandecode_sample_locations(ctx, fb);
|
||||
GENX(pandecode_sample_locations)(ctx, params.sample_locations);
|
||||
|
||||
unsigned dcd_size = pan_size(DRAW);
|
||||
unsigned job_type_param = 0;
|
||||
|
||||
#if PAN_ARCH <= 9
|
||||
job_type_param = MALI_JOB_TYPE_FRAGMENT;
|
||||
#endif
|
||||
|
||||
if (params.pre_frame_0 != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) {
|
||||
const struct mali_draw_packed *PANDECODE_PTR_VAR(
|
||||
ctx, dcd, params.frame_shader_dcds + (0 * dcd_size));
|
||||
pan_unpack(dcd, DRAW, draw);
|
||||
pandecode_log(ctx, "Pre frame 0 @%" PRIx64 " (mode=%d):\n",
|
||||
params.frame_shader_dcds, params.pre_frame_0);
|
||||
ctx->indent++;
|
||||
GENX(pandecode_dcd)(ctx, &draw, job_type_param, gpu_id);
|
||||
ctx->indent--;
|
||||
}
|
||||
|
||||
if (params.pre_frame_1 != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) {
|
||||
const struct mali_draw_packed *PANDECODE_PTR_VAR(
|
||||
ctx, dcd, params.frame_shader_dcds + (1 * dcd_size));
|
||||
pan_unpack(dcd, DRAW, draw);
|
||||
pandecode_log(ctx, "Pre frame 1 @%" PRIx64 ":\n",
|
||||
params.frame_shader_dcds + (1 * dcd_size));
|
||||
ctx->indent++;
|
||||
GENX(pandecode_dcd)(ctx, &draw, job_type_param, gpu_id);
|
||||
ctx->indent--;
|
||||
}
|
||||
|
||||
if (params.post_frame != MALI_PRE_POST_FRAME_SHADER_MODE_NEVER) {
|
||||
const struct mali_draw_packed *PANDECODE_PTR_VAR(
|
||||
ctx, dcd, params.frame_shader_dcds + (2 * dcd_size));
|
||||
pan_unpack(dcd, DRAW, draw);
|
||||
pandecode_log(ctx, "Post frame:\n");
|
||||
ctx->indent++;
|
||||
GENX(pandecode_dcd)(ctx, &draw, job_type_param, gpu_id);
|
||||
ctx->indent--;
|
||||
}
|
||||
GENX(pandecode_frame_shader_dcds)
|
||||
(ctx, params.frame_shader_dcds, params.pre_frame_0, params.pre_frame_1,
|
||||
params.post_frame, job_type_param, gpu_id);
|
||||
#else
|
||||
DUMP_SECTION(ctx, FRAMEBUFFER, LOCAL_STORAGE, fb, "Local Storage:\n");
|
||||
|
||||
|
|
@ -312,13 +326,13 @@ GENX(pandecode_fbd)(struct pandecode_context *ctx, uint64_t gpu_va,
|
|||
gpu_va += pan_size(FRAMEBUFFER);
|
||||
|
||||
if (params.has_zs_crc_extension) {
|
||||
pandecode_zs_crc_ext(ctx, gpu_va);
|
||||
GENX(pandecode_zs_crc_ext)(ctx, gpu_va);
|
||||
|
||||
gpu_va += pan_size(ZS_CRC_EXTENSION);
|
||||
}
|
||||
|
||||
if (is_fragment)
|
||||
pandecode_rts(ctx, gpu_va, ¶ms);
|
||||
GENX(pandecode_rts)(ctx, gpu_va, params.render_target_count);
|
||||
|
||||
return (struct pandecode_fbd){
|
||||
.rt_count = params.render_target_count,
|
||||
|
|
@ -336,6 +350,7 @@ GENX(pandecode_fbd)(struct pandecode_context *ctx, uint64_t gpu_va,
|
|||
};
|
||||
#endif
|
||||
}
|
||||
#endif /* PAN_ARCH < 14 */
|
||||
|
||||
#if PAN_ARCH >= 5
|
||||
uint64_t
|
||||
|
|
|
|||
|
|
@ -132,6 +132,20 @@ void pandecode_cs_binary_v13(struct pandecode_context *ctx, uint64_t bin,
|
|||
void pandecode_cs_trace_v13(struct pandecode_context *ctx, uint64_t trace,
|
||||
uint32_t trace_size, uint64_t gpu_id);
|
||||
|
||||
void pandecode_interpret_cs_v14(struct pandecode_context *ctx, uint64_t queue,
|
||||
uint32_t size, uint64_t gpu_id, uint32_t *regs);
|
||||
void pandecode_cs_binary_v14(struct pandecode_context *ctx, uint64_t bin,
|
||||
uint32_t bin_size);
|
||||
void pandecode_cs_trace_v14(struct pandecode_context *ctx, uint64_t trace,
|
||||
uint32_t trace_size, uint64_t gpu_id);
|
||||
|
||||
void pandecode_interpret_cs_v15(struct pandecode_context *ctx, uint64_t queue,
|
||||
uint32_t size, uint64_t gpu_id, uint32_t *regs);
|
||||
void pandecode_cs_binary_v15(struct pandecode_context *ctx, uint64_t bin,
|
||||
uint32_t bin_size);
|
||||
void pandecode_cs_trace_v15(struct pandecode_context *ctx, uint64_t trace,
|
||||
uint32_t trace_size, uint64_t gpu_id);
|
||||
|
||||
/* Logging infrastructure */
|
||||
static void
|
||||
pandecode_make_indent(struct pandecode_context *ctx)
|
||||
|
|
@ -275,4 +289,22 @@ void GENX(pandecode_depth_stencil)(struct pandecode_context *ctx,
|
|||
|
||||
#endif
|
||||
|
||||
#if PAN_ARCH >= 6
|
||||
void GENX(pandecode_sample_locations)(struct pandecode_context *ctx,
|
||||
uint64_t sample_locations);
|
||||
|
||||
void
|
||||
GENX(pandecode_frame_shader_dcds)(struct pandecode_context *ctx,
|
||||
uint64_t dcd_pointer, unsigned pre_frame_0,
|
||||
unsigned pre_frame_1, unsigned post_frame,
|
||||
unsigned job_type_param, uint64_t gpu_id);
|
||||
#endif
|
||||
|
||||
#if PAN_ARCH >= 5
|
||||
void GENX(pandecode_rts)(struct pandecode_context *ctx, uint64_t gpu_va,
|
||||
uint32_t render_target_count);
|
||||
|
||||
void GENX(pandecode_zs_crc_ext)(struct pandecode_context *ctx, uint64_t gpu_va);
|
||||
#endif
|
||||
|
||||
#endif /* __MMAP_TRACE_H__ */
|
||||
|
|
|
|||
|
|
@ -423,6 +423,12 @@ pandecode_interpret_cs(struct pandecode_context *ctx, uint64_t queue_gpu_va,
|
|||
case 13:
|
||||
pandecode_interpret_cs_v13(ctx, queue_gpu_va, size, gpu_id, regs);
|
||||
break;
|
||||
case 14:
|
||||
pandecode_interpret_cs_v14(ctx, queue_gpu_va, size, gpu_id, regs);
|
||||
break;
|
||||
case 15:
|
||||
pandecode_interpret_cs_v15(ctx, queue_gpu_va, size, gpu_id, regs);
|
||||
break;
|
||||
default:
|
||||
UNREACHABLE("Unsupported architecture");
|
||||
}
|
||||
|
|
@ -446,6 +452,12 @@ pandecode_cs_binary(struct pandecode_context *ctx, uint64_t bin_gpu_va,
|
|||
case 13:
|
||||
pandecode_cs_binary_v13(ctx, bin_gpu_va, size);
|
||||
break;
|
||||
case 14:
|
||||
pandecode_cs_binary_v14(ctx, bin_gpu_va, size);
|
||||
break;
|
||||
case 15:
|
||||
pandecode_cs_binary_v15(ctx, bin_gpu_va, size);
|
||||
break;
|
||||
default:
|
||||
UNREACHABLE("Unsupported architecture");
|
||||
}
|
||||
|
|
@ -469,6 +481,12 @@ pandecode_cs_trace(struct pandecode_context *ctx, uint64_t trace_gpu_va,
|
|||
case 13:
|
||||
pandecode_cs_trace_v13(ctx, trace_gpu_va, size, gpu_id);
|
||||
break;
|
||||
case 14:
|
||||
pandecode_cs_trace_v14(ctx, trace_gpu_va, size, gpu_id);
|
||||
break;
|
||||
case 15:
|
||||
pandecode_cs_trace_v15(ctx, trace_gpu_va, size, gpu_id);
|
||||
break;
|
||||
default:
|
||||
UNREACHABLE("Unsupported architecture");
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
/*
|
||||
* Copyright (C) 2022-2023 Collabora, Ltd.
|
||||
* Copyright (C) 2026 Arm Ltd.
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
|
|
@ -117,8 +118,7 @@ print_cs_instr(FILE *fp, const uint64_t *instr)
|
|||
|
||||
case MALI_CS_OPCODE_WAIT: {
|
||||
cs_unpack(instr, CS_WAIT, I);
|
||||
fprintf(fp, "WAIT%s #%x", I.progress_increment ? ".progress_inc" : "",
|
||||
I.wait_mask);
|
||||
fprintf(fp, "WAIT #%x", I.wait_mask);
|
||||
break;
|
||||
}
|
||||
|
||||
|
|
@ -130,15 +130,13 @@ print_cs_instr(FILE *fp, const uint64_t *instr)
|
|||
* since we'll print them implicitly later.
|
||||
*/
|
||||
#if PAN_ARCH >= 12
|
||||
fprintf(fp, "RUN_COMPUTE%s.%s.srt%d.spd%d.tsd%d.fau%d #%u, #%u",
|
||||
I.progress_increment ? ".progress_inc" : "", axes[I.task_axis],
|
||||
I.srt_select, I.spd_select, I.tsd_select, I.fau_select,
|
||||
I.task_increment, I.ep_limit);
|
||||
fprintf(fp, "RUN_COMPUTE.%s.srt%d.spd%d.tsd%d.fau%d #%u, #%u",
|
||||
axes[I.task_axis], I.srt_select, I.spd_select, I.tsd_select,
|
||||
I.fau_select, I.task_increment, I.ep_limit);
|
||||
#else
|
||||
fprintf(fp, "RUN_COMPUTE%s.%s.srt%d.spd%d.tsd%d.fau%d #%u",
|
||||
I.progress_increment ? ".progress_inc" : "", axes[I.task_axis],
|
||||
I.srt_select, I.spd_select, I.tsd_select, I.fau_select,
|
||||
I.task_increment);
|
||||
fprintf(fp, "RUN_COMPUTE.%s.srt%d.spd%d.tsd%d.fau%d #%u",
|
||||
axes[I.task_axis], I.srt_select, I.spd_select, I.tsd_select,
|
||||
I.fau_select, I.task_increment);
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
|
|
@ -146,8 +144,7 @@ print_cs_instr(FILE *fp, const uint64_t *instr)
|
|||
#if PAN_ARCH == 10
|
||||
case MALI_CS_OPCODE_RUN_TILING: {
|
||||
cs_unpack(instr, CS_RUN_TILING, I);
|
||||
fprintf(fp, "RUN_TILING%s.srt%d.spd%d.tsd%d.fau%d",
|
||||
I.progress_increment ? ".progress_inc" : "", I.srt_select,
|
||||
fprintf(fp, "RUN_TILING.srt%d.spd%d.tsd%d.fau%d", I.srt_select,
|
||||
I.spd_select, I.tsd_select, I.fau_select);
|
||||
break;
|
||||
}
|
||||
|
|
@ -158,8 +155,7 @@ print_cs_instr(FILE *fp, const uint64_t *instr)
|
|||
cs_unpack(instr, CS_RUN_IDVS, I);
|
||||
fprintf(
|
||||
fp,
|
||||
"RUN_IDVS%s%s%s.varying_srt%d.varying_fau%d.varying_tsd%d.frag_srt%d.frag_tsd%d r%u, #%" PRIx64,
|
||||
I.progress_increment ? ".progress_inc" : "",
|
||||
"RUN_IDVS%s%s.varying_srt%d.varying_fau%d.varying_tsd%d.frag_srt%d.frag_tsd%d r%u, #%" PRIx64,
|
||||
I.malloc_enable ? "" : ".no_malloc",
|
||||
I.draw_id_register_enable ? ".draw_id_enable" : "",
|
||||
I.varying_srt_select, I.varying_fau_select, I.varying_tsd_select,
|
||||
|
|
@ -178,8 +174,7 @@ print_cs_instr(FILE *fp, const uint64_t *instr)
|
|||
".INVALID",
|
||||
};
|
||||
|
||||
fprintf(fp, "RUN_IDVS2%s%s%s%s r%u, #%" PRIx64,
|
||||
I.progress_increment ? ".progress_inc" : "",
|
||||
fprintf(fp, "RUN_IDVS2%s%s%s r%u, #%" PRIx64,
|
||||
I.malloc_enable ? "" : ".no_malloc",
|
||||
I.draw_id_register_enable ? ".draw_id_enable" : "",
|
||||
vertex_shading_str[I.vertex_shading_mode], I.draw_id,
|
||||
|
|
@ -318,31 +313,36 @@ print_cs_instr(FILE *fp, const uint64_t *instr)
|
|||
case MALI_CS_OPCODE_SHARED_SB_INC: {
|
||||
cs_unpack(instr, CS_SHARED_SB_INC, I);
|
||||
|
||||
const char *progress_increment_name[] = {
|
||||
".no_increment",
|
||||
".increment",
|
||||
};
|
||||
|
||||
fprintf(fp, "SHARED_SB_INC%s%s #%u, #%u",
|
||||
progress_increment_name[I.progress_increment],
|
||||
defer_mode_str(I), I.sb_mask, I.shared_entry);
|
||||
fprintf(fp, "SHARED_SB_INC%s #%u, #%u", defer_mode_str(I), I.sb_mask,
|
||||
I.shared_entry);
|
||||
break;
|
||||
}
|
||||
|
||||
case MALI_CS_OPCODE_SHARED_SB_DEC: {
|
||||
cs_unpack(instr, CS_SHARED_SB_DEC, I);
|
||||
|
||||
const char *progress_increment_name[] = {
|
||||
".no_increment",
|
||||
".increment",
|
||||
};
|
||||
|
||||
fprintf(fp, "SHARED_SB_DEC%s #%u",
|
||||
progress_increment_name[I.progress_increment], I.shared_entry);
|
||||
fprintf(fp, "SHARED_SB_DEC #%u", I.shared_entry);
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if PAN_ARCH >= 14
|
||||
case MALI_CS_OPCODE_RUN_FRAGMENT2: {
|
||||
static const char *tile_order[] = {
|
||||
"zorder", "horizontal", "vertical", "unknown",
|
||||
"unknown", "rev_horizontal", "rev_vertical", "unknown",
|
||||
"unknown", "unknown", "unknown", "unknown",
|
||||
"unknown", "unknown", "unknown", "unknown",
|
||||
};
|
||||
|
||||
cs_unpack(instr, CS_RUN_FRAGMENT2, I);
|
||||
|
||||
fprintf(fp, "RUN_FRAGMENT2%s.tile_order=%s",
|
||||
I.enable_tem ? ".tile_enable_map_enable" : "",
|
||||
tile_order[I.tile_order]);
|
||||
break;
|
||||
}
|
||||
#else
|
||||
case MALI_CS_OPCODE_RUN_FRAGMENT: {
|
||||
static const char *tile_order[] = {
|
||||
"zorder", "horizontal", "vertical", "unknown",
|
||||
|
|
@ -350,27 +350,25 @@ print_cs_instr(FILE *fp, const uint64_t *instr)
|
|||
"unknown", "unknown", "unknown", "unknown",
|
||||
"unknown", "unknown", "unknown", "unknown",
|
||||
};
|
||||
|
||||
cs_unpack(instr, CS_RUN_FRAGMENT, I);
|
||||
|
||||
fprintf(fp, "RUN_FRAGMENT%s%s.tile_order=%s",
|
||||
I.progress_increment ? ".progress_inc" : "",
|
||||
fprintf(fp, "RUN_FRAGMENT%s.tile_order=%s",
|
||||
I.enable_tem ? ".tile_enable_map_enable" : "",
|
||||
tile_order[I.tile_order]);
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
|
||||
case MALI_CS_OPCODE_RUN_FULLSCREEN: {
|
||||
cs_unpack(instr, CS_RUN_FULLSCREEN, I);
|
||||
fprintf(fp, "RUN_FULLSCREEN%s r%u, #%" PRIx64,
|
||||
I.progress_increment ? ".progress_inc" : "", I.dcd,
|
||||
I.flags_override);
|
||||
fprintf(fp, "RUN_FULLSCREEN r%u, #%" PRIx64, I.dcd, I.flags_override);
|
||||
break;
|
||||
}
|
||||
|
||||
case MALI_CS_OPCODE_FINISH_TILING: {
|
||||
cs_unpack(instr, CS_FINISH_TILING, I);
|
||||
fprintf(fp, "FINISH_TILING%s",
|
||||
I.progress_increment ? ".progress_inc" : "");
|
||||
fprintf(fp, "FINISH_TILING");
|
||||
break;
|
||||
}
|
||||
|
||||
|
|
@ -443,12 +441,6 @@ print_cs_instr(FILE *fp, const uint64_t *instr)
|
|||
break;
|
||||
}
|
||||
|
||||
case MALI_CS_OPCODE_PROGRESS_WAIT: {
|
||||
cs_unpack(instr, CS_PROGRESS_WAIT, I);
|
||||
fprintf(fp, "PROGRESS_WAIT d%u, #%u", I.source, I.queue);
|
||||
break;
|
||||
}
|
||||
|
||||
case MALI_CS_OPCODE_SET_EXCEPTION_HANDLER: {
|
||||
cs_unpack(instr, CS_SET_EXCEPTION_HANDLER, I);
|
||||
fprintf(fp, "SET_EXCEPTION_HANDLER d%u, r%u", I.address, I.length);
|
||||
|
|
@ -547,29 +539,16 @@ print_cs_instr(FILE *fp, const uint64_t *instr)
|
|||
break;
|
||||
}
|
||||
|
||||
case MALI_CS_OPCODE_PROGRESS_STORE: {
|
||||
cs_unpack(instr, CS_PROGRESS_STORE, I);
|
||||
fprintf(fp, "PROGRESS_STORE d%u", I.source);
|
||||
break;
|
||||
}
|
||||
|
||||
case MALI_CS_OPCODE_PROGRESS_LOAD: {
|
||||
cs_unpack(instr, CS_PROGRESS_LOAD, I);
|
||||
fprintf(fp, "PROGRESS_LOAD d%u", I.destination);
|
||||
break;
|
||||
}
|
||||
|
||||
case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT: {
|
||||
cs_unpack(instr, CS_RUN_COMPUTE_INDIRECT, I);
|
||||
#if PAN_ARCH >= 12
|
||||
fprintf(fp, "RUN_COMPUTE_INDIRECT%s.srt%d.spd%d.tsd%d.fau%d #%u, #%u",
|
||||
I.progress_increment ? ".progress_inc" : "", I.srt_select,
|
||||
I.spd_select, I.tsd_select, I.fau_select, I.workgroups_per_task,
|
||||
I.ep_limit);
|
||||
fprintf(fp, "RUN_COMPUTE_INDIRECT.srt%d.spd%d.tsd%d.fau%d #%u, #%u",
|
||||
I.srt_select, I.spd_select, I.tsd_select, I.fau_select,
|
||||
I.workgroups_per_task, I.ep_limit);
|
||||
#else
|
||||
fprintf(fp, "RUN_COMPUTE_INDIRECT%s.srt%d.spd%d.tsd%d.fau%d #%u",
|
||||
I.progress_increment ? ".progress_inc" : "", I.srt_select,
|
||||
I.spd_select, I.tsd_select, I.fau_select, I.workgroups_per_task);
|
||||
fprintf(fp, "RUN_COMPUTE_INDIRECT.srt%d.spd%d.tsd%d.fau%d #%u",
|
||||
I.srt_select, I.spd_select, I.tsd_select, I.fau_select,
|
||||
I.workgroups_per_task);
|
||||
#endif
|
||||
|
||||
break;
|
||||
|
|
@ -672,8 +651,19 @@ pandecode_run_compute(struct pandecode_context *ctx, FILE *fp,
|
|||
if (fau)
|
||||
GENX(pandecode_fau)(ctx, fau & BITFIELD64_MASK(48), fau >> 56, "FAU");
|
||||
|
||||
GENX(pandecode_shader)
|
||||
(ctx, cs_get_u64(qctx, reg_spd), "Shader", qctx->gpu_id);
|
||||
uint64_t addr = cs_get_u64(qctx, reg_spd);
|
||||
#if PAN_ARCH >= 15
|
||||
const struct mali_shader_program_pointer_packed spp_packed = {
|
||||
.opaque[0] = addr & 0xFFFFFFFF,
|
||||
.opaque[1] = (addr >> 32) & 0xFFFFFFFF,
|
||||
};
|
||||
pan_unpack(&spp_packed, SHADER_PROGRAM_POINTER, spp)
|
||||
;
|
||||
DUMP_UNPACKED(ctx, SHADER_PROGRAM_POINTER, spp,
|
||||
"Shader Program Pointer (%" PRIx64 "):\n", addr);
|
||||
addr = spp.pointer;
|
||||
#endif
|
||||
GENX(pandecode_shader)(ctx, addr, "Shader", qctx->gpu_id);
|
||||
|
||||
DUMP_ADDR(ctx, LOCAL_STORAGE, cs_get_u64(qctx, reg_tsd),
|
||||
"Local Storage @%" PRIx64 ":\n", cs_get_u64(qctx, reg_tsd));
|
||||
|
|
@ -714,8 +704,19 @@ pandecode_run_compute_indirect(struct pandecode_context *ctx, FILE *fp,
|
|||
if (fau)
|
||||
GENX(pandecode_fau)(ctx, fau & BITFIELD64_MASK(48), fau >> 56, "FAU");
|
||||
|
||||
GENX(pandecode_shader)
|
||||
(ctx, cs_get_u64(qctx, reg_spd), "Shader", qctx->gpu_id);
|
||||
uint64_t addr = cs_get_u64(qctx, reg_spd);
|
||||
#if PAN_ARCH >= 15
|
||||
const struct mali_shader_program_pointer_packed spp_packed = {
|
||||
.opaque[0] = addr & 0xFFFFFFFF,
|
||||
.opaque[1] = (addr >> 32) & 0xFFFFFFFF,
|
||||
};
|
||||
pan_unpack(&spp_packed, SHADER_PROGRAM_POINTER, spp)
|
||||
;
|
||||
DUMP_UNPACKED(ctx, SHADER_PROGRAM_POINTER, spp,
|
||||
"Shader Program Pointer (%" PRIx64 "):\n", addr);
|
||||
addr = spp.pointer;
|
||||
#endif
|
||||
GENX(pandecode_shader)(ctx, addr, "Shader", qctx->gpu_id);
|
||||
|
||||
DUMP_ADDR(ctx, LOCAL_STORAGE, cs_get_u64(qctx, reg_tsd),
|
||||
"Local Storage @%" PRIx64 ":\n", cs_get_u64(qctx, reg_tsd));
|
||||
|
|
@ -1097,6 +1098,101 @@ pandecode_run_idvs(struct pandecode_context *ctx, FILE *fp,
|
|||
}
|
||||
#endif
|
||||
|
||||
#if PAN_ARCH >= 14
|
||||
static void
|
||||
pandecode_run_fragment2(struct pandecode_context *ctx, FILE *fp,
|
||||
struct queue_ctx *qctx, struct MALI_CS_RUN_FRAGMENT2 *I)
|
||||
{
|
||||
if (qctx->in_exception_handler)
|
||||
return;
|
||||
|
||||
ctx->indent++;
|
||||
|
||||
pandecode_log(ctx, "Iter trace ID0: %" PRIu32 "\n",
|
||||
cs_get_u32(qctx, MALI_FRAGMENT_SR_ITER_TRACE_ID0));
|
||||
pandecode_log(ctx, "Iter trace ID1: %" PRIu32 "\n",
|
||||
cs_get_u32(qctx, MALI_FRAGMENT_SR_ITER_TRACE_ID1));
|
||||
pandecode_log(ctx, "TEM pointer: %" PRIx64 "\n",
|
||||
cs_get_u64(qctx, MALI_FRAGMENT_SR_TEM_POINTER));
|
||||
pandecode_log(ctx, "TEM row stride: %" PRIu32 "\n",
|
||||
cs_get_u32(qctx, MALI_FRAGMENT_SR_TEM_ROW_STRIDE));
|
||||
|
||||
for (unsigned i = 0; i < 11; ++i) {
|
||||
const unsigned reg = MALI_FRAGMENT_SR_IRD_BUFFER_POINTER_0 + (i * 2);
|
||||
pandecode_log(ctx, "IRD buffer pointer %u: %" PRIx64 "\n", i,
|
||||
cs_get_u64(qctx, reg));
|
||||
}
|
||||
|
||||
DUMP_CL(ctx, FRAGMENT_FLAGS_3, &qctx->regs[MALI_FRAGMENT_SR_FLAGS_3],
|
||||
"Flags 3:\n");
|
||||
DUMP_CL(ctx, FRAGMENT_BOUNDING_BOX,
|
||||
&qctx->regs[MALI_FRAGMENT_SR_BOUNDING_BOX], "Bounding Box:\n");
|
||||
DUMP_CL(ctx, FRAME_SIZE, &qctx->regs[MALI_FRAGMENT_SR_FRAME_SIZE],
|
||||
"Frame size:\n");
|
||||
|
||||
pan_unpack((const struct mali_fragment_flags_0_packed *)&qctx
|
||||
->regs[MALI_FRAGMENT_SR_FLAGS_0],
|
||||
FRAGMENT_FLAGS_0, flags0_unpacked)
|
||||
;
|
||||
DUMP_UNPACKED(ctx, FRAGMENT_FLAGS_0, flags0_unpacked, "Flags 0:\n");
|
||||
|
||||
pan_unpack((const struct mali_fragment_flags_1_packed *)&qctx
|
||||
->regs[MALI_FRAGMENT_SR_FLAGS_1],
|
||||
FRAGMENT_FLAGS_1, flags1_unpacked)
|
||||
;
|
||||
DUMP_UNPACKED(ctx, FRAGMENT_FLAGS_1, flags1_unpacked, "Flags 1:\n");
|
||||
|
||||
DUMP_CL(ctx, FRAGMENT_FLAGS_2, &qctx->regs[MALI_FRAGMENT_SR_FLAGS_2],
|
||||
"Flags 2:\n");
|
||||
pandecode_log(ctx, "Z clear: %f\n",
|
||||
uif(cs_get_u32(qctx, MALI_FRAGMENT_SR_Z_CLEAR)));
|
||||
|
||||
const uint64_t tiler_pointer =
|
||||
cs_get_u64(qctx, MALI_FRAGMENT_SR_TILER_DESCRIPTOR_POINTER);
|
||||
pandecode_log(ctx, "Tiler descriptor pointer: 0x%" PRIx64 "\n",
|
||||
tiler_pointer);
|
||||
|
||||
const uint64_t rtd_pointer = cs_get_u64(qctx, MALI_FRAGMENT_SR_RTD_POINTER);
|
||||
pandecode_log(ctx, "RTD pointer: 0x%" PRIx64 "\n", rtd_pointer);
|
||||
|
||||
const uint64_t dbd_pointer = cs_get_u64(qctx, MALI_FRAGMENT_SR_DBD_POINTER);
|
||||
pandecode_log(ctx, "DBD pointer: 0x%" PRIx64 "\n", dbd_pointer);
|
||||
|
||||
pandecode_log(ctx, "Frame argument: %" PRIx64 "\n",
|
||||
cs_get_u64(qctx, MALI_FRAGMENT_SR_FRAME_ARG));
|
||||
|
||||
const uint64_t sample_locations =
|
||||
cs_get_u64(qctx, MALI_FRAGMENT_SR_SAMPLE_POSITION_ARRAY_POINTER);
|
||||
pandecode_log(ctx, "Sample locations: 0x%" PRIx64 "\n", sample_locations);
|
||||
|
||||
const uint64_t dcd_pointer =
|
||||
cs_get_u64(qctx, MALI_FRAGMENT_SR_FRAME_SHADER_DCD_POINTER);
|
||||
pandecode_log(ctx, "Frame shader DCD pointer: 0x%" PRIx64 "\n", dcd_pointer);
|
||||
|
||||
DUMP_CL(ctx, VRS_IMAGE, &qctx->regs[MALI_FRAGMENT_SR_VRS_IMAGE],
|
||||
"VRS image:\n");
|
||||
|
||||
GENX(pandecode_sample_locations)
|
||||
(ctx, sample_locations);
|
||||
|
||||
const unsigned job_type_param = 0;
|
||||
GENX(pandecode_frame_shader_dcds)
|
||||
(ctx, dcd_pointer, flags0_unpacked.pre_frame_0, flags0_unpacked.pre_frame_1,
|
||||
flags0_unpacked.post_frame, job_type_param, qctx->gpu_id);
|
||||
|
||||
if (tiler_pointer)
|
||||
GENX(pandecode_tiler)(ctx, tiler_pointer);
|
||||
|
||||
if (dbd_pointer)
|
||||
GENX(pandecode_zs_crc_ext)(ctx, dbd_pointer);
|
||||
|
||||
if (rtd_pointer)
|
||||
GENX(pandecode_rts)
|
||||
(ctx, rtd_pointer, flags1_unpacked.render_target_count);
|
||||
|
||||
ctx->indent--;
|
||||
}
|
||||
#else
|
||||
static void
|
||||
pandecode_run_fragment(struct pandecode_context *ctx, FILE *fp,
|
||||
struct queue_ctx *qctx, struct MALI_CS_RUN_FRAGMENT *I)
|
||||
|
|
@ -1115,6 +1211,7 @@ pandecode_run_fragment(struct pandecode_context *ctx, FILE *fp,
|
|||
|
||||
ctx->indent--;
|
||||
}
|
||||
#endif /* PAN_ARCH >= 14 */
|
||||
|
||||
static void
|
||||
pandecode_run_fullscreen(struct pandecode_context *ctx, FILE *fp,
|
||||
|
|
@ -1261,11 +1358,19 @@ interpret_cs_instr(struct pandecode_context *ctx, struct queue_ctx *qctx)
|
|||
}
|
||||
#endif
|
||||
|
||||
#if PAN_ARCH >= 14
|
||||
case MALI_CS_OPCODE_RUN_FRAGMENT2: {
|
||||
cs_unpack(bytes, CS_RUN_FRAGMENT2, I);
|
||||
pandecode_run_fragment2(ctx, fp, qctx, &I);
|
||||
break;
|
||||
}
|
||||
#else
|
||||
case MALI_CS_OPCODE_RUN_FRAGMENT: {
|
||||
cs_unpack(bytes, CS_RUN_FRAGMENT, I);
|
||||
pandecode_run_fragment(ctx, fp, qctx, &I);
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
|
||||
case MALI_CS_OPCODE_RUN_FULLSCREEN: {
|
||||
cs_unpack(bytes, CS_RUN_FULLSCREEN, I);
|
||||
|
|
@ -2192,18 +2297,6 @@ collect_indirect_branch_targets_recurse(struct cs_code_cfg *cfg,
|
|||
break;
|
||||
}
|
||||
|
||||
case MALI_CS_OPCODE_PROGRESS_LOAD: {
|
||||
cs_unpack(instr, CS_PROGRESS_LOAD, I);
|
||||
for (unsigned i = 0; i < 16; i++) {
|
||||
if (BITSET_TEST(track_map, I.destination) ||
|
||||
BITSET_TEST(track_map, I.destination + 1)) {
|
||||
ibranch->has_unknown_targets = true;
|
||||
return;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
|
@ -2430,7 +2523,12 @@ print_cs_binary(struct pandecode_context *ctx, uint64_t bin,
|
|||
#else
|
||||
case MALI_CS_OPCODE_RUN_IDVS:
|
||||
#endif
|
||||
|
||||
#if PAN_ARCH >= 14
|
||||
case MALI_CS_OPCODE_RUN_FRAGMENT2:
|
||||
#else
|
||||
case MALI_CS_OPCODE_RUN_FRAGMENT:
|
||||
#endif
|
||||
case MALI_CS_OPCODE_RUN_FULLSCREEN:
|
||||
case MALI_CS_OPCODE_RUN_COMPUTE:
|
||||
case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT:
|
||||
|
|
@ -2539,6 +2637,19 @@ GENX(pandecode_cs_trace)(struct pandecode_context *ctx, uint64_t trace,
|
|||
}
|
||||
#endif
|
||||
|
||||
#if PAN_ARCH >= 14
|
||||
case MALI_CS_OPCODE_RUN_FRAGMENT2: {
|
||||
struct cs_run_fragment2_trace *frag_trace = trace_data;
|
||||
|
||||
assert(trace_size >= sizeof(*frag_trace));
|
||||
cs_unpack(instr, CS_RUN_FRAGMENT2, I);
|
||||
memcpy(®s[0], frag_trace->sr, sizeof(frag_trace->sr));
|
||||
pandecode_run_fragment2(ctx, ctx->dump_stream, &qctx, &I);
|
||||
trace_data = frag_trace + 1;
|
||||
trace_size -= sizeof(*frag_trace);
|
||||
break;
|
||||
}
|
||||
#else
|
||||
case MALI_CS_OPCODE_RUN_FRAGMENT: {
|
||||
struct cs_run_fragment_trace *frag_trace = trace_data;
|
||||
|
||||
|
|
@ -2550,6 +2661,7 @@ GENX(pandecode_cs_trace)(struct pandecode_context *ctx, uint64_t trace,
|
|||
trace_size -= sizeof(*frag_trace);
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
|
||||
case MALI_CS_OPCODE_RUN_FULLSCREEN: {
|
||||
struct cs_run_fullscreen_trace *fs_trace = trace_data;
|
||||
|
|
|
|||
|
|
@ -61,6 +61,12 @@
|
|||
#elif (PAN_ARCH == 13)
|
||||
#define GENX(X) X##_v13
|
||||
#include "genxml/v13_pack.h"
|
||||
#elif (PAN_ARCH == 14)
|
||||
#define GENX(X) X##_v14
|
||||
#include "genxml/v14_pack.h"
|
||||
#elif (PAN_ARCH == 15)
|
||||
#define GENX(X) X##_v15
|
||||
#include "genxml/v15_pack.h"
|
||||
#else
|
||||
#error "Need to add suffixing macro for this architecture"
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -83,23 +83,34 @@ def parse_modifier(modifier):
|
|||
if modifier is None:
|
||||
return None
|
||||
|
||||
for mod in MODIFIERS:
|
||||
if modifier[0:len(mod)] == mod:
|
||||
if mod == "log2":
|
||||
assert(len(mod) == len(modifier))
|
||||
return [mod]
|
||||
ret = []
|
||||
split_modifiers = modifier.split()
|
||||
|
||||
if modifier[len(mod)] == '(' and modifier[-1] == ')':
|
||||
ret = [mod, int(modifier[(len(mod) + 1):-1])]
|
||||
if ret[0] == 'align':
|
||||
align = ret[1]
|
||||
# Make sure the alignment is a power of 2
|
||||
assert(align > 0 and not(align & (align - 1)));
|
||||
for mod in split_modifiers:
|
||||
valid = False
|
||||
for valid_mod in MODIFIERS:
|
||||
if mod[0:len(valid_mod)] == valid_mod:
|
||||
if valid_mod == "log2":
|
||||
assert(len(valid_mod) == len(modifier))
|
||||
# Add a number to simplify parsing
|
||||
ret.extend([valid_mod, 0])
|
||||
valid = True
|
||||
break
|
||||
|
||||
return ret
|
||||
if mod[len(valid_mod)] == '(' and mod[-1] == ')':
|
||||
mod_arg = [valid_mod, int(mod[(len(valid_mod) + 1):-1])]
|
||||
if mod_arg[0] == 'align':
|
||||
align = mod_arg[1]
|
||||
# Make sure the alignment is a power of 2
|
||||
assert(align > 0 and not(align & (align - 1)));
|
||||
|
||||
print("Invalid modifier")
|
||||
assert(False)
|
||||
ret.extend(mod_arg)
|
||||
valid = True
|
||||
break
|
||||
|
||||
assert valid, f"Invalid modifier: {modifier}"
|
||||
|
||||
return ret
|
||||
|
||||
class Aggregate(object):
|
||||
def __init__(self, parser, name, attrs):
|
||||
|
|
@ -169,7 +180,7 @@ class Field(object):
|
|||
if self.type in self.parser.enums and self.default is not None:
|
||||
self.default = safe_name('{}_{}_{}'.format(global_prefix, self.type, self.default)).upper()
|
||||
|
||||
self.modifier = parse_modifier(attrs.get("modifier"))
|
||||
self.modifier = parse_modifier(attrs.get("modifier"))
|
||||
|
||||
def emit_template_struct(self, dim):
|
||||
if self.type == 'address':
|
||||
|
|
@ -291,14 +302,22 @@ class Group(object):
|
|||
if field.modifier is None:
|
||||
continue
|
||||
|
||||
if field.modifier[0] == "shr":
|
||||
shift = field.modifier[1]
|
||||
mask = hex((1 << shift) - 1)
|
||||
print(" assert(((__unpacked)->{} & {}) == 0); \\".format(field.name, mask))
|
||||
elif field.modifier[0] == "minus":
|
||||
print(" assert((__unpacked)->{} >= {}); \\".format(field.name, field.modifier[1]))
|
||||
elif field.modifier[0] == "log2":
|
||||
print(" assert(IS_POT_NONZERO((__unpacked)->{})); \\".format(field.name))
|
||||
value = "(__unpacked)->{}".format(field.name)
|
||||
for mod, mod_val in zip (field.modifier[::2], field.modifier[1::2]):
|
||||
if mod == "shr":
|
||||
mask = hex((1 << mod_val) - 1)
|
||||
print(" assert(({} & {}) == 0); \\".format(value, mask))
|
||||
value = "({} >> {})".format(value, mod_val)
|
||||
elif mod == "minus":
|
||||
print(" assert({} >= {}); \\".format(value, mod_val))
|
||||
value = "({} - {})".format(value, mod_val)
|
||||
elif mod == "align":
|
||||
mask = hex(mod_val - 1)
|
||||
print(' assert(!({} & {})); \\'.format(value, mask))
|
||||
value = "(ALIGN_POT({}, {}))".format(value, mod_val)
|
||||
elif mod == "log2":
|
||||
print(" assert(IS_POT_NONZERO({})); \\".format(value))
|
||||
value = "(util_logbase2({}))".format(value)
|
||||
|
||||
for index in range(self.length // 4):
|
||||
# Handle MBZ words
|
||||
|
|
@ -324,14 +343,15 @@ class Group(object):
|
|||
|
||||
value = "(__unpacked)->{}".format(contributor.path)
|
||||
if field.modifier is not None:
|
||||
if field.modifier[0] == "shr":
|
||||
value = "{} >> {}".format(value, field.modifier[1])
|
||||
elif field.modifier[0] == "minus":
|
||||
value = "{} - {}".format(value, field.modifier[1])
|
||||
elif field.modifier[0] == "align":
|
||||
value = "ALIGN_POT({}, {})".format(value, field.modifier[1])
|
||||
elif field.modifier[0] == "log2":
|
||||
value = "util_logbase2({})".format(value)
|
||||
for mod, mod_val in zip(field.modifier[::2], field.modifier[1::2]):
|
||||
if mod == "shr":
|
||||
value = "({} >> {})".format(value, mod_val)
|
||||
elif mod == "minus":
|
||||
value = "({} - {})".format(value, mod_val)
|
||||
elif mod == "align":
|
||||
value = "(ALIGN_POT({}, {}))".format(value, mod_val)
|
||||
elif mod == "log2":
|
||||
value = "(util_logbase2({}))".format(value)
|
||||
|
||||
if field.type in ["uint", "hex", "uint/float", "address", "Pixel Format", "Component Swizzle"]:
|
||||
s = "util_bitpack_uint(%s, %d, %d)" % \
|
||||
|
|
@ -435,25 +455,24 @@ class Group(object):
|
|||
else:
|
||||
s = "/* unhandled field %s, type %s */\n" % (field.name, field.type)
|
||||
|
||||
suffix = ""
|
||||
prefix = ""
|
||||
if field.modifier:
|
||||
if field.modifier[0] == "minus":
|
||||
suffix = " + {}".format(field.modifier[1])
|
||||
elif field.modifier[0] == "shr":
|
||||
suffix = " << {}".format(field.modifier[1])
|
||||
if field.modifier[0] == "log2":
|
||||
prefix = "1U << "
|
||||
|
||||
print(' {}({}); \\'.format(convert, ', '.join(args)))
|
||||
|
||||
if len(prefix) != 0 or len(suffix) != 0:
|
||||
print(' (__unpacked)->{} = {}(__unpacked)->{}{}; \\'.format(fieldref.path, prefix, fieldref.path, suffix))
|
||||
value = "(__unpacked)->{}".format(fieldref.path)
|
||||
if field.modifier is not None:
|
||||
# Need to reverse ([::-1]) modifier order when unpacking
|
||||
for mod, mod_val in list(zip(field.modifier[::2], field.modifier[1::2]))[::-1]:
|
||||
if mod == "shr":
|
||||
value = "({} << {})".format(value, mod_val)
|
||||
elif mod == "minus":
|
||||
value = "({} + {})".format(value, mod_val)
|
||||
elif mod == "align":
|
||||
mask = hex(mod_val - 1)
|
||||
print(' assert(!({} & {})); \\'.format(value, mask))
|
||||
elif mod == "log2":
|
||||
value = "(1U << {})".format(value)
|
||||
|
||||
|
||||
if field.modifier and field.modifier[0] == "align":
|
||||
mask = hex(field.modifier[1] - 1)
|
||||
print(' assert(!((__unpacked)->{} & {})); \\'.format(fieldref.path, mask))
|
||||
print(' (__unpacked)->{} = {}; \\'.format(fieldref.path, value))
|
||||
|
||||
def emit_print_function(self):
|
||||
for field in self.fields:
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@
|
|||
# SPDX-License-Identifier: MIT
|
||||
|
||||
pan_packers = []
|
||||
foreach packer : ['common', 'v4', 'v5', 'v6', 'v7', 'v9', 'v10', 'v12', 'v13']
|
||||
foreach packer : ['common', 'v4', 'v5', 'v6', 'v7', 'v9', 'v10', 'v12', 'v13', 'v14', 'v15']
|
||||
pan_packers += custom_target(
|
||||
packer + '_pack.h',
|
||||
input : ['gen_pack.py', packer + '.xml'],
|
||||
|
|
@ -20,7 +20,7 @@ idep_pan_packers = declare_dependency(
|
|||
|
||||
libpanfrost_decode_per_arch = []
|
||||
|
||||
foreach ver : ['4', '5', '6', '7', '9', '10', '12', '13']
|
||||
foreach ver : ['4', '5', '6', '7', '9', '10', '12', '13', '14', '15']
|
||||
libpanfrost_decode_per_arch += static_library(
|
||||
'pandecode-arch-v' + ver,
|
||||
['decode.c', 'decode_jm.c', 'decode_csf.c', pan_packers],
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
<!--
|
||||
Copyright (C) 2020 Collabora Ltd.
|
||||
Copyright (C) 2026 Arm Ltd.
|
||||
SPDX-License-Identifier: MIT
|
||||
-->
|
||||
|
||||
|
|
@ -84,6 +85,7 @@
|
|||
<enum name="Address Mode">
|
||||
<value name="Flat" value="0"/>
|
||||
<value name="Packed" value="1"/>
|
||||
<value name="Out of bounds" value="8"/>
|
||||
</enum>
|
||||
|
||||
<enum name="Format">
|
||||
|
|
@ -132,6 +134,7 @@
|
|||
<value name="A2 YUV10" value="41"/>
|
||||
<value name="YUYAAYVYAA" value="42"/>
|
||||
<!--- TODO: revisit YUV -->
|
||||
<value name="Y10U10V10_420" value="43"/>
|
||||
<value name="YUYV10" value="44"/>
|
||||
<value name="VYUY10" value="45"/>
|
||||
<value name="Y10 UV10 422" value="46"/>
|
||||
|
|
@ -1163,6 +1166,13 @@
|
|||
<enum name="Clump Ordering">
|
||||
<value name="Tiled U-Interleaved" value="1"/>
|
||||
<value name="Linear" value="2"/>
|
||||
|
||||
<!-- Block-linear interleaved clump orderings are not available on
|
||||
all v10 architectures. -->
|
||||
<value name="Block-linear interleaved 16x16" value="3"/>
|
||||
<value name="Block-linear interleaved 8x16" value="4"/>
|
||||
<value name="Block-linear interleaved 8x8" value="5"/>
|
||||
|
||||
<value name="Interleaved 64k" value="8"/>
|
||||
</enum>
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
<!--
|
||||
Copyright (C) 2025 Collabora Ltd.
|
||||
Copyright (C) 2026 Arm Ltd.
|
||||
SPDX-License-Identifier: MIT
|
||||
-->
|
||||
|
||||
|
|
@ -84,6 +85,7 @@
|
|||
<enum name="Address Mode">
|
||||
<value name="Flat" value="0"/>
|
||||
<value name="Packed" value="1"/>
|
||||
<value name="Out of bounds" value="8"/>
|
||||
</enum>
|
||||
|
||||
<enum name="Format">
|
||||
|
|
@ -132,6 +134,7 @@
|
|||
<value name="A2 YUV10" value="41"/>
|
||||
<value name="YUYAAYVYAA" value="42"/>
|
||||
<!--- TODO: revisit YUV -->
|
||||
<value name="Y10U10V10_420" value="43"/>
|
||||
<value name="YUYV10" value="44"/>
|
||||
<value name="VYUY10" value="45"/>
|
||||
<value name="Y10 UV10 422" value="46"/>
|
||||
|
|
@ -1426,6 +1429,9 @@
|
|||
<enum name="Clump Ordering">
|
||||
<value name="Tiled U-Interleaved" value="1"/>
|
||||
<value name="Linear" value="2"/>
|
||||
<value name="Block-linear interleaved 16x16" value="3"/>
|
||||
<value name="Block-linear interleaved 8x16" value="4"/>
|
||||
<value name="Block-linear interleaved 8x8" value="5"/>
|
||||
<value name="Interleaved 64k" value="8"/>
|
||||
</enum>
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
<!--
|
||||
Copyright (C) 2025 Collabora Ltd.
|
||||
Copyright (C) 2026 Arm Ltd.
|
||||
SPDX-License-Identifier: MIT
|
||||
-->
|
||||
|
||||
|
|
@ -84,6 +85,7 @@
|
|||
<enum name="Address Mode">
|
||||
<value name="Flat" value="0"/>
|
||||
<value name="Packed" value="1"/>
|
||||
<value name="Out of bounds" value="8"/>
|
||||
</enum>
|
||||
|
||||
<enum name="Format">
|
||||
|
|
@ -132,6 +134,7 @@
|
|||
<value name="A2 YUV10" value="41"/>
|
||||
<value name="YUYAAYVYAA" value="42"/>
|
||||
<!--- TODO: revisit YUV -->
|
||||
<value name="Y10U10V10_420" value="43"/>
|
||||
<value name="YUYV10" value="44"/>
|
||||
<value name="VYUY10" value="45"/>
|
||||
<value name="Y10 UV10 422" value="46"/>
|
||||
|
|
@ -1728,6 +1731,9 @@
|
|||
<enum name="Clump Ordering">
|
||||
<value name="Tiled U-Interleaved" value="1"/>
|
||||
<value name="Linear" value="2"/>
|
||||
<value name="Block-linear interleaved 16x16" value="3"/>
|
||||
<value name="Block-linear interleaved 8x16" value="4"/>
|
||||
<value name="Block-linear interleaved 8x8" value="5"/>
|
||||
<value name="Interleaved 64k" value="8"/>
|
||||
</enum>
|
||||
|
||||
|
|
|
|||
2753
src/panfrost/genxml/v14.xml
Normal file
2753
src/panfrost/genxml/v14.xml
Normal file
File diff suppressed because it is too large
Load diff
2759
src/panfrost/genxml/v15.xml
Normal file
2759
src/panfrost/genxml/v15.xml
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -1,5 +1,6 @@
|
|||
<!--
|
||||
Copyright (C) 2020 Collabora Ltd.
|
||||
Copyright (C) 2026 Arm Ltd.
|
||||
SPDX-License-Identifier: MIT
|
||||
-->
|
||||
|
||||
|
|
@ -103,6 +104,7 @@
|
|||
<enum name="Address Mode">
|
||||
<value name="Flat" value="0"/>
|
||||
<value name="Packed" value="1"/>
|
||||
<value name="Out of bounds" value="8"/>
|
||||
</enum>
|
||||
|
||||
<enum name="Format">
|
||||
|
|
|
|||
|
|
@ -206,6 +206,9 @@ struct pan_kmod_dev_props {
|
|||
/* Maximum number of threads per workgroup. */
|
||||
uint32_t max_threads_per_wg;
|
||||
|
||||
/* Granularity of number of active threads. */
|
||||
uint32_t num_threads_active_granularity;
|
||||
|
||||
/* Number of registers per core. Can be used to determine the maximum
|
||||
* number of threads that can be allocated for a specific shader based on
|
||||
* the number of registers assigned to this shader.
|
||||
|
|
|
|||
|
|
@ -133,13 +133,17 @@ panthor_dev_query_thread_props(struct panthor_kmod_dev *panthor_dev)
|
|||
props->max_tasks_per_core = panthor_dev->props.gpu.thread_features >> 24;
|
||||
props->num_registers_per_core =
|
||||
panthor_dev->props.gpu.thread_features & 0x3fffff;
|
||||
props->num_threads_active_granularity =
|
||||
panthor_dev->props.gpu.thread_num_active_granularity;
|
||||
|
||||
/* We assume that all thread properties are populated. If we ever have a GPU
|
||||
* that has one of the THREAD_xxx registers set to zero, we can always add a
|
||||
* quirk here.
|
||||
*/
|
||||
assert(props->max_threads_per_wg && props->max_threads_per_core &&
|
||||
props->max_tasks_per_core && props->num_registers_per_core);
|
||||
assert(
|
||||
(props->max_threads_per_wg || props->num_threads_active_granularity) &&
|
||||
props->max_threads_per_core && props->max_tasks_per_core &&
|
||||
props->num_registers_per_core);
|
||||
|
||||
/* There is no THREAD_TLS_ALLOC register on v10+, and the maximum number
|
||||
* of TLS instances per core is assumed to be the maximum number of threads
|
||||
|
|
@ -153,8 +157,12 @@ panthor_dev_query_props(struct panthor_kmod_dev *panthor_dev)
|
|||
{
|
||||
struct pan_kmod_dev_props *props = &panthor_dev->base.props;
|
||||
|
||||
bool is_gpu_wide = panthor_dev->props.gpu.gpu_id == 0;
|
||||
assert(!is_gpu_wide || panthor_dev->props.gpu.gpu_wide_id);
|
||||
|
||||
*props = (struct pan_kmod_dev_props){
|
||||
.gpu_id = panthor_dev->props.gpu.gpu_id,
|
||||
.gpu_id = is_gpu_wide ? panthor_dev->props.gpu.gpu_wide_id
|
||||
: panthor_dev->props.gpu.gpu_id,
|
||||
.gpu_variant = panthor_dev->props.gpu.core_features & 0xff,
|
||||
.shader_present = panthor_dev->props.gpu.shader_present,
|
||||
.tiler_features = panthor_dev->props.gpu.tiler_features,
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@
|
|||
|
||||
subdir('kmod')
|
||||
|
||||
pixel_format_versions = ['5', '6', '7', '9', '10', '12', '13']
|
||||
pixel_format_versions = ['5', '6', '7', '9', '10', '12', '13', '14', '15']
|
||||
libpanfrost_pixel_format = []
|
||||
|
||||
deps_for_libpanfrost = [dep_libdrm, idep_pan_packers, idep_mesautil, libpanfrost_model_dep]
|
||||
|
|
@ -22,7 +22,7 @@ endforeach
|
|||
|
||||
libpanfrost_per_arch = []
|
||||
|
||||
foreach ver : ['4', '5', '6', '7', '9', '10', '12', '13']
|
||||
foreach ver : ['4', '5', '6', '7', '9', '10', '12', '13', '14', '15']
|
||||
libpanfrost_per_arch += static_library(
|
||||
'pan-arch-v' + ver,
|
||||
[
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@
|
|||
* Copyright (C) 2014 Broadcom
|
||||
* Copyright (C) 2018-2019 Alyssa Rosenzweig
|
||||
* Copyright (C) 2019-2020 Collabora, Ltd.
|
||||
* Copyright (C) 2026 Arm Ltd.
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
|
|
@ -711,6 +712,32 @@ pan_afbc_compression_mode(enum pan_afbc_mode mode)
|
|||
case PAN_AFBC_MODE_R16G16B16A16:
|
||||
return MALI_AFBC_COMPRESSION_MODE_R16G16B16A16;
|
||||
#endif
|
||||
#if PAN_ARCH >= 14
|
||||
case PAN_AFBC_MODE_YUV420_6C8:
|
||||
return MALI_AFBC_COMPRESSION_MODE_Y8U8V8_420;
|
||||
case PAN_AFBC_MODE_YUV420_2C8:
|
||||
return MALI_AFBC_COMPRESSION_MODE_R8G8;
|
||||
case PAN_AFBC_MODE_YUV420_1C8:
|
||||
return MALI_AFBC_COMPRESSION_MODE_R8;
|
||||
case PAN_AFBC_MODE_YUV420_6C10:
|
||||
return MALI_AFBC_COMPRESSION_MODE_Y10U10V10_420;
|
||||
case PAN_AFBC_MODE_YUV420_2C10:
|
||||
return MALI_AFBC_COMPRESSION_MODE_R10G10;
|
||||
case PAN_AFBC_MODE_YUV420_1C10:
|
||||
return MALI_AFBC_COMPRESSION_MODE_R10;
|
||||
case PAN_AFBC_MODE_YUV422_4C8:
|
||||
return MALI_AFBC_COMPRESSION_MODE_Y8U8Y8V8_422;
|
||||
case PAN_AFBC_MODE_YUV422_2C8:
|
||||
return MALI_AFBC_COMPRESSION_MODE_R8G8;
|
||||
case PAN_AFBC_MODE_YUV422_1C8:
|
||||
return MALI_AFBC_COMPRESSION_MODE_R8;
|
||||
case PAN_AFBC_MODE_YUV422_4C10:
|
||||
return MALI_AFBC_COMPRESSION_MODE_Y10U10Y10V10_422;
|
||||
case PAN_AFBC_MODE_YUV422_2C10:
|
||||
return MALI_AFBC_COMPRESSION_MODE_R10G10;
|
||||
case PAN_AFBC_MODE_YUV422_1C10:
|
||||
return MALI_AFBC_COMPRESSION_MODE_R10;
|
||||
#else
|
||||
case PAN_AFBC_MODE_YUV420_6C8:
|
||||
return MALI_AFBC_COMPRESSION_MODE_YUV420_6C8;
|
||||
case PAN_AFBC_MODE_YUV420_2C8:
|
||||
|
|
@ -735,6 +762,7 @@ pan_afbc_compression_mode(enum pan_afbc_mode mode)
|
|||
return MALI_AFBC_COMPRESSION_MODE_YUV422_2C10;
|
||||
case PAN_AFBC_MODE_YUV422_1C10:
|
||||
return MALI_AFBC_COMPRESSION_MODE_YUV422_1C10;
|
||||
#endif /* PAN_ARCH >= 14 */
|
||||
#if PAN_ARCH == 9
|
||||
case PAN_AFBC_MODE_R16:
|
||||
case PAN_AFBC_MODE_R16G16:
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
/*
|
||||
* Copyright (C) 2023 Collabora, Ltd.
|
||||
* Copyright (C) 2026 Arm Ltd.
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
|
|
@ -347,6 +348,25 @@ pan_afrc_format(struct pan_afrc_format_info info, uint64_t modifier,
|
|||
return (scan ? MALI_AFRC_FORMAT_R10G10B10A10_SCAN
|
||||
: MALI_AFRC_FORMAT_R10G10B10A10_ROT);
|
||||
|
||||
#if PAN_ARCH >= 14
|
||||
case PAN_AFRC_ICHANGE_FORMAT_YUV444:
|
||||
case PAN_AFRC_ICHANGE_FORMAT_YUV422:
|
||||
case PAN_AFRC_ICHANGE_FORMAT_YUV420:
|
||||
if (info.bpc == 8) {
|
||||
if (plane == 0 || info.num_planes == 3)
|
||||
return (scan ? MALI_AFRC_FORMAT_R8_SCAN : MALI_AFRC_FORMAT_R8_ROT);
|
||||
|
||||
return (scan ? MALI_AFRC_FORMAT_R8G8_SCAN : MALI_AFRC_FORMAT_R8G8_ROT);
|
||||
}
|
||||
|
||||
if (plane == 0 || info.num_planes == 3)
|
||||
return (scan ? MALI_AFRC_FORMAT_R10_SCAN : MALI_AFRC_FORMAT_R10_ROT);
|
||||
|
||||
assert(info.ichange_fmt == PAN_AFRC_ICHANGE_FORMAT_YUV422 ||
|
||||
info.ichange_fmt == PAN_AFRC_ICHANGE_FORMAT_YUV420);
|
||||
return (scan ? MALI_AFRC_FORMAT_R10G10_SCAN
|
||||
: MALI_AFRC_FORMAT_R10G10_ROT);
|
||||
#else
|
||||
case PAN_AFRC_ICHANGE_FORMAT_YUV444:
|
||||
if (info.bpc == 8) {
|
||||
if (plane == 0 || info.num_planes == 3)
|
||||
|
|
@ -394,6 +414,7 @@ pan_afrc_format(struct pan_afrc_format_info info, uint64_t modifier,
|
|||
|
||||
return (scan ? MALI_AFRC_FORMAT_R10G10_420_SCAN
|
||||
: MALI_AFRC_FORMAT_R10G10_420_ROT);
|
||||
#endif /* PAN_ARCH >= 14 */
|
||||
|
||||
default:
|
||||
return MALI_AFRC_FORMAT_INVALID;
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
/*
|
||||
* Copyright (C) 2021 Collabora, Ltd.
|
||||
* Copyright (C) 2026 Arm Ltd.
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
|
|
@ -11,6 +12,7 @@
|
|||
#include "pan_afrc.h"
|
||||
#include "pan_desc.h"
|
||||
#include "pan_encoder.h"
|
||||
#include "pan_fb.h"
|
||||
#include "pan_props.h"
|
||||
#include "pan_texture.h"
|
||||
#include "pan_trace.h"
|
||||
|
|
@ -1172,11 +1174,156 @@ check_fb_attachments(const struct pan_fb_info *fb)
|
|||
#endif
|
||||
}
|
||||
|
||||
#if PAN_ARCH >= 14
|
||||
unsigned
|
||||
GENX(pan_emit_fbd)(const struct pan_fb_info *fb, unsigned layer_idx,
|
||||
const struct pan_tls_info *tls,
|
||||
const struct pan_tiler_context *tiler_ctx, void *out)
|
||||
const struct pan_tiler_context *tiler_ctx,
|
||||
const struct pan_ptr framebuffer)
|
||||
{
|
||||
void *out = framebuffer.cpu;
|
||||
|
||||
PAN_TRACE_FUNC(PAN_TRACE_LIB_DESC);
|
||||
|
||||
check_fb_attachments(fb);
|
||||
|
||||
const int crc_rt = GENX(pan_select_crc_rt)(fb, fb->tile_size);
|
||||
const bool has_zs_crc_ext = (fb->zs.view.zs || fb->zs.view.s || crc_rt >= 0);
|
||||
const struct pan_clean_tile clean_tile = pan_get_clean_tile_info(fb);
|
||||
|
||||
/* Emit to memory the state that might change per-layer. The static
|
||||
* state is emitted directly to CSF registers by
|
||||
* cs_emit_static_fragment_state().
|
||||
*/
|
||||
|
||||
struct pan_fbd_layer fbd_data = {0};
|
||||
fbd_data.tiler = tiler_ctx->valhall.desc;
|
||||
|
||||
/* internal_layer_index in flags0 is used to select the right
|
||||
* primitive list in the tiler context, and frame_arg is the value
|
||||
* that's passed to the fragment shader through r62-r63, which we use
|
||||
* to pass gl_Layer. Since the layer_idx only takes 8-bits, we might
|
||||
* use the extra 56-bits we have in frame_argument to pass other
|
||||
* information to the fragment shader at some point.
|
||||
*/
|
||||
assert(layer_idx >= tiler_ctx->valhall.layer_offset);
|
||||
fbd_data.frame_argument = layer_idx;
|
||||
|
||||
pan_pack(&fbd_data.flags0, FRAGMENT_FLAGS_0, cfg) {
|
||||
cfg.pre_frame_0 =
|
||||
pan_fix_frame_shader_mode(fb->bifrost.pre_post.modes[0],
|
||||
pan_clean_tile_write_any_set(clean_tile));
|
||||
cfg.pre_frame_1 =
|
||||
pan_fix_frame_shader_mode(fb->bifrost.pre_post.modes[1],
|
||||
pan_clean_tile_write_any_set(clean_tile));
|
||||
cfg.post_frame = fb->bifrost.pre_post.modes[2];
|
||||
|
||||
const unsigned zs_bytes_per_pixel = pan_zsbuf_bytes_per_pixel(fb);
|
||||
/* We can interleave HSR if we have space for two ZS tiles in
|
||||
* the tile buffer. */
|
||||
const unsigned max_zs_tile_size_interleave =
|
||||
fb->z_tile_buf_budget >> util_logbase2_ceil(zs_bytes_per_pixel);
|
||||
const bool hsr_can_interleave =
|
||||
fb->tile_size <= max_zs_tile_size_interleave;
|
||||
|
||||
/* Enabling prepass without interleave is generally not good for
|
||||
* performance, so disable HSR in that case. */
|
||||
cfg.hsr_prepass_enable = fb->allow_hsr_prepass && hsr_can_interleave;
|
||||
cfg.hsr_prepass_interleaving_enable = hsr_can_interleave;
|
||||
cfg.hsr_prepass_filter_enable = true;
|
||||
cfg.hsr_hierarchical_optimizations_enable = true;
|
||||
|
||||
cfg.internal_layer_index = layer_idx - tiler_ctx->valhall.layer_offset;
|
||||
}
|
||||
|
||||
fbd_data.dcd_pointer = fb->bifrost.pre_post.dcds.gpu;
|
||||
|
||||
pan_pack(&fbd_data.flags2, FRAGMENT_FLAGS_2, cfg) {
|
||||
cfg.s_clear = fb->zs.clear_value.stencil;
|
||||
cfg.s_write_enable = (fb->zs.view.s && !fb->zs.discard.s);
|
||||
|
||||
/* Default to 24 bit depth if there's no surface. */
|
||||
cfg.z_internal_format =
|
||||
fb->zs.view.zs ? pan_get_z_internal_format(fb->zs.view.zs->format)
|
||||
: MALI_Z_INTERNAL_FORMAT_D24;
|
||||
cfg.z_write_enable = (fb->zs.view.zs && !fb->zs.discard.z);
|
||||
|
||||
if (crc_rt >= 0) {
|
||||
bool *valid = fb->rts[crc_rt].crc_valid;
|
||||
bool full = !fb->draw_extent.minx && !fb->draw_extent.miny &&
|
||||
fb->draw_extent.maxx == (fb->width - 1) &&
|
||||
fb->draw_extent.maxy == (fb->height - 1);
|
||||
|
||||
/* If the CRC was valid it stays valid, if it wasn't, we must
|
||||
* ensure the render operation covers the full frame, and
|
||||
* clean tiles are pushed to memory. */
|
||||
bool new_valid = *valid | (full && pan_clean_tile_write_rt_enabled(
|
||||
clean_tile, crc_rt));
|
||||
|
||||
cfg.crc_read_enable = *valid;
|
||||
|
||||
/* If the data is currently invalid, still write CRC
|
||||
* data if we are doing a full write, so that it is
|
||||
* valid for next time. */
|
||||
cfg.crc_write_enable = new_valid;
|
||||
|
||||
*valid = new_valid;
|
||||
}
|
||||
}
|
||||
|
||||
fbd_data.z_clear = util_bitpack_float(fb->zs.clear_value.depth);
|
||||
|
||||
{
|
||||
/* Set the DBD and RTD pointers. Both must be 64-bytes aligned. */
|
||||
uint64_t out_gpu_addr =
|
||||
framebuffer.gpu + ALIGN_POT(sizeof(struct pan_fbd_layer), 64);
|
||||
|
||||
if (has_zs_crc_ext) {
|
||||
fbd_data.dbd_pointer = out_gpu_addr;
|
||||
assert(fbd_data.dbd_pointer % 64 == 0);
|
||||
out_gpu_addr += pan_size(ZS_CRC_EXTENSION);
|
||||
}
|
||||
|
||||
fbd_data.rtd_pointer = out_gpu_addr;
|
||||
assert(fbd_data.rtd_pointer % 64 == 0);
|
||||
}
|
||||
|
||||
memcpy(out, &fbd_data, sizeof(fbd_data));
|
||||
out += ALIGN_POT(sizeof(fbd_data), 64);
|
||||
|
||||
if (has_zs_crc_ext) {
|
||||
struct mali_zs_crc_extension_packed *zs_crc_ext = out;
|
||||
pan_emit_zs_crc_ext(fb, layer_idx, crc_rt, zs_crc_ext, clean_tile);
|
||||
out += pan_size(ZS_CRC_EXTENSION);
|
||||
}
|
||||
|
||||
const unsigned rt_count = MAX2(fb->rt_count, 1);
|
||||
unsigned cbuf_offset = 0;
|
||||
for (unsigned i = 0; i < rt_count; i++) {
|
||||
pan_emit_rt(fb, layer_idx, i, cbuf_offset, out, clean_tile);
|
||||
out += pan_size(RENDER_TARGET);
|
||||
if (!fb->rts[i].view)
|
||||
continue;
|
||||
|
||||
cbuf_offset += pan_bytes_per_pixel_tib(fb->rts[i].view->format) *
|
||||
fb->tile_size *
|
||||
pan_image_view_get_nr_samples(fb->rts[i].view);
|
||||
|
||||
if (i != crc_rt && fb->rts[i].crc_valid != NULL)
|
||||
*(fb->rts[i].crc_valid) = false;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
#else
|
||||
unsigned
|
||||
GENX(pan_emit_fbd)(const struct pan_fb_info *fb, unsigned layer_idx,
|
||||
const struct pan_tls_info *tls,
|
||||
const struct pan_tiler_context *tiler_ctx,
|
||||
const struct pan_ptr framebuffer)
|
||||
{
|
||||
void *out = framebuffer.cpu;
|
||||
|
||||
PAN_TRACE_FUNC(PAN_TRACE_LIB_DESC);
|
||||
|
||||
check_fb_attachments(fb);
|
||||
|
|
@ -1351,6 +1498,7 @@ GENX(pan_emit_fbd)(const struct pan_fb_info *fb, unsigned layer_idx,
|
|||
}
|
||||
return tag.opaque[0];
|
||||
}
|
||||
#endif /* PAN_ARCH >= 14 */
|
||||
#else /* PAN_ARCH == 4 */
|
||||
static enum mali_color_format
|
||||
pan_sfbd_raw_format(unsigned bits)
|
||||
|
|
@ -1378,8 +1526,11 @@ GENX(pan_select_tile_size)(struct pan_fb_info *fb)
|
|||
unsigned
|
||||
GENX(pan_emit_fbd)(const struct pan_fb_info *fb, unsigned layer_idx,
|
||||
const struct pan_tls_info *tls,
|
||||
const struct pan_tiler_context *tiler_ctx, void *fbd)
|
||||
const struct pan_tiler_context *tiler_ctx,
|
||||
const struct pan_ptr framebuffer)
|
||||
{
|
||||
void *fbd = framebuffer.cpu;
|
||||
|
||||
PAN_TRACE_FUNC(PAN_TRACE_LIB_DESC);
|
||||
|
||||
assert(fb->rt_count <= 1);
|
||||
|
|
|
|||
|
|
@ -196,18 +196,22 @@ pan_wls_adjust_size(unsigned wls_size)
|
|||
|
||||
static inline unsigned
|
||||
pan_calc_workgroups_per_task(const struct pan_compute_dim *shader_local_size,
|
||||
const struct pan_kmod_dev_props *props)
|
||||
const struct pan_kmod_dev_props *props,
|
||||
unsigned work_reg_count)
|
||||
{
|
||||
/* Each shader core can run N tasks and a total of M threads at any single
|
||||
* time, thus each task should ideally have no more than M/N threads. */
|
||||
unsigned max_threads_per_task =
|
||||
props->max_threads_per_core / props->max_tasks_per_core;
|
||||
|
||||
ASSERTED unsigned max_threads_per_wg =
|
||||
pan_compute_max_thread_count(props, work_reg_count);
|
||||
|
||||
/* To achieve the best utilization, we should aim for as many workgroups
|
||||
* per tasks as we can fit without exceeding the above thread limit */
|
||||
unsigned threads_per_wg =
|
||||
shader_local_size->x * shader_local_size->y * shader_local_size->z;
|
||||
assert(threads_per_wg > 0 && threads_per_wg <= props->max_threads_per_wg);
|
||||
assert(threads_per_wg > 0 && threads_per_wg <= max_threads_per_wg);
|
||||
unsigned wg_per_task = DIV_ROUND_UP(max_threads_per_task, threads_per_wg);
|
||||
assert(wg_per_task > 0 && wg_per_task <= max_threads_per_task);
|
||||
|
||||
|
|
@ -217,14 +221,15 @@ pan_calc_workgroups_per_task(const struct pan_compute_dim *shader_local_size,
|
|||
static inline unsigned
|
||||
pan_calc_wls_instances(const struct pan_compute_dim *shader_local_size,
|
||||
const struct pan_kmod_dev_props *props,
|
||||
const struct pan_compute_dim *dim)
|
||||
const struct pan_compute_dim *dim,
|
||||
unsigned work_reg_count)
|
||||
{
|
||||
/* NOTE: If the instance count is lower than the number of workgroups
|
||||
* being dispatched, the HW will hold back workgroups until instances
|
||||
* can be reused. */
|
||||
unsigned instances;
|
||||
unsigned wg_per_task =
|
||||
pan_calc_workgroups_per_task(shader_local_size, props);
|
||||
pan_calc_workgroups_per_task(shader_local_size, props, work_reg_count);
|
||||
unsigned max_instances_per_core =
|
||||
util_next_power_of_two(wg_per_task * props->max_tasks_per_core);
|
||||
|
||||
|
|
@ -341,7 +346,7 @@ void GENX(pan_emit_afrc_color_attachment)(const struct pan_attachment_info *att,
|
|||
unsigned GENX(pan_emit_fbd)(const struct pan_fb_info *fb, unsigned layer_idx,
|
||||
const struct pan_tls_info *tls,
|
||||
const struct pan_tiler_context *tiler_ctx,
|
||||
void *out);
|
||||
const struct pan_ptr framebuffer);
|
||||
|
||||
#if PAN_ARCH >= 6
|
||||
unsigned GENX(pan_select_tiler_hierarchy_mask)(uint32_t width, uint32_t height,
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
/*
|
||||
* Copyright (C) 2026 Collabora, Ltd.
|
||||
* Copyright (C) 2026 Arm Ltd.
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
#include "pan_fb.h"
|
||||
|
|
@ -669,9 +670,124 @@ pan_fix_frame_shader_mode(enum mali_pre_post_frame_shader_mode mode,
|
|||
}
|
||||
#endif
|
||||
|
||||
#if PAN_ARCH >= 14
|
||||
uint32_t
|
||||
GENX(pan_emit_fb_desc)(const struct pan_fb_desc_info *info, void *out)
|
||||
GENX(pan_emit_fb_desc)(const struct pan_fb_desc_info *info,
|
||||
const struct pan_ptr framebuffer)
|
||||
{
|
||||
/* Emit the dynamic framebuffer state. That is, state that may change per-layer. */
|
||||
|
||||
void *out = framebuffer.cpu;
|
||||
const struct pan_fb_layout *fb = info->fb;
|
||||
const struct pan_fb_load *load = info->load;
|
||||
const struct pan_fb_store *store = info->store;
|
||||
const struct pan_fb_clean_tile ct = pan_fb_get_clean_tile(info);
|
||||
const bool has_zs_crc_ext = pan_fb_has_zs(fb);
|
||||
|
||||
struct pan_fbd_layer fbd_data = {0};
|
||||
fbd_data.tiler = info->tiler_ctx->valhall.desc;
|
||||
|
||||
/* layer_index in flags0 is used to select the right primitive list in
|
||||
* the tiler context, and frame_arg is the value that's passed to the
|
||||
* fragment shader through r62-r63, which we use to pass gl_Layer. Since
|
||||
* the layer_idx only takes 8-bits, we might use the extra 56-bits we
|
||||
* have in frame_argument to pass other information to the fragment
|
||||
* shader at some point.
|
||||
*/
|
||||
assert(info->layer >= info->tiler_ctx->valhall.layer_offset);
|
||||
fbd_data.frame_argument = info->layer;
|
||||
|
||||
pan_pack(&fbd_data.flags0, FRAGMENT_FLAGS_0, cfg) {
|
||||
cfg.pre_frame_0 = pan_fix_frame_shader_mode(info->frame_shaders.modes[0],
|
||||
ct.rts || ct.zs || ct.s);
|
||||
cfg.pre_frame_1 = pan_fix_frame_shader_mode(info->frame_shaders.modes[1],
|
||||
ct.rts || ct.zs || ct.s);
|
||||
cfg.post_frame = info->frame_shaders.modes[2];
|
||||
|
||||
/* Enabling prepass without pipelineing is generally not good for
|
||||
* performance, so disable HSR in that case.
|
||||
*/
|
||||
cfg.hsr_prepass_enable = info->allow_hsr_prepass &&
|
||||
pan_fb_can_pipeline_zs(fb);
|
||||
cfg.hsr_prepass_interleaving_enable = pan_fb_can_pipeline_zs(fb);
|
||||
cfg.hsr_prepass_filter_enable = true;
|
||||
cfg.hsr_hierarchical_optimizations_enable = true;
|
||||
|
||||
cfg.internal_layer_index =
|
||||
info->layer - info->tiler_ctx->valhall.layer_offset;
|
||||
}
|
||||
|
||||
pan_pack(&fbd_data.flags2, FRAGMENT_FLAGS_2, cfg) {
|
||||
if (fb->s_format != PIPE_FORMAT_NONE) {
|
||||
cfg.s_clear = load && target_has_clear(&load->s) ?
|
||||
load->s.clear.stencil : 0;
|
||||
cfg.s_write_enable = store && store->s.store;
|
||||
}
|
||||
|
||||
if (fb->z_format != PIPE_FORMAT_NONE) {
|
||||
cfg.z_internal_format = pan_get_z_internal_format(fb->z_format);
|
||||
cfg.z_write_enable = store && store->zs.store;
|
||||
} else {
|
||||
cfg.z_internal_format = MALI_Z_INTERNAL_FORMAT_D24;
|
||||
assert(!store || !store->zs.store);
|
||||
}
|
||||
}
|
||||
|
||||
fbd_data.z_clear =
|
||||
util_bitpack_float(fb->z_format != PIPE_FORMAT_NONE && load && load &&
|
||||
target_has_clear(&load->z)
|
||||
? load->z.clear.depth
|
||||
: 0);
|
||||
|
||||
fbd_data.dcd_pointer = info->frame_shaders.dcd_pointer;
|
||||
|
||||
{
|
||||
/* Set the DBD and RTD pointers. Both must be 64-bytes aligned. */
|
||||
uint64_t out_gpu_addr =
|
||||
framebuffer.gpu + ALIGN_POT(sizeof(struct pan_fbd_layer), 64);
|
||||
|
||||
if (has_zs_crc_ext) {
|
||||
fbd_data.dbd_pointer = out_gpu_addr;
|
||||
assert(fbd_data.dbd_pointer % 64 == 0);
|
||||
out_gpu_addr += pan_size(ZS_CRC_EXTENSION);
|
||||
}
|
||||
|
||||
fbd_data.rtd_pointer = out_gpu_addr;
|
||||
assert(fbd_data.rtd_pointer % 64 == 0);
|
||||
}
|
||||
|
||||
memcpy(out, &fbd_data, sizeof(fbd_data));
|
||||
out += ALIGN_POT(sizeof(fbd_data), 64);
|
||||
|
||||
if (has_zs_crc_ext) {
|
||||
struct mali_zs_crc_extension_packed zs_crc;
|
||||
emit_zs_crc_desc(info, ct, &zs_crc);
|
||||
memcpy(out, &zs_crc, sizeof(zs_crc));
|
||||
out += sizeof(zs_crc);
|
||||
}
|
||||
|
||||
uint32_t tile_rt_offset_B = 0;
|
||||
for (unsigned rt = 0; rt < fb->rt_count; rt++) {
|
||||
struct mali_rgb_render_target_packed rgb_rt;
|
||||
emit_rgb_rt_desc(info, ct, rt, tile_rt_offset_B, &rgb_rt);
|
||||
memcpy(out, &rgb_rt, sizeof(rgb_rt));
|
||||
out += sizeof(rgb_rt);
|
||||
|
||||
if (fb->rt_formats[rt] != PIPE_FORMAT_NONE) {
|
||||
tile_rt_offset_B += pan_bytes_per_pixel_tib(fb->rt_formats[rt]) *
|
||||
fb->tile_size_px * fb->sample_count;
|
||||
}
|
||||
}
|
||||
assert(tile_rt_offset_B <= fb->tile_rt_alloc_B);
|
||||
|
||||
return 0;
|
||||
}
|
||||
#else /* PAN_ARCH < 14 */
|
||||
uint32_t
|
||||
GENX(pan_emit_fb_desc)(const struct pan_fb_desc_info *info,
|
||||
const struct pan_ptr framebuffer)
|
||||
{
|
||||
void *out = framebuffer.cpu;
|
||||
const struct pan_fb_layout *fb = info->fb;
|
||||
const struct pan_fb_load *load = info->load;
|
||||
const struct pan_fb_store *store = info->store;
|
||||
|
|
@ -823,4 +939,5 @@ GENX(pan_emit_fb_desc)(const struct pan_fb_desc_info *info, void *out)
|
|||
}
|
||||
return tag.opaque[0];
|
||||
}
|
||||
#endif
|
||||
#endif /* PAN_ARCH >= 14 */
|
||||
#endif /* PAN_ARCH >= 5 */
|
||||
|
|
|
|||
|
|
@ -1,14 +1,20 @@
|
|||
/*
|
||||
* Copyright (C) 2026 Collabora, Ltd.
|
||||
* Copyright (C) 2026 Arm Ltd.
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#ifndef __PAN_FB_H
|
||||
#define __PAN_FB_H
|
||||
|
||||
#if PAN_ARCH >= 14
|
||||
#include "genxml/cs_builder.h"
|
||||
#endif
|
||||
|
||||
#include "compiler/shader_enums.h"
|
||||
#include "genxml/gen_macros.h"
|
||||
#include "util/format/u_formats.h"
|
||||
#include "compiler/shader_enums.h"
|
||||
#include "pan_pool.h"
|
||||
|
||||
struct nir_shader;
|
||||
struct nir_shader_compiler_options;
|
||||
|
|
@ -481,7 +487,7 @@ void GENX(pan_fill_fb_info)(const struct pan_fb_desc_info *info,
|
|||
struct pan_fb_info *fbinfo);
|
||||
|
||||
uint32_t GENX(pan_emit_fb_desc)(const struct pan_fb_desc_info *info,
|
||||
void *out);
|
||||
const struct pan_ptr framebuffer);
|
||||
#endif
|
||||
|
||||
enum ENUM_PACKED pan_fb_shader_op {
|
||||
|
|
@ -620,4 +626,35 @@ GENX(pan_get_fb_shader)(const struct pan_fb_shader_key *key,
|
|||
const struct nir_shader_compiler_options *nir_options);
|
||||
#endif
|
||||
|
||||
#if PAN_ARCH >= 14
|
||||
/* Framebuffer per-layer state.
 *
 * GENX(pan_emit_fb_desc) copies one instance of this structure to the start
 * of a layer's framebuffer memory and writes the ZS_CRC_EXTENSION and
 * RENDER_TARGET descriptors right after it, at an offset of
 * ALIGN_POT(sizeof(struct pan_fbd_layer), 64). Keep this structure 64-byte
 * aligned, since we want those adjacent descriptors aligned.
 *
 * Fields are read back individually through offsetof(), so the member order
 * is part of the layout consumers rely on. */
struct pan_fbd_layer {
   /** GPU address to the tiler descriptor. */
   uint64_t tiler;

   /** Frame argument. pan_emit_fb_desc stores the layer index here. */
   uint64_t frame_argument;

   /** An instance of Fragment Flags 0. */
   struct mali_fragment_flags_0_packed flags0;

   /** An instance of Fragment Flags 2. */
   struct mali_fragment_flags_2_packed flags2;

   /** Z clear value, stored as the bit pattern of a float
    *  (util_bitpack_float); 0 when there is no depth clear. */
   uint32_t z_clear;

   /** GPU address to the draw call descriptors. It may be 0. */
   uint64_t dcd_pointer;

   /** GPU address to the ZS_CRC_EXTENSION descriptor. It may be 0. */
   uint64_t dbd_pointer;

   /** GPU address to the RENDER_TARGET descriptors. */
   uint64_t rtd_pointer;
} __attribute__((aligned(64)));
|
||||
#endif /* PAN_ARCH >= 14 */
|
||||
|
||||
#endif /* __PAN_FB_H */
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
/*
|
||||
* Copyright (C) 2019 Collabora, Ltd.
|
||||
* Copyright (C) 2026 Arm Ltd.
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
|
|
@ -184,7 +185,27 @@ const struct pan_blendable_format
|
|||
const struct pan_format GENX(pan_pipe_format)[PIPE_FORMAT_COUNT] = {
|
||||
FMT(NONE, CONSTANT, 0000, L, VTR_IB),
|
||||
|
||||
#if PAN_ARCH >= 7
|
||||
#if PAN_ARCH >= 14
|
||||
/* Multiplane formats */
|
||||
FMT_YUV(R8G8_R8B8_UNORM, Y8U8Y8V8_422, UVYA, NO_SWAP, CENTER_422, _T____),
|
||||
FMT_YUV(G8R8_B8R8_UNORM, U8Y8V8Y8_422, UYVA, SWAP, CENTER_422, _T____),
|
||||
FMT_YUV(R8B8_R8G8_UNORM, Y8U8Y8V8_422, VYUA, NO_SWAP, CENTER_422, _T____),
|
||||
FMT_YUV(B8R8_G8R8_UNORM, U8Y8V8Y8_422, VUYA, SWAP, CENTER_422, _T____),
|
||||
FMT_YUV(R8_G8B8_420_UNORM, Y8U8V8_420, YUVA, NO_SWAP, CENTER, _T____),
|
||||
FMT_YUV(R8_B8G8_420_UNORM, Y8U8V8_420, YVUA, NO_SWAP, CENTER, _T____),
|
||||
FMT_YUV(R8_G8_B8_420_UNORM, Y8U8V8_420, YUVA, NO_SWAP, CENTER, _T____),
|
||||
FMT_YUV(R8_B8_G8_420_UNORM, Y8U8V8_420, YVUA, NO_SWAP, CENTER, _T____),
|
||||
|
||||
FMT_YUV(R8_G8B8_422_UNORM, Y8U8Y8V8_422, YUVA, NO_SWAP, CENTER_422, _T____),
|
||||
FMT_YUV(R8_B8G8_422_UNORM, U8Y8V8Y8_422, YVUA, NO_SWAP, CENTER_422, _T____),
|
||||
|
||||
FMT_YUV(R10_G10B10_420_UNORM, YUYAAYVYAA_420, YUVA, NO_SWAP, CENTER, _T____),
|
||||
FMT_YUV(R10_G10B10_422_UNORM, Y10X6U10X6Y10X6V10X6_422, YUVA, NO_SWAP, CENTER_422, _T____),
|
||||
/* special internal formats */
|
||||
FMT_YUV(R8G8B8_420_UNORM_PACKED, Y8U8V8_420, YUVA, NO_SWAP, CENTER, _T____),
|
||||
FMT_YUV(R10G10B10_420_UNORM_PACKED, Y10U10V10_420, YUVA, NO_SWAP, CENTER, _T____),
|
||||
FMT_YUV(X6R10X6G10_X6R10X6B10_422_UNORM, Y10X6U10X6Y10X6V10X6_422, UVYA, NO_SWAP, CENTER_422, _T____),
|
||||
#elif PAN_ARCH >= 7
|
||||
/* Multiplane formats */
|
||||
FMT_YUV(R8G8_R8B8_UNORM, YUYV8, UVYA, NO_SWAP, CENTER_422, _T____),
|
||||
FMT_YUV(G8R8_B8R8_UNORM, VYUY8, UYVA, SWAP, CENTER_422, _T____),
|
||||
|
|
|
|||
|
|
@ -168,6 +168,10 @@ extern const struct pan_blendable_format
|
|||
pan_blendable_formats_v12[PIPE_FORMAT_COUNT];
|
||||
extern const struct pan_blendable_format
|
||||
pan_blendable_formats_v13[PIPE_FORMAT_COUNT];
|
||||
extern const struct pan_blendable_format
|
||||
pan_blendable_formats_v14[PIPE_FORMAT_COUNT];
|
||||
extern const struct pan_blendable_format
|
||||
pan_blendable_formats_v15[PIPE_FORMAT_COUNT];
|
||||
|
||||
uint8_t pan_raw_format_mask_midgard(enum pipe_format *formats);
|
||||
|
||||
|
|
@ -184,6 +188,8 @@ pan_blendable_format_table(unsigned arch)
|
|||
FMT_TABLE(10);
|
||||
FMT_TABLE(12);
|
||||
FMT_TABLE(13);
|
||||
FMT_TABLE(14);
|
||||
FMT_TABLE(15);
|
||||
#undef FMT_TABLE
|
||||
default:
|
||||
assert(!"Unsupported architecture");
|
||||
|
|
@ -199,6 +205,8 @@ extern const struct pan_format pan_pipe_format_v9[PIPE_FORMAT_COUNT];
|
|||
extern const struct pan_format pan_pipe_format_v10[PIPE_FORMAT_COUNT];
|
||||
extern const struct pan_format pan_pipe_format_v12[PIPE_FORMAT_COUNT];
|
||||
extern const struct pan_format pan_pipe_format_v13[PIPE_FORMAT_COUNT];
|
||||
extern const struct pan_format pan_pipe_format_v14[PIPE_FORMAT_COUNT];
|
||||
extern const struct pan_format pan_pipe_format_v15[PIPE_FORMAT_COUNT];
|
||||
|
||||
static inline const struct pan_format *
|
||||
pan_format_table(unsigned arch)
|
||||
|
|
@ -213,6 +221,8 @@ pan_format_table(unsigned arch)
|
|||
FMT_TABLE(10);
|
||||
FMT_TABLE(12);
|
||||
FMT_TABLE(13);
|
||||
FMT_TABLE(14);
|
||||
FMT_TABLE(15);
|
||||
#undef FMT_TABLE
|
||||
default:
|
||||
assert(!"Unsupported architecture");
|
||||
|
|
|
|||
|
|
@ -84,6 +84,8 @@ const struct pan_mod_handler *pan_mod_get_handler_v9(uint64_t modifier);
|
|||
const struct pan_mod_handler *pan_mod_get_handler_v10(uint64_t modifier);
|
||||
const struct pan_mod_handler *pan_mod_get_handler_v12(uint64_t modifier);
|
||||
const struct pan_mod_handler *pan_mod_get_handler_v13(uint64_t modifier);
|
||||
const struct pan_mod_handler *pan_mod_get_handler_v14(uint64_t modifier);
|
||||
const struct pan_mod_handler *pan_mod_get_handler_v15(uint64_t modifier);
|
||||
|
||||
static inline const struct pan_mod_handler *
|
||||
pan_mod_get_handler(unsigned arch, uint64_t modifier)
|
||||
|
|
@ -105,6 +107,10 @@ pan_mod_get_handler(unsigned arch, uint64_t modifier)
|
|||
return pan_mod_get_handler_v12(modifier);
|
||||
case 13:
|
||||
return pan_mod_get_handler_v13(modifier);
|
||||
case 14:
|
||||
return pan_mod_get_handler_v14(modifier);
|
||||
case 15:
|
||||
return pan_mod_get_handler_v15(modifier);
|
||||
default:
|
||||
UNREACHABLE("Unsupported arch");
|
||||
}
|
||||
|
|
|
|||
|
|
@ -70,6 +70,15 @@ pan_compute_max_thread_count(const struct pan_kmod_dev_props *props,
|
|||
aligned_reg_count = work_reg_count <= 32 ? 32 : 64;
|
||||
}
|
||||
|
||||
if (pan_arch(props->gpu_id) >= 15) {
|
||||
assert(props->num_threads_active_granularity);
|
||||
unsigned max_treads_per_wg =
|
||||
ROUND_DOWN_TO(props->num_registers_per_core / aligned_reg_count,
|
||||
props->num_threads_active_granularity);
|
||||
return MIN2(max_treads_per_wg, props->max_threads_per_core);
|
||||
}
|
||||
|
||||
assert(props->max_threads_per_wg);
|
||||
return MIN3(props->max_threads_per_wg, props->max_threads_per_core,
|
||||
props->num_registers_per_core / aligned_reg_count);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -223,6 +223,25 @@ pan_clump_format(enum pipe_format format)
|
|||
/* YUV-sampling has special cases */
|
||||
if (pan_format_is_yuv(format)) {
|
||||
switch (format) {
|
||||
#if PAN_ARCH >= 14
|
||||
case PIPE_FORMAT_R8G8_R8B8_UNORM:
|
||||
case PIPE_FORMAT_G8R8_B8R8_UNORM:
|
||||
case PIPE_FORMAT_R8B8_R8G8_UNORM:
|
||||
case PIPE_FORMAT_B8R8_G8R8_UNORM:
|
||||
case PIPE_FORMAT_R8_G8B8_422_UNORM:
|
||||
case PIPE_FORMAT_R8_B8G8_422_UNORM:
|
||||
case PIPE_FORMAT_R8_G8B8_420_UNORM:
|
||||
case PIPE_FORMAT_R8_B8G8_420_UNORM:
|
||||
case PIPE_FORMAT_R8_G8_B8_420_UNORM:
|
||||
case PIPE_FORMAT_R8_B8_G8_420_UNORM:
|
||||
case PIPE_FORMAT_R8G8B8_420_UNORM_PACKED:
|
||||
return MALI_CLUMP_FORMAT_RAW8;
|
||||
case PIPE_FORMAT_R10_G10B10_420_UNORM:
|
||||
case PIPE_FORMAT_R10G10B10_420_UNORM_PACKED:
|
||||
case PIPE_FORMAT_R10_G10B10_422_UNORM:
|
||||
case PIPE_FORMAT_X6R10X6G10_X6R10X6B10_422_UNORM:
|
||||
return MALI_CLUMP_FORMAT_R10_PACKED;
|
||||
#else
|
||||
case PIPE_FORMAT_R8G8_R8B8_UNORM:
|
||||
case PIPE_FORMAT_G8R8_B8R8_UNORM:
|
||||
case PIPE_FORMAT_R8B8_R8G8_UNORM:
|
||||
|
|
@ -242,6 +261,7 @@ pan_clump_format(enum pipe_format format)
|
|||
case PIPE_FORMAT_R10_G10B10_422_UNORM:
|
||||
case PIPE_FORMAT_X6R10X6G10_X6R10X6B10_422_UNORM:
|
||||
return MALI_CLUMP_FORMAT_Y10_UV10_422;
|
||||
#endif /* PAN_ARCH >= 14 */
|
||||
default:
|
||||
UNREACHABLE("unhandled clump format");
|
||||
}
|
||||
|
|
|
|||
|
|
@ -28,6 +28,10 @@
|
|||
#include "libpan_v12.h"
|
||||
#elif (PAN_ARCH == 13)
|
||||
#include "libpan_v13.h"
|
||||
#elif (PAN_ARCH == 14)
|
||||
#include "libpan_v14.h"
|
||||
#elif (PAN_ARCH == 15)
|
||||
#include "libpan_v15.h"
|
||||
#else
|
||||
#error "Unsupported architecture for libpan"
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -26,6 +26,10 @@
|
|||
#include "libpan_shaders_v12.h"
|
||||
#elif (PAN_ARCH == 13)
|
||||
#include "libpan_shaders_v13.h"
|
||||
#elif (PAN_ARCH == 14)
|
||||
#include "libpan_shaders_v14.h"
|
||||
#elif (PAN_ARCH == 15)
|
||||
#include "libpan_shaders_v15.h"
|
||||
#else
|
||||
#error "Unsupported architecture for libpan"
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@ libpan_shader_files = files(
|
|||
|
||||
idep_libpan_per_arch = {}
|
||||
|
||||
foreach ver : ['4', '5', '6', '7', '9', '10', '12', '13']
|
||||
foreach ver : ['4', '5', '6', '7', '9', '10', '12', '13', '14', '15']
|
||||
libpan_spv = custom_target(
|
||||
input : libpan_shader_files,
|
||||
output : 'libpan_v' + ver + '.spv',
|
||||
|
|
|
|||
|
|
@ -95,6 +95,14 @@ const struct pan_model pan_model_list[] = {
|
|||
MODEL_RATES(4, 8, 128)),
|
||||
FIFTHGEN_MODEL(PAN_PROD_ID(13, 8, 0), 4, "G725", "TKRx", MODEL_ANISO(ALL), MODEL_TB_SIZES(65536, 65536),
|
||||
MODEL_RATES(4, 8, 128)),
|
||||
FIFTHGEN_MODEL(PAN_PROD_ID(14, 8, 3), 1, "G1-Pro", "TDRx", MODEL_ANISO(ALL), MODEL_TB_SIZES(65536, 65536),
|
||||
MODEL_RATES(4, 8, 64)),
|
||||
FIFTHGEN_MODEL(PAN_PROD_ID(14, 8, 3), 4, "G1-Pro", "TDRx", MODEL_ANISO(ALL), MODEL_TB_SIZES(65536, 65536),
|
||||
MODEL_RATES(4, 8, 128)),
|
||||
FIFTHGEN_MODEL(PAN_PROD_ID(15, 8, 3), 0, "TMAx", "TMAx", MODEL_ANISO(ALL), MODEL_TB_SIZES(65536, 65536),
|
||||
MODEL_RATES(4, 8, 64)),
|
||||
FIFTHGEN_MODEL(PAN_PROD_ID(15, 8, 3), 4, "TMAx", "TMAx", MODEL_ANISO(ALL), MODEL_TB_SIZES(65536, 65536),
|
||||
MODEL_RATES(4, 8, 128)),
|
||||
};
|
||||
/* clang-format on */
|
||||
|
||||
|
|
|
|||
|
|
@ -31,6 +31,15 @@ struct pan_tiler_features {
|
|||
#define PAN_VERSION_MINOR(x) (((x) & BITFIELD_RANGE(4, 8)) >> 4)
|
||||
#define PAN_VERSION_STATUS(x) ((x) & BITFIELD_RANGE(0, 4))
|
||||
|
||||
/* Accessors for the 64-bit GPU ID layout.
 *
 * PAN_ID64_COMPAT is the value pan_arch() compares the legacy
 * PAN_ARCH_MAJOR() field against: when they match, the ID is decoded with
 * the PAN_ID64_* accessors below instead of the legacy ones.
 *
 * NOTE(review): the bit positions below assume BITFIELD64_RANGE(b, count)
 * selects `count` bits starting at bit `b` -- confirm against its
 * definition. */
#define PAN_ID64_COMPAT 0xFull
/* Architecture major: bits 56..63. */
#define PAN_ID64_ARCH_MAJOR(x) (((x) & BITFIELD64_RANGE(56, 8)) >> 56)
/* Architecture minor: bits 48..55. */
#define PAN_ID64_ARCH_MINOR(x) (((x) & BITFIELD64_RANGE(48, 8)) >> 48)
/* Architecture revision: bits 40..47. */
#define PAN_ID64_ARCH_REV(x) (((x) & BITFIELD64_RANGE(40, 8)) >> 40)
/* Product major: bits 32..39. */
#define PAN_ID64_PRODUCT_MAJOR(x) (((x) & BITFIELD64_RANGE(32, 8)) >> 32)
/* Version major: bits 16..23. */
#define PAN_ID64_VERSION_MAJOR(x) (((x) & BITFIELD64_RANGE(16, 8)) >> 16)
/* Version minor: bits 8..15. */
#define PAN_ID64_VERSION_MINOR(x) (((x) & BITFIELD64_RANGE(8, 8)) >> 8)
/* Version status: bits 0..7 (no shift needed). */
#define PAN_ID64_VERSION_STATUS(x) ((x) & BITFIELD64_RANGE(0, 8))
|
||||
|
||||
/* GPU product id for Midgard */
|
||||
#define MIDGARD_PROD_ID(x) (((x) & BITFIELD_RANGE(16, 16)) >> 16)
|
||||
|
||||
|
|
@ -108,8 +117,12 @@ pan_arch(uint64_t gpu_id)
|
|||
case 0x860:
|
||||
case 0x880:
|
||||
return 5;
|
||||
default:
|
||||
return PAN_ARCH_MAJOR(gpu_id);
|
||||
default: {
|
||||
unsigned gpu_arch = PAN_ARCH_MAJOR(gpu_id);
|
||||
if (gpu_arch == PAN_ID64_COMPAT)
|
||||
return PAN_ID64_ARCH_MAJOR(gpu_id);
|
||||
return gpu_arch;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -119,14 +132,21 @@ pan_prod_id(uint64_t gpu_id)
|
|||
unsigned arch = pan_arch(gpu_id);
|
||||
if (arch < 6)
|
||||
return MIDGARD_PROD_ID(gpu_id);
|
||||
return PAN_PROD_ID(PAN_ARCH_MAJOR(gpu_id), PAN_ARCH_MINOR(gpu_id),
|
||||
PAN_PRODUCT_MAJOR(gpu_id));
|
||||
else if (arch < PAN_ID64_COMPAT)
|
||||
return PAN_PROD_ID(PAN_ARCH_MAJOR(gpu_id), PAN_ARCH_MINOR(gpu_id),
|
||||
PAN_PRODUCT_MAJOR(gpu_id));
|
||||
return PAN_PROD_ID(PAN_ID64_ARCH_MAJOR(gpu_id), PAN_ID64_ARCH_MINOR(gpu_id),
|
||||
PAN_ID64_PRODUCT_MAJOR(gpu_id));
|
||||
}
|
||||
|
||||
static inline uint32_t
|
||||
pan_rev(uint64_t gpu_id)
|
||||
{
|
||||
return PAN_REV(PAN_VERSION_MAJOR(gpu_id), PAN_VERSION_MINOR(gpu_id));
|
||||
unsigned arch = pan_arch(gpu_id);
|
||||
if (arch < PAN_ID64_COMPAT)
|
||||
return PAN_REV(PAN_VERSION_MAJOR(gpu_id), PAN_VERSION_MINOR(gpu_id));
|
||||
return PAN_REV(PAN_ID64_VERSION_MAJOR(gpu_id),
|
||||
PAN_ID64_VERSION_MINOR(gpu_id));
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -74,7 +74,11 @@ static inline uint32_t
|
|||
get_fbd_size(bool has_zs_ext, uint32_t rt_count)
|
||||
{
|
||||
assert(rt_count >= 1 && rt_count <= MAX_RTS);
|
||||
#if PAN_ARCH >= 14
|
||||
uint32_t fbd_size = ALIGN_POT(sizeof(struct pan_fbd_layer), 64);
|
||||
#else
|
||||
uint32_t fbd_size = pan_size(FRAMEBUFFER);
|
||||
#endif
|
||||
if (has_zs_ext)
|
||||
fbd_size += pan_size(ZS_CRC_EXTENSION);
|
||||
fbd_size += pan_size(RENDER_TARGET) * rt_count;
|
||||
|
|
@ -209,13 +213,27 @@ enum panvk_cs_regs {
|
|||
PANVK_CS_REG_RUN_IDVS_SR_END = 60,
|
||||
#endif
|
||||
|
||||
#if PAN_ARCH >= 14
|
||||
/* RUN_FRAGMENT2 staging regs.
|
||||
* SW ABI:
|
||||
* - r54:55 contain the pointer to the current FBD layer state.
|
||||
* - r58:59 contain the pointer to the first tiler descriptor. This is
|
||||
* needed to gather completed heap chunks after a run_fragment2.
|
||||
*/
|
||||
PANVK_CS_REG_RUN_FRAGMENT_SR_START = 0,
|
||||
PANVK_CS_REG_RUN_FRAGMENT_SR_END = 55,
|
||||
PANVK_CS_REG_FBD_LAYER_PTR = 54,
|
||||
PANVK_CS_REG_TILER_DESC_PTR = 58,
|
||||
#else
|
||||
/* RUN_FRAGMENT staging regs.
|
||||
* SW ABI:
|
||||
* - r38:39 contain the pointer to the first tiler descriptor. This is
|
||||
* - r58:59 contain the pointer to the first tiler descriptor. This is
|
||||
* needed to gather completed heap chunks after a run_fragment.
|
||||
*/
|
||||
PANVK_CS_REG_RUN_FRAGMENT_SR_START = 38,
|
||||
PANVK_CS_REG_RUN_FRAGMENT_SR_END = 46,
|
||||
PANVK_CS_REG_TILER_DESC_PTR = 58,
|
||||
#endif
|
||||
|
||||
/* RUN_COMPUTE staging regs. */
|
||||
PANVK_CS_REG_RUN_COMPUTE_SR_START = 0,
|
||||
|
|
@ -870,4 +888,31 @@ vk_stages_to_subqueue_mask(VkPipelineStageFlags2 vk_stages,
|
|||
void panvk_per_arch(emit_barrier)(struct panvk_cmd_buffer *cmdbuf,
|
||||
struct panvk_cs_deps deps);
|
||||
|
||||
#if PAN_ARCH >= 14
|
||||
static inline void
|
||||
cs_emit_layer_fragment_state(struct cs_builder *b, struct cs_index fbd_ptr)
|
||||
{
|
||||
/* Emit the dynamic fragment state. This state may change per-layer. */
|
||||
|
||||
cs_load32_to(b, cs_sr_reg32(b, FRAGMENT, FLAGS_0), fbd_ptr,
|
||||
offsetof(struct pan_fbd_layer, flags0));
|
||||
cs_load32_to(b, cs_sr_reg32(b, FRAGMENT, FLAGS_2), fbd_ptr,
|
||||
offsetof(struct pan_fbd_layer, flags2));
|
||||
cs_load32_to(b, cs_sr_reg32(b, FRAGMENT, Z_CLEAR), fbd_ptr,
|
||||
offsetof(struct pan_fbd_layer, z_clear));
|
||||
cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, TILER_DESCRIPTOR_POINTER), fbd_ptr,
|
||||
offsetof(struct pan_fbd_layer, tiler));
|
||||
cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, RTD_POINTER), fbd_ptr,
|
||||
offsetof(struct pan_fbd_layer, rtd_pointer));
|
||||
cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, DBD_POINTER), fbd_ptr,
|
||||
offsetof(struct pan_fbd_layer, dbd_pointer));
|
||||
cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, FRAME_ARG), fbd_ptr,
|
||||
offsetof(struct pan_fbd_layer, frame_argument));
|
||||
cs_load64_to(b, cs_sr_reg64(b, FRAGMENT, FRAME_SHADER_DCD_POINTER), fbd_ptr,
|
||||
offsetof(struct pan_fbd_layer, dcd_pointer));
|
||||
|
||||
cs_flush_loads(b);
|
||||
}
|
||||
#endif /* PAN_ARCH >= 14 */
|
||||
|
||||
#endif /* PANVK_CMD_BUFFER_H */
|
||||
|
|
|
|||
|
|
@ -89,8 +89,9 @@ panvk_per_arch(cmd_dispatch_prepare_tls)(
|
|||
unsigned core_id_range;
|
||||
pan_query_core_count(&phys_dev->kmod.dev->props, &core_id_range);
|
||||
|
||||
tlsinfo.wls.instances = pan_calc_wls_instances(
|
||||
&cs->cs.local_size, &phys_dev->kmod.dev->props, indirect ? NULL : dim);
|
||||
tlsinfo.wls.instances =
|
||||
pan_calc_wls_instances(&cs->cs.local_size, &phys_dev->kmod.dev->props,
|
||||
indirect ? NULL : dim, cs->info.work_reg_count);
|
||||
|
||||
unsigned wls_total_size = pan_calc_total_wls_size(
|
||||
tlsinfo.wls.size, tlsinfo.wls.instances, core_id_range);
|
||||
|
|
@ -156,7 +157,8 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
|
|||
unsigned wg_per_task = 0;
|
||||
if (indirect)
|
||||
wg_per_task = pan_calc_workgroups_per_task(&cs->cs.local_size,
|
||||
&phys_dev->kmod.dev->props);
|
||||
&phys_dev->kmod.dev->props,
|
||||
cs->info.work_reg_count);
|
||||
|
||||
if (compute_state_dirty(cmdbuf, DESC_STATE) ||
|
||||
compute_state_dirty(cmdbuf, CS)) {
|
||||
|
|
@ -207,9 +209,20 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
|
|||
cs_move64_to(b, cs_reg64(b, PANVK_COMPUTE_FAU), fau_ptr);
|
||||
}
|
||||
|
||||
if (compute_state_dirty(cmdbuf, CS))
|
||||
if (compute_state_dirty(cmdbuf, CS)) {
|
||||
#if PAN_ARCH >= 15
|
||||
struct mali_shader_program_pointer_packed spp;
|
||||
pan_pack(&spp, SHADER_PROGRAM_POINTER, ctx) {
|
||||
ctx.register_count = cs->info.work_reg_count;
|
||||
ctx.pointer = panvk_priv_mem_dev_addr(cs->spd);
|
||||
}
|
||||
uint64_t ptr = ((uint64_t)spp.opaque[1] << 32) | spp.opaque[0];
|
||||
cs_move64_to(b, cs_reg64(b, PANVK_COMPUTE_SPD), ptr);
|
||||
#else
|
||||
cs_move64_to(b, cs_reg64(b, PANVK_COMPUTE_SPD),
|
||||
panvk_priv_mem_dev_addr(cs->spd));
|
||||
#endif
|
||||
}
|
||||
|
||||
cs_move64_to(b, cs_reg64(b, PANVK_COMPUTE_TSD), tsd);
|
||||
|
||||
|
|
|
|||
|
|
@ -51,6 +51,7 @@
|
|||
#include "vk_render_pass.h"
|
||||
#include "poly/geometry.h"
|
||||
|
||||
#if PAN_ARCH < 14
|
||||
static enum cs_reg_perm
|
||||
provoking_vertex_fn_reg_perm_cb(struct cs_builder *b, unsigned reg)
|
||||
{
|
||||
|
|
@ -202,6 +203,7 @@ panvk_per_arch(device_draw_context_cleanup)(struct panvk_device *dev)
|
|||
panvk_priv_bo_unref(dev->draw_ctx->fns_bo);
|
||||
vk_free(&dev->vk.alloc, dev->draw_ctx);
|
||||
}
|
||||
#endif /* PAN_ARCH < 14 */
|
||||
|
||||
static void
|
||||
emit_vs_attrib(struct panvk_cmd_buffer *cmdbuf,
|
||||
|
|
@ -1245,8 +1247,13 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
|
|||
uint32_t fbd_sz = calc_fbd_size(cmdbuf);
|
||||
uint32_t fbds_sz = enabled_layer_count * fbd_sz;
|
||||
|
||||
cmdbuf->state.gfx.render.fbds = panvk_cmd_alloc_dev_mem(
|
||||
cmdbuf, desc, fbds_sz, pan_alignment(FRAMEBUFFER));
|
||||
#if PAN_ARCH >= 14
|
||||
const unsigned fbds_alignment = alignof(struct pan_fbd_layer);
|
||||
#else
|
||||
const unsigned fbds_alignment = pan_alignment(FRAMEBUFFER);
|
||||
#endif
|
||||
cmdbuf->state.gfx.render.fbds =
|
||||
panvk_cmd_alloc_dev_mem(cmdbuf, desc, fbds_sz, fbds_alignment);
|
||||
if (!cmdbuf->state.gfx.render.fbds.gpu)
|
||||
return VK_ERROR_OUT_OF_DEVICE_MEMORY;
|
||||
|
||||
|
|
@ -1316,14 +1323,23 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
|
|||
tiler_ctx = get_tiler_context(cmdbuf, layer_idx);
|
||||
|
||||
uint32_t new_fbd_flags =
|
||||
GENX(pan_emit_fb_desc)(&fbd_info, fbds.cpu + fbd_sz * i);
|
||||
GENX(pan_emit_fb_desc)(&fbd_info, pan_ptr_offset(fbds, fbd_sz * i));
|
||||
|
||||
/* Make sure all FBDs have the same flags. */
|
||||
assert(i == 0 || new_fbd_flags == fbd_flags);
|
||||
fbd_flags = new_fbd_flags;
|
||||
}
|
||||
|
||||
#if PAN_ARCH >= 14
|
||||
/* fbd_flags is unused on v14+. */
|
||||
assert(!fbd_flags);
|
||||
#endif
|
||||
|
||||
struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
|
||||
|
||||
#if PAN_ARCH >= 14
|
||||
// TODO: Implement IR support for v14.
|
||||
#else
|
||||
for (uint32_t ir_pass = 0; ir_pass < PANVK_IR_PASS_COUNT; ir_pass++) {
|
||||
struct pan_ptr ir_fbds = panvk_cmd_alloc_dev_mem(
|
||||
cmdbuf, desc, fbds_sz, pan_alignment(FRAMEBUFFER));
|
||||
|
|
@ -1335,7 +1351,6 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
|
|||
|
||||
for (uint32_t i = 0; i < enabled_layer_count; i++) {
|
||||
uint32_t layer_idx = multiview ? u_bit_scan(&ir_view_mask_temp) : i;
|
||||
void *ir_fbd = (void *)((uint8_t *)ir_fbds.cpu + (i * fbd_sz));
|
||||
|
||||
fbd_info.layer = layer_idx;
|
||||
tiler_ctx = get_tiler_context(cmdbuf, layer_idx);
|
||||
|
|
@ -1353,8 +1368,8 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
|
|||
if (result != VK_SUCCESS)
|
||||
return result;
|
||||
|
||||
ASSERTED uint32_t new_fbd_flags =
|
||||
GENX(pan_emit_fb_desc)(&fbd_info, ir_fbd);
|
||||
ASSERTED uint32_t new_fbd_flags = GENX(pan_emit_fb_desc)(
|
||||
&fbd_info, pan_ptr_offset(ir_fbds, fbd_sz * i));
|
||||
|
||||
/* Make sure all FBDs have the same flags. */
|
||||
assert(new_fbd_flags == fbd_flags);
|
||||
|
|
@ -1367,16 +1382,18 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
|
|||
|
||||
/* Wait for IR info push to complete */
|
||||
cs_wait_slot(b, SB_ID(LS));
|
||||
|
||||
bool unset_provoking_vertex =
|
||||
cmdbuf->state.gfx.render.first_provoking_vertex == U_TRISTATE_UNSET;
|
||||
#endif /* PAN_ARCH >= 14 */
|
||||
|
||||
if (copy_fbds) {
|
||||
struct cs_index cur_tiler = cs_reg64(b, 38);
|
||||
struct cs_index cur_tiler = cs_reg64(b, PANVK_CS_REG_TILER_DESC_PTR);
|
||||
#if PAN_ARCH >= 14
|
||||
struct cs_index dst_fbd_ptr = cs_reg64(b, PANVK_CS_REG_FBD_LAYER_PTR);
|
||||
#else
|
||||
struct cs_index dst_fbd_ptr = cs_sr_reg64(b, FRAGMENT, FBD_POINTER);
|
||||
struct cs_index fbd_idx = cs_reg32(b, 47);
|
||||
struct cs_index src_fbd_ptr = cs_reg64(b, 48);
|
||||
struct cs_index remaining_layers_in_td = cs_reg32(b, 50);
|
||||
#endif
|
||||
struct cs_index fbd_idx = cs_reg32(b, 60);
|
||||
struct cs_index src_fbd_ptr = cs_reg64(b, 64);
|
||||
struct cs_index remaining_layers_in_td = cs_reg32(b, 61);
|
||||
uint32_t td_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count,
|
||||
MAX_LAYERS_PER_TILER_DESC);
|
||||
|
||||
|
|
@ -1400,10 +1417,27 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
|
|||
* framebuffer size is aligned on 64-bytes. */
|
||||
assert(fbd_sz == ALIGN_POT(fbd_sz, 64));
|
||||
|
||||
#if PAN_ARCH >= 14
|
||||
for (uint32_t fbd_off = 0; fbd_off < fbd_sz; fbd_off += 64) {
|
||||
cs_load_to(b, cs_scratch_reg_tuple(b, 0, 16), src_fbd_ptr,
|
||||
BITFIELD_MASK(16), fbd_off);
|
||||
|
||||
/* Patch the Tiler pointer. */
|
||||
if (fbd_off == 0)
|
||||
cs_add64(b, cs_scratch_reg64(b, 0), cur_tiler, 0);
|
||||
|
||||
cs_store(b, cs_scratch_reg_tuple(b, 0, 16), dst_fbd_ptr,
|
||||
BITFIELD_MASK(16), fbd_off);
|
||||
}
|
||||
#else
|
||||
bool unset_provoking_vertex =
|
||||
cmdbuf->state.gfx.render.first_provoking_vertex == U_TRISTATE_UNSET;
|
||||
for (uint32_t fbd_off = 0; fbd_off < fbd_sz; fbd_off += 64) {
|
||||
if (fbd_off == 0) {
|
||||
cs_load_to(b, cs_scratch_reg_tuple(b, 0, 14), src_fbd_ptr,
|
||||
BITFIELD_MASK(14), fbd_off);
|
||||
|
||||
/* Patch the Tiler pointer. */
|
||||
cs_add64(b, cs_scratch_reg64(b, 14), cur_tiler, 0);
|
||||
|
||||
/* If we don't know what provoking vertex mode the
|
||||
|
|
@ -1423,6 +1457,7 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
|
|||
cs_store(b, cs_scratch_reg_tuple(b, 0, 16), dst_fbd_ptr,
|
||||
BITFIELD_MASK(16), fbd_off);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Finish stores to pass_dst_fbd_ptr. */
|
||||
cs_flush_stores(b);
|
||||
|
|
@ -1456,12 +1491,19 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
|
|||
-(full_td_count * pan_size(TILER_CONTEXT)));
|
||||
}
|
||||
} else {
|
||||
#if PAN_ARCH >= 14
|
||||
struct cs_index fbd_pointer = cs_reg64(b, PANVK_CS_REG_FBD_LAYER_PTR);
|
||||
#else
|
||||
struct cs_index fbd_pointer = cs_sr_reg64(b, FRAGMENT, FBD_POINTER);
|
||||
#endif
|
||||
|
||||
cs_update_frag_ctx(b) {
|
||||
cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, FBD_POINTER),
|
||||
fbds.gpu | fbd_flags);
|
||||
cs_move64_to(b, cs_reg64(b, 38), cmdbuf->state.gfx.render.tiler);
|
||||
cs_move64_to(b, fbd_pointer, fbds.gpu | fbd_flags);
|
||||
cs_move64_to(b, cs_reg64(b, PANVK_CS_REG_TILER_DESC_PTR),
|
||||
cmdbuf->state.gfx.render.tiler);
|
||||
}
|
||||
|
||||
#if PAN_ARCH < 14
|
||||
/* If we don't know what provoking vertex mode the application wants yet,
|
||||
* leave space to patch it later */
|
||||
if (cmdbuf->state.gfx.render.first_provoking_vertex == U_TRISTATE_UNSET) {
|
||||
|
|
@ -1483,6 +1525,7 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
|
|||
cs_maybe(b, &cmdbuf->state.gfx.render.maybe_set_fbds_provoking_vertex)
|
||||
cs_call(b, addr_reg, length_reg);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
return VK_SUCCESS;
|
||||
|
|
@ -3299,6 +3342,9 @@ calc_tiler_oom_handler_idx(struct panvk_cmd_buffer *cmdbuf)
|
|||
static void
|
||||
setup_tiler_oom_ctx(struct panvk_cmd_buffer *cmdbuf)
|
||||
{
|
||||
#if PAN_ARCH >= 14
|
||||
// TODO: Implement IR support for v14.
|
||||
#else
|
||||
struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
|
||||
const struct pan_fb_layout *fb = &cmdbuf->state.gfx.render.fb.layout;
|
||||
const bool has_zs_ext = pan_fb_has_zs(fb);
|
||||
|
|
@ -3343,6 +3389,7 @@ setup_tiler_oom_ctx(struct panvk_cmd_buffer *cmdbuf)
|
|||
TILER_OOM_CTX_FIELD_OFFSET(layer_count));
|
||||
|
||||
cs_flush_stores(b);
|
||||
#endif /* PAN_ARCH >= 14 */
|
||||
}
|
||||
|
||||
static uint32_t
|
||||
|
|
@ -3351,17 +3398,98 @@ pack_32_2x16(uint16_t lo, uint16_t hi)
|
|||
return (((uint32_t)hi) << 16) | (uint32_t)lo;
|
||||
}
|
||||
|
||||
#if PAN_ARCH >= 14
|
||||
static void
|
||||
cs_emit_static_fragment_state(struct cs_builder *b,
|
||||
struct panvk_cmd_buffer *cmdbuf)
|
||||
{
|
||||
/* Emit the static fragment staging registers. These don't change per-layer. */
|
||||
|
||||
const struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
|
||||
const struct panvk_rendering_state *render = &cmdbuf->state.gfx.render;
|
||||
const struct pan_fb_layout *fb = &render->fb.layout;
|
||||
|
||||
const uint8_t sample_count = render->fb.layout.sample_count;
|
||||
|
||||
const struct pan_fb_bbox fb_area_px =
|
||||
pan_fb_bbox_from_xywh(0, 0, fb->width_px, fb->height_px);
|
||||
const struct pan_fb_bbox bbox_px =
|
||||
pan_fb_bbox_clamp(fb->tiling_area_px, fb_area_px);
|
||||
|
||||
assert(pan_fb_bbox_is_valid(fb->tiling_area_px));
|
||||
|
||||
struct mali_fragment_bounding_box_packed bbox;
|
||||
pan_pack(&bbox, FRAGMENT_BOUNDING_BOX, cfg) {
|
||||
cfg.bound_min_x = bbox_px.min_x;
|
||||
cfg.bound_min_y = bbox_px.min_y;
|
||||
cfg.bound_max_x = bbox_px.max_x;
|
||||
cfg.bound_max_y = bbox_px.max_y;
|
||||
}
|
||||
|
||||
struct mali_frame_size_packed frame_size;
|
||||
pan_pack(&frame_size, FRAME_SIZE, cfg) {
|
||||
cfg.width = fb->width_px;
|
||||
cfg.height = fb->height_px;
|
||||
}
|
||||
|
||||
cs_move64_to(b, cs_sr_reg64(b, FRAGMENT, BOUNDING_BOX),
|
||||
bbox.opaque[0] | (uint64_t)bbox.opaque[1] << 32);
|
||||
cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, FRAME_SIZE), frame_size.opaque[0]);
|
||||
cs_move64_to(
|
||||
b, cs_sr_reg64(b, FRAGMENT, SAMPLE_POSITION_ARRAY_POINTER),
|
||||
dev->sample_positions->addr.dev +
|
||||
pan_sample_positions_offset(pan_sample_pattern(sample_count)));
|
||||
|
||||
/* Flags 1 */
|
||||
struct mali_fragment_flags_1_packed flags1;
|
||||
pan_pack(&flags1, FRAGMENT_FLAGS_1, cfg) {
|
||||
cfg.sample_count = fb->sample_count;
|
||||
cfg.sample_pattern = pan_sample_pattern(fb->sample_count);
|
||||
cfg.effective_tile_size = fb->tile_size_px;
|
||||
cfg.point_sprite_coord_origin_max_y = false;
|
||||
cfg.first_provoking_vertex = get_first_provoking_vertex(cmdbuf);
|
||||
|
||||
assert(fb->rt_count > 0);
|
||||
cfg.render_target_count = fb->rt_count;
|
||||
cfg.color_buffer_allocation = fb->tile_rt_alloc_B;
|
||||
}
|
||||
cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, FLAGS_1), flags1.opaque[0]);
|
||||
|
||||
/* If we don't know what provoking vertex mode the application wants yet,
|
||||
* leave space to patch it later */
|
||||
if (cmdbuf->state.gfx.render.first_provoking_vertex == U_TRISTATE_UNSET) {
|
||||
cs_maybe(b, &cmdbuf->state.gfx.render.maybe_set_fbds_provoking_vertex)
|
||||
{
|
||||
/* provoking_vertex flag is bit 14 of Fragment Flags 1. */
|
||||
cs_add32(b, cs_sr_reg32(b, FRAGMENT, FLAGS_1),
|
||||
cs_sr_reg32(b, FRAGMENT, FLAGS_1), -(1 << 14));
|
||||
}
|
||||
}
|
||||
|
||||
/* Leave the remaining RUN_FRAGMENT2 staging registers as zero. */
|
||||
}
|
||||
#endif /* PAN_ARCH >= 14 */
|
||||
|
||||
static VkResult
|
||||
issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
|
||||
{
|
||||
#if PAN_ARCH < 14
|
||||
struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
|
||||
#endif
|
||||
const struct cs_tracing_ctx *tracing_ctx =
|
||||
&cmdbuf->state.cs[PANVK_SUBQUEUE_FRAGMENT].tracing;
|
||||
const struct pan_fb_layout *fb = &cmdbuf->state.gfx.render.fb.layout;
|
||||
struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
|
||||
bool has_oq_chain = cmdbuf->state.gfx.render.oq.chain != 0;
|
||||
|
||||
/* Now initialize the fragment bits. */
|
||||
#if PAN_ARCH >= 14
|
||||
struct cs_index fbd_pointer = cs_reg64(b, PANVK_CS_REG_FBD_LAYER_PTR);
|
||||
cs_update_frag_ctx(b) {
|
||||
cs_emit_static_fragment_state(b, cmdbuf);
|
||||
cs_emit_layer_fragment_state(b, fbd_pointer);
|
||||
}
|
||||
#else
|
||||
const struct pan_fb_layout *fb = &cmdbuf->state.gfx.render.fb.layout;
|
||||
cs_update_frag_ctx(b) {
|
||||
cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, BBOX_MIN),
|
||||
pack_32_2x16(fb->tiling_area_px.min_x,
|
||||
|
|
@ -3370,6 +3498,7 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
|
|||
pack_32_2x16(fb->tiling_area_px.max_x,
|
||||
fb->tiling_area_px.max_y));
|
||||
}
|
||||
#endif
|
||||
|
||||
bool simul_use =
|
||||
cmdbuf->flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
|
||||
|
|
@ -3401,6 +3530,9 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
|
|||
* state for this renderpass, so it's safe to enable. */
|
||||
struct cs_index addr_reg = cs_scratch_reg64(b, 0);
|
||||
struct cs_index length_reg = cs_scratch_reg32(b, 2);
|
||||
#if PAN_ARCH >= 14
|
||||
// TODO: Implement IR support for v14.
|
||||
#else
|
||||
uint32_t handler_idx = calc_tiler_oom_handler_idx(cmdbuf);
|
||||
uint64_t handler_addr = dev->tiler_oom.handlers_bo->addr.dev +
|
||||
handler_idx * dev->tiler_oom.handler_stride;
|
||||
|
|
@ -3408,6 +3540,7 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
|
|||
cs_move32_to(b, length_reg, dev->tiler_oom.handler_stride);
|
||||
cs_set_exception_handler(b, MALI_CS_EXCEPTION_TYPE_TILER_OOM, addr_reg,
|
||||
length_reg);
|
||||
#endif
|
||||
|
||||
/* Wait for the tiling to be done before submitting the fragment job. */
|
||||
wait_finish_tiling(cmdbuf);
|
||||
|
|
@ -3422,8 +3555,12 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
|
|||
* up. */
|
||||
cs_move64_to(b, addr_reg, 0);
|
||||
cs_move32_to(b, length_reg, 0);
|
||||
#if PAN_ARCH >= 14
|
||||
// TODO: Implement IR support for v14.
|
||||
#else
|
||||
cs_set_exception_handler(b, MALI_CS_EXCEPTION_TYPE_TILER_OOM, addr_reg,
|
||||
length_reg);
|
||||
#endif
|
||||
|
||||
/* Applications tend to forget to describe subpass dependencies, especially
|
||||
* when it comes to write -> read dependencies on attachments. The
|
||||
|
|
@ -3439,8 +3576,13 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
|
|||
}
|
||||
|
||||
if (cmdbuf->state.gfx.render.layer_count <= 1) {
|
||||
#if PAN_ARCH >= 14
|
||||
cs_trace_run_fragment2(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
|
||||
false, MALI_TILE_RENDER_ORDER_Z_ORDER);
|
||||
#else
|
||||
cs_trace_run_fragment(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
|
||||
false, MALI_TILE_RENDER_ORDER_Z_ORDER);
|
||||
#endif
|
||||
} else {
|
||||
struct cs_index run_fragment_regs = cs_scratch_reg_tuple(b, 0, 4);
|
||||
struct cs_index remaining_layers = cs_scratch_reg32(b, 4);
|
||||
|
|
@ -3449,12 +3591,18 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
|
|||
cs_while(b, MALI_CS_CONDITION_GREATER, remaining_layers) {
|
||||
cs_add32(b, remaining_layers, remaining_layers, -1);
|
||||
|
||||
#if PAN_ARCH >= 14
|
||||
cs_emit_layer_fragment_state(b, fbd_pointer);
|
||||
cs_trace_run_fragment2(b, tracing_ctx, run_fragment_regs, false,
|
||||
MALI_TILE_RENDER_ORDER_Z_ORDER);
|
||||
#else
|
||||
cs_trace_run_fragment(b, tracing_ctx, run_fragment_regs, false,
|
||||
MALI_TILE_RENDER_ORDER_Z_ORDER);
|
||||
struct cs_index fbd_pointer = cs_sr_reg64(b, FRAGMENT, FBD_POINTER);
|
||||
#endif
|
||||
|
||||
cs_update_frag_ctx(b)
|
||||
cs_add64(b, cs_sr_reg64(b, FRAGMENT, FBD_POINTER),
|
||||
cs_sr_reg64(b, FRAGMENT, FBD_POINTER), fbd_sz);
|
||||
cs_add64(b, fbd_pointer, fbd_pointer, fbd_sz);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -3468,8 +3616,8 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
|
|||
struct cs_index completed = cs_scratch_reg_tuple(b, 10, 4);
|
||||
struct cs_index completed_top = cs_scratch_reg64(b, 10);
|
||||
struct cs_index completed_bottom = cs_scratch_reg64(b, 12);
|
||||
struct cs_index cur_tiler = cs_reg64(b, 38);
|
||||
struct cs_index tiler_count = cs_reg32(b, 47);
|
||||
struct cs_index cur_tiler = cs_reg64(b, PANVK_CS_REG_TILER_DESC_PTR);
|
||||
struct cs_index tiler_count = cs_reg32(b, 60);
|
||||
struct cs_index oq_chain = cs_scratch_reg64(b, 10);
|
||||
struct cs_index oq_chain_lo = cs_scratch_reg32(b, 10);
|
||||
struct cs_index oq_syncobj = cs_scratch_reg64(b, 12);
|
||||
|
|
|
|||
|
|
@ -82,8 +82,18 @@ panvk_per_arch(dispatch_precomp)(struct panvk_precomp_ctx *ctx,
|
|||
uint64_t fau_ptr = push_uniforms.gpu | (fau_count << 56);
|
||||
cs_move64_to(b, cs_reg64(b, PANVK_PRECOMP_FAU), fau_ptr);
|
||||
|
||||
#if PAN_ARCH >= 15
|
||||
struct mali_shader_program_pointer_packed spp;
|
||||
pan_pack(&spp, SHADER_PROGRAM_POINTER, ctx) {
|
||||
ctx.register_count = shader->info.work_reg_count;
|
||||
ctx.pointer = panvk_priv_mem_dev_addr(shader->spd);
|
||||
}
|
||||
uint64_t ptr = ((uint64_t)spp.opaque[1] << 32) | spp.opaque[0];
|
||||
cs_move64_to(b, cs_reg64(b, PANVK_PRECOMP_SPD), ptr);
|
||||
#else
|
||||
cs_move64_to(b, cs_reg64(b, PANVK_PRECOMP_SPD),
|
||||
panvk_priv_mem_dev_addr(shader->spd));
|
||||
#endif
|
||||
|
||||
cs_move64_to(b, cs_reg64(b, PANVK_PRECOMP_TSD), tsd);
|
||||
|
||||
|
|
@ -155,7 +165,8 @@ panvk_per_arch(dispatch_precomp)(struct panvk_precomp_ctx *ctx,
|
|||
* increment/axis parameters requires knowledge of job dimensions, but
|
||||
* this is somewhat offset by run_compute being a native instruction. */
|
||||
task_increment = pan_calc_workgroups_per_task(
|
||||
&shader->cs.local_size, &phys_dev->kmod.dev->props);
|
||||
&shader->cs.local_size, &phys_dev->kmod.dev->props,
|
||||
shader->info.work_reg_count);
|
||||
} else {
|
||||
panvk_per_arch(calculate_task_axis_and_increment)(
|
||||
shader, phys_dev, &dim, &task_axis, &task_increment);
|
||||
|
|
|
|||
|
|
@ -13,8 +13,13 @@ tiler_oom_reg_perm_cb(struct cs_builder *b, unsigned reg)
|
|||
{
|
||||
switch (reg) {
|
||||
/* The bbox is set up by the fragment subqueue, we should not modify it. */
|
||||
#if PAN_ARCH >= 14
|
||||
case 28:
|
||||
case 29:
|
||||
#else
|
||||
case 42:
|
||||
case 43:
|
||||
#endif
|
||||
/* We should only load from the subqueue context. */
|
||||
case PANVK_CS_REG_SUBQUEUE_CTX_START:
|
||||
case PANVK_CS_REG_SUBQUEUE_CTX_END:
|
||||
|
|
@ -42,8 +47,14 @@ copy_fbd(struct cs_builder *b, bool has_zs_ext, uint32_t rt_count,
|
|||
cs_store(b, cs_scratch_reg_tuple(b, 0, 8), dst, BITFIELD_MASK(8),
|
||||
8 * sizeof(uint32_t));
|
||||
|
||||
#if PAN_ARCH >= 14
|
||||
const size_t fbd_size = ALIGN_POT(sizeof(struct pan_fbd_layer), 64);
|
||||
#else
|
||||
const size_t fbd_size = sizeof(struct mali_framebuffer_packed);
|
||||
#endif
|
||||
|
||||
if (has_zs_ext) {
|
||||
const uint16_t dbd_offset = sizeof(struct mali_framebuffer_packed);
|
||||
const uint16_t dbd_offset = fbd_size;
|
||||
|
||||
/* Copy the whole DBD. */
|
||||
cs_load_to(b, cs_scratch_reg_tuple(b, 0, 8), src_other,
|
||||
|
|
@ -57,8 +68,7 @@ copy_fbd(struct cs_builder *b, bool has_zs_ext, uint32_t rt_count,
|
|||
}
|
||||
|
||||
const uint16_t rts_offset =
|
||||
sizeof(struct mali_framebuffer_packed) +
|
||||
(has_zs_ext ? sizeof(struct mali_zs_crc_extension_packed) : 0);
|
||||
fbd_size + (has_zs_ext ? sizeof(struct mali_zs_crc_extension_packed) : 0);
|
||||
|
||||
for (uint32_t rt = 0; rt < rt_count; rt++) {
|
||||
const uint16_t rt_offset =
|
||||
|
|
@ -110,12 +120,14 @@ generate_tiler_oom_handler(struct panvk_device *dev,
|
|||
.tracebuf_addr_offset =
|
||||
offsetof(struct panvk_cs_subqueue_context, debug.tracebuf.cs),
|
||||
};
|
||||
struct mali_framebuffer_pointer_packed fb_tag;
|
||||
|
||||
#if PAN_ARCH < 14
|
||||
struct mali_framebuffer_pointer_packed fb_tag;
|
||||
pan_pack(&fb_tag, FRAMEBUFFER_POINTER, cfg) {
|
||||
cfg.zs_crc_extension_present = has_zs_ext;
|
||||
cfg.render_target_count = rt_count;
|
||||
}
|
||||
#endif
|
||||
|
||||
cs_function_def(&b, &handler, handler_ctx) {
|
||||
struct cs_index subqueue_ctx = cs_subqueue_ctx_reg(&b);
|
||||
|
|
@ -140,7 +152,7 @@ generate_tiler_oom_handler(struct panvk_device *dev,
|
|||
struct cs_index run_fragment_regs = cs_scratch_reg_tuple(&b, 0, 4);
|
||||
|
||||
/* The tiler pointer is pre-filled. */
|
||||
struct cs_index tiler_ptr = cs_reg64(&b, 38);
|
||||
struct cs_index tiler_ptr = cs_reg64(&b, PANVK_CS_REG_TILER_DESC_PTR);
|
||||
|
||||
cs_load64_to(&b, scratch_fbd_ptr_reg, subqueue_ctx,
|
||||
TILER_OOM_CTX_FIELD_OFFSET(ir_scratch_fbd_ptr));
|
||||
|
|
@ -175,12 +187,22 @@ generate_tiler_oom_handler(struct panvk_device *dev,
|
|||
/* Flush copies before the RUN_FRAGMENT. */
|
||||
cs_wait_slot(&b, SB_ID(LS));
|
||||
|
||||
#if PAN_ARCH >= 14
|
||||
/* Set FBD pointer to the scratch fbd */
|
||||
struct cs_index fbd_pointer = cs_reg64(&b, PANVK_CS_REG_FBD_LAYER_PTR);
|
||||
cs_add64(&b, fbd_pointer, scratch_fbd_ptr_reg, 0);
|
||||
cs_emit_layer_fragment_state(&b, fbd_pointer);
|
||||
|
||||
cs_trace_run_fragment2(&b, &tracing_ctx, run_fragment_regs, false,
|
||||
MALI_TILE_RENDER_ORDER_Z_ORDER);
|
||||
#else
|
||||
/* Set FBD pointer to the scratch fbd */
|
||||
cs_add64(&b, cs_sr_reg64(&b, FRAGMENT, FBD_POINTER),
|
||||
scratch_fbd_ptr_reg, fb_tag.opaque[0]);
|
||||
|
||||
cs_trace_run_fragment(&b, &tracing_ctx, run_fragment_regs, false,
|
||||
MALI_TILE_RENDER_ORDER_Z_ORDER);
|
||||
#endif
|
||||
|
||||
/* Serialize run fragments since we reuse FBD for the runs */
|
||||
cs_wait_slots(&b, dev->csf.sb.all_iters_mask);
|
||||
|
|
|
|||
|
|
@ -717,7 +717,12 @@ init_tiler(struct panvk_gpu_queue *queue)
|
|||
tiler_heap->chunk_size = phys_dev->csf.tiler.chunk_size;
|
||||
|
||||
alloc_info.size = get_fbd_size(true, MAX_RTS);
|
||||
alloc_info.alignment = pan_alignment(FRAMEBUFFER);
|
||||
#if PAN_ARCH >= 14
|
||||
const unsigned fbds_alignment = alignof(struct pan_fbd_layer);
|
||||
#else
|
||||
const unsigned fbds_alignment = pan_alignment(FRAMEBUFFER);
|
||||
#endif
|
||||
alloc_info.alignment = fbds_alignment;
|
||||
tiler_heap->oom_fbd = panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info);
|
||||
if (!panvk_priv_mem_check_alloc(tiler_heap->oom_fbd)) {
|
||||
result = panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY,
|
||||
|
|
|
|||
|
|
@ -181,7 +181,7 @@ panvk_per_arch(cmd_close_batch)(struct panvk_cmd_buffer *cmdbuf)
|
|||
fbd_info.layer = layer_id;
|
||||
fbd_info.frame_shaders = fs;
|
||||
fbd_info.frame_shaders.dcd_pointer += layer_id * 3 * pan_size(DRAW);
|
||||
tagged_fbd_ptr |= GENX(pan_emit_fb_desc)(&fbd_info, fbd.cpu);
|
||||
tagged_fbd_ptr |= GENX(pan_emit_fb_desc)(&fbd_info, fbd);
|
||||
|
||||
result = panvk_cmd_prepare_fragment_job(cmdbuf, tagged_fbd_ptr);
|
||||
if (result != VK_SUCCESS)
|
||||
|
|
|
|||
|
|
@ -51,8 +51,9 @@ panvk_per_arch(cmd_dispatch_prepare_tls)(
|
|||
unsigned core_id_range;
|
||||
|
||||
pan_query_core_count(&phys_dev->kmod.dev->props, &core_id_range);
|
||||
batch->tlsinfo.wls.instances = pan_calc_wls_instances(
|
||||
&cs->cs.local_size, &phys_dev->kmod.dev->props, indirect ? NULL : dim);
|
||||
batch->tlsinfo.wls.instances =
|
||||
pan_calc_wls_instances(&cs->cs.local_size, &phys_dev->kmod.dev->props,
|
||||
indirect ? NULL : dim, cs->info.work_reg_count);
|
||||
batch->wls_total_size = pan_calc_total_wls_size(
|
||||
batch->tlsinfo.wls.size, batch->tlsinfo.wls.instances, core_id_range);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ panvk_entrypoints = custom_target(
|
|||
'--device-prefix', 'panvk_v6', '--device-prefix', 'panvk_v7',
|
||||
'--device-prefix', 'panvk_v9', '--device-prefix', 'panvk_v10',
|
||||
'--device-prefix', 'panvk_v12', '--device-prefix', 'panvk_v13',
|
||||
'--device-prefix', 'panvk_v14', '--device-prefix', 'panvk_v15',
|
||||
'--beta', with_vulkan_beta.to_string()
|
||||
],
|
||||
depend_files : vk_entrypoints_gen_depend_files,
|
||||
|
|
@ -65,7 +66,7 @@ valhall_archs = [9, 10]
|
|||
valhall_inc_dir = ['valhall']
|
||||
valhall_files = []
|
||||
|
||||
fifthgen_archs = [12, 13]
|
||||
fifthgen_archs = [12, 13, 14, 15]
|
||||
fifthgen_inc_dir = ['fifthgen']
|
||||
fifthgen_files = []
|
||||
|
||||
|
|
@ -83,7 +84,7 @@ jm_files = [
|
|||
'jm/panvk_vX_gpu_queue.c',
|
||||
]
|
||||
|
||||
csf_archs = [10, 12, 13]
|
||||
csf_archs = [10, 12, 13, 14, 15]
|
||||
csf_inc_dir = ['csf']
|
||||
csf_files = [
|
||||
'csf/panvk_vX_bind_queue.c',
|
||||
|
|
@ -126,7 +127,7 @@ common_per_arch_files = [
|
|||
sha1_h,
|
||||
]
|
||||
|
||||
foreach arch : [6, 7, 10, 12, 13]
|
||||
foreach arch : [6, 7, 10, 12, 13, 14, 15]
|
||||
per_arch_files = common_per_arch_files
|
||||
inc_panvk_per_arch = []
|
||||
|
||||
|
|
|
|||
|
|
@ -243,7 +243,7 @@ struct panvk_cmd_graphics_state {
|
|||
} \
|
||||
} while (0)
|
||||
|
||||
#if PAN_ARCH >= 10
|
||||
#if PAN_ARCH >= 10 && PAN_ARCH < 14
|
||||
struct panvk_device_draw_context {
|
||||
struct panvk_priv_bo *fns_bo;
|
||||
uint64_t fn_set_fbds_provoking_vertex_stride;
|
||||
|
|
@ -376,8 +376,7 @@ cached_fs_required(ASSERTED const struct panvk_cmd_graphics_state *state,
|
|||
gfx_state_set_dirty(__cmdbuf, FS_PUSH_UNIFORMS); \
|
||||
} while (0)
|
||||
|
||||
|
||||
#if PAN_ARCH >= 10
|
||||
#if PAN_ARCH >= 10 && PAN_ARCH < 14
|
||||
VkResult
|
||||
panvk_per_arch(device_draw_context_init)(struct panvk_device *dev);
|
||||
|
||||
|
|
|
|||
|
|
@ -61,6 +61,12 @@ panvk_catch_indirect_alloc_failure(VkResult error)
|
|||
case 13: \
|
||||
panvk_arch_name(name, v13)(__VA_ARGS__); \
|
||||
break; \
|
||||
case 14: \
|
||||
panvk_arch_name(name, v14)(__VA_ARGS__); \
|
||||
break; \
|
||||
case 15: \
|
||||
panvk_arch_name(name, v15)(__VA_ARGS__); \
|
||||
break; \
|
||||
default: \
|
||||
UNREACHABLE("Unsupported architecture"); \
|
||||
} \
|
||||
|
|
@ -84,6 +90,12 @@ panvk_catch_indirect_alloc_failure(VkResult error)
|
|||
case 13: \
|
||||
ret = panvk_arch_name(name, v13)(__VA_ARGS__); \
|
||||
break; \
|
||||
case 14: \
|
||||
ret = panvk_arch_name(name, v14)(__VA_ARGS__); \
|
||||
break; \
|
||||
case 15: \
|
||||
ret = panvk_arch_name(name, v15)(__VA_ARGS__); \
|
||||
break; \
|
||||
default: \
|
||||
UNREACHABLE("Unsupported architecture"); \
|
||||
} \
|
||||
|
|
@ -102,6 +114,10 @@ panvk_catch_indirect_alloc_failure(VkResult error)
|
|||
#define panvk_per_arch(name) panvk_arch_name(name, v12)
|
||||
#elif PAN_ARCH == 13
|
||||
#define panvk_per_arch(name) panvk_arch_name(name, v13)
|
||||
#elif PAN_ARCH == 14
|
||||
#define panvk_per_arch(name) panvk_arch_name(name, v14)
|
||||
#elif PAN_ARCH == 15
|
||||
#define panvk_per_arch(name) panvk_arch_name(name, v15)
|
||||
#else
|
||||
#error "Unsupported arch"
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -64,6 +64,8 @@ PER_ARCH_FUNCS(7);
|
|||
PER_ARCH_FUNCS(10);
|
||||
PER_ARCH_FUNCS(12);
|
||||
PER_ARCH_FUNCS(13);
|
||||
PER_ARCH_FUNCS(14);
|
||||
PER_ARCH_FUNCS(15);
|
||||
|
||||
static VkResult
|
||||
create_kmod_dev(struct panvk_physical_device *device,
|
||||
|
|
@ -411,6 +413,8 @@ panvk_physical_device_init(struct panvk_physical_device *device,
|
|||
switch (arch) {
|
||||
case 6:
|
||||
case 7:
|
||||
case 14:
|
||||
case 15:
|
||||
if (!os_get_option("PAN_I_WANT_A_BROKEN_VULKAN_DRIVER")) {
|
||||
result = panvk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
|
||||
"WARNING: panvk is not well-tested on v%d, "
|
||||
|
|
|
|||
|
|
@ -239,10 +239,15 @@ get_frame_shader(struct panvk_device *dev,
|
|||
panvk_priv_mem_write_desc(shader->spd, 0, SHADER_PROGRAM, cfg) {
|
||||
cfg.stage = MALI_SHADER_STAGE_FRAGMENT;
|
||||
cfg.fragment_coverage_bitmask_type = MALI_COVERAGE_BITMASK_TYPE_GL;
|
||||
#if PAN_ARCH >= 15
|
||||
cfg.register_count = shader->info.work_reg_count;
|
||||
cfg.preload.r0_r15 = shader->info.preload;
|
||||
#else
|
||||
cfg.register_allocation =
|
||||
pan_register_allocation(shader->info.work_reg_count);
|
||||
cfg.binary = panvk_priv_mem_dev_addr(shader->code_mem);
|
||||
cfg.preload.r48_r63 = shader->info.preload >> 48;
|
||||
#endif
|
||||
cfg.binary = panvk_priv_mem_dev_addr(shader->code_mem);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
|
|
|||
|
|
@ -550,7 +550,7 @@ panvk_per_arch(create_device)(struct panvk_physical_device *physical_device,
|
|||
goto err_free_precomp;
|
||||
}
|
||||
|
||||
#if PAN_ARCH >= 10
|
||||
#if PAN_ARCH >= 10 && PAN_ARCH < 14
|
||||
result = panvk_per_arch(device_draw_context_init)(device);
|
||||
if (result != VK_SUCCESS)
|
||||
goto err_free_mem_cache;
|
||||
|
|
@ -616,7 +616,7 @@ err_finish_queues:
|
|||
panvk_meta_cleanup(device);
|
||||
|
||||
err_free_draw_ctx:
|
||||
#if PAN_ARCH >= 10
|
||||
#if PAN_ARCH >= 10 && PAN_ARCH < 14
|
||||
panvk_per_arch(device_draw_context_cleanup)(device);
|
||||
err_free_mem_cache:
|
||||
#endif
|
||||
|
|
@ -679,7 +679,7 @@ panvk_per_arch(destroy_device)(struct panvk_device *device,
|
|||
}
|
||||
|
||||
panvk_precomp_cleanup(device);
|
||||
#if PAN_ARCH >= 10
|
||||
#if PAN_ARCH >= 10 && PAN_ARCH < 14
|
||||
panvk_per_arch(device_draw_context_cleanup)(device);
|
||||
#endif
|
||||
panvk_meta_cleanup(device);
|
||||
|
|
|
|||
|
|
@ -732,6 +732,18 @@ get_conformance_version()
|
|||
return (VkConformanceVersion){0, 0, 0, 0};
|
||||
}
|
||||
|
||||
static uint32_t
|
||||
get_device_id(uint64_t gpu_id)
|
||||
{
|
||||
if (PAN_ARCH >= PAN_ID64_COMPAT)
|
||||
return ((PAN_ID64_COMPAT << 28) | (PAN_ID64_ARCH_MAJOR(gpu_id) << 20) |
|
||||
(PAN_ID64_ARCH_MINOR(gpu_id) << 12) |
|
||||
((PAN_ID64_PRODUCT_MAJOR(gpu_id) & 0xF) << 8) |
|
||||
((PAN_ID64_VERSION_MAJOR(gpu_id) & 0xF) << 4) |
|
||||
(PAN_ID64_VERSION_MINOR(gpu_id) & 0xF));
|
||||
return (gpu_id & 0xFFFFFFFF);
|
||||
}
|
||||
|
||||
void
|
||||
panvk_per_arch(get_physical_device_properties)(
|
||||
const struct panvk_instance *instance,
|
||||
|
|
@ -750,8 +762,17 @@ panvk_per_arch(get_physical_device_properties)(
|
|||
|
||||
const bool has_disk_cache = device->vk.disk_cache != NULL;
|
||||
|
||||
/* Calculate the value using register count on v15+.
|
||||
* TODO: As this requires register allocation changes ensuring we don't
|
||||
* violate the limits based on the workgroup size, clamp the value to half of
|
||||
* the max threads value (always safe and matches previous GPUs) for now. */
|
||||
unsigned max_threads_per_wg =
|
||||
(PAN_ARCH >= 15)
|
||||
? MIN2(pan_compute_max_thread_count(&device->kmod.dev->props, 32),
|
||||
device->kmod.dev->props.max_threads_per_core / 2)
|
||||
: device->kmod.dev->props.max_threads_per_wg;
|
||||
/* Ensure that the max threads count per workgroup is valid for Bifrost */
|
||||
assert(PAN_ARCH > 8 || device->kmod.dev->props.max_threads_per_wg <= 1024);
|
||||
assert(PAN_ARCH > 8 || max_threads_per_wg <= 1024);
|
||||
|
||||
float pointSizeRangeMin;
|
||||
float pointSizeRangeMax;
|
||||
|
|
@ -770,7 +791,7 @@ panvk_per_arch(get_physical_device_properties)(
|
|||
.driverVersion = vk_get_driver_version(),
|
||||
.vendorID =
|
||||
instance->force_vk_vendor ? instance->force_vk_vendor : ARM_VENDOR_ID,
|
||||
.deviceID = device->kmod.dev->props.gpu_id,
|
||||
.deviceID = get_device_id(device->kmod.dev->props.gpu_id),
|
||||
.deviceType = VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU,
|
||||
|
||||
/* Vulkan 1.0 limits */
|
||||
|
|
@ -880,11 +901,9 @@ panvk_per_arch(get_physical_device_properties)(
|
|||
/* We could also split into several jobs but this has many limitations.
|
||||
* As such we limit to the max threads per workgroup supported by the GPU.
|
||||
*/
|
||||
.maxComputeWorkGroupInvocations =
|
||||
device->kmod.dev->props.max_threads_per_wg,
|
||||
.maxComputeWorkGroupSize = {device->kmod.dev->props.max_threads_per_wg,
|
||||
device->kmod.dev->props.max_threads_per_wg,
|
||||
device->kmod.dev->props.max_threads_per_wg},
|
||||
.maxComputeWorkGroupInvocations = max_threads_per_wg,
|
||||
.maxComputeWorkGroupSize = {max_threads_per_wg, max_threads_per_wg,
|
||||
max_threads_per_wg},
|
||||
/* 8-bit subpixel precision. */
|
||||
.subPixelPrecisionBits = 8,
|
||||
.subTexelPrecisionBits = 8,
|
||||
|
|
@ -1075,8 +1094,7 @@ panvk_per_arch(get_physical_device_properties)(
|
|||
.minSubgroupSize = pan_subgroup_size(PAN_ARCH),
|
||||
.maxSubgroupSize = pan_subgroup_size(PAN_ARCH),
|
||||
.maxComputeWorkgroupSubgroups =
|
||||
device->kmod.dev->props.max_threads_per_wg /
|
||||
pan_subgroup_size(PAN_ARCH),
|
||||
max_threads_per_wg / pan_subgroup_size(PAN_ARCH),
|
||||
.requiredSubgroupSizeStages = VK_SHADER_STAGE_COMPUTE_BIT,
|
||||
.maxInlineUniformBlockSize = MAX_INLINE_UNIFORM_BLOCK_SIZE,
|
||||
.maxPerStageDescriptorInlineUniformBlocks =
|
||||
|
|
|
|||
|
|
@ -1172,10 +1172,15 @@ panvk_shader_upload(struct panvk_device *dev,
|
|||
cfg.vertex_warp_limit = MALI_WARP_LIMIT_HALF;
|
||||
#endif
|
||||
|
||||
#if PAN_ARCH >= 15
|
||||
cfg.register_count = shader->info.work_reg_count;
|
||||
cfg.preload.r0_r15 = shader->info.preload;
|
||||
#else
|
||||
cfg.register_allocation =
|
||||
pan_register_allocation(shader->info.work_reg_count);
|
||||
cfg.binary = panvk_shader_variant_get_dev_addr(shader);
|
||||
cfg.preload.r48_r63 = (shader->info.preload >> 48);
|
||||
#endif
|
||||
cfg.binary = panvk_shader_variant_get_dev_addr(shader);
|
||||
cfg.flush_to_zero_mode = shader_ftz_mode(shader);
|
||||
|
||||
if (cfg.stage == MALI_SHADER_STAGE_FRAGMENT)
|
||||
|
|
@ -1191,10 +1196,15 @@ panvk_shader_upload(struct panvk_device *dev,
|
|||
panvk_priv_mem_write_desc(shader->spds.all_points, 0, SHADER_PROGRAM,
|
||||
cfg) {
|
||||
cfg.stage = pan_shader_stage(&shader->info);
|
||||
#if PAN_ARCH >= 15
|
||||
cfg.register_count = shader->info.work_reg_count;
|
||||
cfg.preload.r0_r15 = shader->info.preload;
|
||||
#else
|
||||
cfg.register_allocation =
|
||||
pan_register_allocation(shader->info.work_reg_count);
|
||||
cfg.binary = panvk_shader_variant_get_dev_addr(shader);
|
||||
cfg.preload.r48_r63 = (shader->info.preload >> 48);
|
||||
#endif
|
||||
cfg.binary = panvk_shader_variant_get_dev_addr(shader);
|
||||
cfg.flush_to_zero_mode = shader_ftz_mode(shader);
|
||||
}
|
||||
|
||||
|
|
@ -1206,11 +1216,16 @@ panvk_shader_upload(struct panvk_device *dev,
|
|||
panvk_priv_mem_write_desc(shader->spds.all_triangles, 0, SHADER_PROGRAM,
|
||||
cfg) {
|
||||
cfg.stage = pan_shader_stage(&shader->info);
|
||||
#if PAN_ARCH >= 15
|
||||
cfg.register_count = shader->info.work_reg_count;
|
||||
cfg.preload.r0_r15 = shader->info.preload;
|
||||
#else
|
||||
cfg.register_allocation =
|
||||
pan_register_allocation(shader->info.work_reg_count);
|
||||
cfg.preload.r48_r63 = (shader->info.preload >> 48);
|
||||
#endif
|
||||
cfg.binary = panvk_shader_variant_get_dev_addr(shader) +
|
||||
shader->info.vs.no_psiz_offset;
|
||||
cfg.preload.r48_r63 = (shader->info.preload >> 48);
|
||||
cfg.flush_to_zero_mode = shader_ftz_mode(shader);
|
||||
}
|
||||
#else
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue